Commit 95af51fb authored by Konstantin Julius Lotzgeselle

Minor adjustments

parent 7aaa9046
@@ -20,6 +20,10 @@
{
"cell_type": "code",
"execution_count": null,
"id": "f7c39c06ce3a14db",
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# split the data into training/dev/test\n",
@@ -33,25 +37,25 @@
"for id in idx:\n",
" print(id.item())\n",
" print(\"ENG: \" + data_array_en[id.item()] + \"DEU: \" + data_array_de[id.item()])"
],
"metadata": {
"collapsed": false
},
"id": "f7c39c06ce3a14db"
]
},
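The cell above samples a few sentence pairs after splitting the parallel corpus into training/dev/test. A minimal sketch of such a split; the use of torch.randperm and the 80/10/10 ratio are assumptions, only data_array_en, data_array_de and idx appear in the visible lines:

import torch

data_array_en = ["Good morning.", "How are you?", "See you soon."]
data_array_de = ["Guten Morgen.", "Wie geht es dir?", "Bis bald."]

n = len(data_array_en)
idx = torch.randperm(n)                      # shuffled sentence indices
n_train = int(0.8 * n)
n_dev = int(0.1 * n)
train_idx = idx[:n_train]
dev_idx = idx[n_train:n_train + n_dev]
test_idx = idx[n_train + n_dev:]

for id in idx[:2]:                           # inspect a couple of pairs
    print("ENG: " + data_array_en[id.item()] + " DEU: " + data_array_de[id.item()])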
{
"cell_type": "markdown",
"source": [
"## 1. Text tokenization"
],
"id": "f2beddcc4122495a",
"metadata": {
"collapsed": false
},
"id": "f2beddcc4122495a"
"source": [
"## 1. Text tokenization"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d8ccbafa97fba573",
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# set up the tokenizer\n",
@@ -74,15 +78,15 @@
"# set up the pre-tokenizer -> this ensures, that the maximal token length is one word\n",
"from tokenizers.pre_tokenizers import Whitespace\n",
"tokenizer.pre_tokenizer = Whitespace()"
],
"metadata": {
"collapsed": false
},
"id": "d8ccbafa97fba573"
]
},
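The tokenizer cell attaches a Whitespace pre-tokenizer so that, as the comment notes, no learned token can span more than one word. A small sketch of what that pre-tokenizer does on its own, assuming the Hugging Face tokenizers library:

from tokenizers.pre_tokenizers import Whitespace

# Splits on whitespace and punctuation before the BPE model runs.
print(Whitespace().pre_tokenize_str("Guten Morgen, Welt!"))
# [('Guten', (0, 5)), ('Morgen', (6, 12)), (',', (12, 13)), ('Welt', (14, 18)), ('!', (18, 19))]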
{
"cell_type": "code",
"execution_count": null,
"id": "55cbac65a50a0199",
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"tokenizer.train(['./data/training-data/dev/newstest2013.en'], trainer)\n",
@@ -96,15 +100,15 @@
" (\"[SEP]\", tokenizer.token_to_id(\"[SEP]\")),\n",
" ],\n",
")"
],
"metadata": {
"collapsed": false
},
"id": "55cbac65a50a0199"
]
},
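This cell trains the tokenizer on the newstest2013 dev data and adds a post-processor built around the [SEP] special token. A minimal, self-contained sketch of that pipeline, assuming a BPE model and the TemplateProcessing API from the Hugging Face tokenizers library; the single-sequence template and the special-token list are assumptions consistent with the [SEP] pair visible above:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

# Word-bounded BPE tokenizer, as in the setup cell above.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]", "[SEP]", "[PAD]"])

# The English dev file appears in the diff; training on the German side as
# well is an assumption.
tokenizer.train(['./data/training-data/dev/newstest2013.en'], trainer)

# Append [SEP] to every encoded sequence.
tokenizer.post_processor = TemplateProcessing(
    single="$A [SEP]",
    special_tokens=[("[SEP]", tokenizer.token_to_id("[SEP]"))],
)

print(tokenizer.get_vocab_size())
print(tokenizer.encode("This is a test sentence.").tokens)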
{
"cell_type": "code",
"execution_count": null,
"id": "569b9a3425aa5800",
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(tokenizer.get_vocab_size())\n",
@@ -120,25 +124,25 @@
"\n",
"print(test_de.tokens)\n",
"#print(test_de.ids)"
],
"metadata": {
"collapsed": false
},
"id": "569b9a3425aa5800"
]
},
{
"cell_type": "markdown",
"source": [
"## 2. Prepare the training data"
],
"id": "9c0f853775a802ec",
"metadata": {
"collapsed": false
},
"id": "9c0f853775a802ec"
"source": [
"## 2. Prepare the training data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2e4dc87ce98b6cdd",
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Prepare training batch\n",
@@ -164,25 +168,29 @@
" return torch.tensor(x_training_data), torch.tensor(y_training_data)\n",
"\n",
"print(training_data())"
],
"metadata": {
"collapsed": false
},
"id": "2e4dc87ce98b6cdd"
]
},
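The training_data() helper above returns tensors built from the tokenized sentence pairs. A minimal sketch of such a batching step: encode each pair to token ids and right-pad to a rectangle so the lists can be stacked into tensors. The pad id 0, the helper name make_batch and the stand-in encoder are assumptions:

import torch

def make_batch(encode, sentences_en, sentences_de, pad_id=0):
    x_ids = [encode(s) for s in sentences_en]   # source-side id sequences
    y_ids = [encode(s) for s in sentences_de]   # target-side id sequences
    max_x = max(len(s) for s in x_ids)
    max_y = max(len(s) for s in y_ids)
    # Right-pad every sequence to the longest one in the batch.
    x = torch.tensor([s + [pad_id] * (max_x - len(s)) for s in x_ids])
    y = torch.tensor([s + [pad_id] * (max_y - len(s)) for s in y_ids])
    return x, y

# Stand-in encoder just to show the shapes; the notebook would pass
# something like `lambda s: tokenizer.encode(s).ids` instead.
toy_encode = lambda s: [hash(w) % 100 for w in s.split()]
x, y = make_batch(toy_encode,
                  ["Good morning .", "How are you ?"],
                  ["Guten Morgen .", "Wie geht es dir ?"])
print(x.shape, y.shape)   # torch.Size([2, 4]) torch.Size([2, 5])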
{
"cell_type": "markdown",
"source": [
"## 3. Build the sequence2sequence RNN"
],
"id": "689e2e565cce2845",
"metadata": {
"collapsed": false
},
"id": "689e2e565cce2845"
"source": [
"## 3. Build the sequence2sequence RNN"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "e8d99510479108f4",
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-06T14:20:39.964393Z",
"start_time": "2024-01-06T14:20:39.859015Z"
},
"collapsed": false
},
"outputs": [
{
"name": "stdout",
@@ -708,25 +716,17 @@
"\n",
"train = training_data()\n",
"print(model(train[0]))"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-06T14:20:39.964393Z",
"start_time": "2024-01-06T14:20:39.859015Z"
}
},
"id": "e8d99510479108f4"
]
},
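The model cell builds the sequence-to-sequence RNN and runs it on a training batch; in the visible lines it is called with only the source tensor. A minimal sketch of a teacher-forced encoder-decoder GRU in PyTorch, offered as an illustration rather than a reconstruction of that cell; all names and sizes (Seq2SeqRNN, VOCAB_SIZE, EMB_DIM, HIDDEN_DIM) are assumptions:

import torch
import torch.nn as nn

VOCAB_SIZE = 8000   # assumed tokenizer vocabulary size
EMB_DIM = 128
HIDDEN_DIM = 256

class Seq2SeqRNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(VOCAB_SIZE, EMB_DIM)
        self.encoder = nn.GRU(EMB_DIM, HIDDEN_DIM, batch_first=True)
        self.decoder = nn.GRU(EMB_DIM, HIDDEN_DIM, batch_first=True)
        self.out = nn.Linear(HIDDEN_DIM, VOCAB_SIZE)

    def forward(self, src_ids, tgt_ids):
        # Encode the source sentence into a final hidden state.
        _, hidden = self.encoder(self.embedding(src_ids))
        # Decode the target conditioned on that state (teacher forcing:
        # the gold target tokens are fed as decoder input).
        dec_out, _ = self.decoder(self.embedding(tgt_ids), hidden)
        return self.out(dec_out)   # per-position logits over the vocabulary

model = Seq2SeqRNN()
src = torch.randint(0, VOCAB_SIZE, (4, 12))   # batch of 4 source sequences
tgt = torch.randint(0, VOCAB_SIZE, (4, 10))   # batch of 4 target sequences
print(model(src, tgt).shape)                  # torch.Size([4, 10, 8000])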
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"id": "1f8d3152359f6658",
"metadata": {
"collapsed": false
},
"id": "1f8d3152359f6658"
"outputs": [],
"source": []
}
],
"metadata": {
@@ -745,7 +745,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
"version": "3.11.7"
}
},
"nbformat": 4,