Commit 62b7eb17 authored by marvnsch

Some enhancements to the model

parent 8f7ee777
%% Cell type:code id:initial_id tags:
``` python
import torch
import torch.nn as nn
import torch.nn.functional as F
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib import Path
```
%% Cell type:markdown id:d8d7f32150682efd tags:
## 0. Prepare the data
%% Cell type:code id:f7c39c06ce3a14db tags:
``` python
# load the parallel dev data (news-test2008) for quick tokenizer/demo checks
data_array_en = open(Path('./data/training-data/dev/news-test2008.en'), 'r').readlines()
data_array_de = open(Path('./data/training-data/dev/news-test2008.de'), 'r').readlines()


def count_words(string: str) -> int:
    return len(string.split())


def load_data() -> tuple[list[str], list[str]]:
    with open("data/training-data/eup/europarl-v7.de-en.de", "r", encoding="utf8") as f:
        data_de = [line.rstrip("\n") for line in f]
    with open("data/training-data/eup/europarl-v7.de-en.en", "r", encoding="utf8") as f:
        data_en = [line.rstrip("\n") for line in f]

    ltd = set()  # indices of lines to delete later
    for i in range(max(len(data_de), len(data_en))):
        # If one side of pair i is empty, merge its counterpart into the next
        # line and mark pair i for deletion
        if data_de[i] == "":
            data_en[i + 1] = data_en[i] + " " + data_en[i + 1]
            ltd.add(i)
        if data_en[i] == "":
            data_de[i + 1] = data_de[i] + " " + data_de[i + 1]
            ltd.add(i)
        # Remove pairs whose word counts differ by more than 40%
        if abs(count_words(data_de[i]) - count_words(data_en[i])) / (max(count_words(data_de[i]), count_words(data_en[i])) + 1) > 0.4:
            ltd.add(i)
        # Remove pairs shorter than 3 words or longer than 25 words
        if max(count_words(data_de[i]), count_words(data_en[i])) < 3 or max(count_words(data_de[i]), count_words(data_en[i])) > 25:
            ltd.add(i)

    data_de = [l for i, l in enumerate(data_de) if i not in ltd]
    data_en = [l for i, l in enumerate(data_en) if i not in ltd]
    print(len(data_de), len(data_en))

    # Print 3 random sentence pairs
    ix = torch.randint(low=0, high=max(len(data_de), len(data_en)), size=(3, ))
    for i in ix:
        print(f"Zeile: {i}\nDeutsch: {data_de[i]}\nEnglish: {data_en[i]}\n")
    print(f"\nNumber of lines: {len(data_de), len(data_en)}")
    return data_de, data_en


# Print 3 random sentence pairs from the dev data
idx = torch.randint(low=0, high=2000, size=(3, ))
for id in idx:
    print(id.item())
    print("ENG: " + data_array_en[id.item()] + "DEU: " + data_array_de[id.item()])

source, target = load_data()
```
%% Output
1163
ENG: JPMorgan: recommends to weigh carefully both cases for "similar threats" (saturated market and worse economic climate).
DEU: JPMorgan: empfiehlt angesichts der „ähnlichen Bedrohungen“ (gesättigter Markt und schlechteres Wirtschaftsklima) die Erwartungen in beiden Fällen zurückzusetzen
1816
ENG: The charge that she concentrated too much on foreign affairs, she dismissed with a terribly presumptuous statement.
DEU: Den Vorwurf, dass sie sich zu sehr auf die Außenpolitik konzentriere, hat Angela Merkel mit einem arg überheblichen Satz zurückgewiesen.
1846
ENG: One day after resigning as army chief, Pakistani ruler Musharraf was sworn in as president.
DEU: Einen Tag nach seinem Rücktritt als Armeechef ist der pakistanische Machthaber Musharraf als Präsident vereidigt worden.
1046809 1046809
Zeile: 993209
Deutsch: Aber es muß auch darum gehen, Anreize zu schaffen für einen umweltfreundlichen lokalen öffentlichen Nahverkehr.
English: But it is also necessary to create incentives for environmentally friendly local public transport.
Zeile: 459853
Deutsch: Vielleicht sollte er dramatisch verlangsamt werden?
English: Perhaps it should be slowed down dramatically?
Zeile: 605086
Deutsch: Die Prämien haben im Übrigen durchaus positive grenzüberschreitende Wirkungen.
English: The incentives have also had a positive cross-border impact.
Number of lines: (1046809, 1046809)
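The filtering rules in `load_data` are easiest to see on a toy pair. The sketch below uses made-up sentences (not from the corpus) and reuses `count_words` to show which pairs the 40% word-count-difference rule and the 3–25-word length rule would keep.
``` python
# Minimal sketch of the filter rules from load_data() on hypothetical sentence pairs.
toy_pairs = [
    ("Das ist ein kurzer Satz .", "This is a short sentence ."),  # kept
    ("Ja .", "Yes ."),                                            # dropped: fewer than 3 words
    ("Das Parlament hat heute zugestimmt .",
     "The European Parliament voted in favour of the proposal after a long debate today ."),  # dropped: >40% length difference
]

for de, en in toy_pairs:
    diff = abs(count_words(de) - count_words(en)) / (max(count_words(de), count_words(en)) + 1)
    too_short_or_long = max(count_words(de), count_words(en)) < 3 or max(count_words(de), count_words(en)) > 25
    keep = diff <= 0.4 and not too_short_or_long
    print(f"keep={keep} diff={diff:.2f} | {de} || {en}")
```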
%% Cell type:markdown id:f2beddcc4122495a tags:
## 1. Text tokenization
%% Cell type:code id:d8ccbafa97fba573 tags:
``` python
# set up the tokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.processors import TemplateProcessing

# setting the unknown token (e.g. for emojis)
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# adding special tokens
# [UNK]  : unknown word/token
# [CLS]  : start token (new sentence sequence)
# [SEP]  : separator for chaining multiple sentences
# [PAD]  : padding needed for fixed-length encoder input
# [MASK] : mask token (only needed for masked-language-model training)
trainer = BpeTrainer(vocab_size=50000, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# set up the pre-tokenizer -> this ensures that no token spans more than one word
from tokenizers.pre_tokenizers import Whitespace

tokenizer.pre_tokenizer = Whitespace()
```
%% Cell type:code id:55cbac65a50a0199 tags:
``` python
tokenizer.train(["data/training-data/eup/europarl-v7.de-en.de", "data/training-data/eup/europarl-v7.de-en.en"], trainer)

# configure post processing: wrap every sequence in [CLS] ... [SEP]
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

vocab_size = tokenizer.get_vocab_size()
```
%% Output
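To make the template above concrete, here is a small check of what the post-processor adds for a single sentence and for a sentence pair (the example sentences are made up; this is a sketch, not part of the training pipeline):
``` python
# Sketch: the post-processor wraps single sequences as [CLS] ... [SEP]
# and pairs as [CLS] A [SEP] B [SEP], with type ids 0/1.
single = tokenizer.encode("This is a test .")
pair = tokenizer.encode("This is a test .", "Das ist ein Test .")

print(single.tokens)    # starts with [CLS], ends with [SEP]
print(pair.tokens)      # both sentences, each closed by [SEP]
print(pair.type_ids)    # 0 for the first sentence, 1 for the second
```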
%% Cell type:code id:569b9a3425aa5800 tags:
``` python
# testing the trained tokenizer
print(data_array_en[15])
test_en = tokenizer.encode(data_array_en[14])
test_de = tokenizer.encode(data_array_de[11])
print(test_en.tokens)
#print(test_en.ids)
print(test_de.tokens)
#print(test_de.ids)
```
%% Output
Government crisis coming, says Gallup
['[CLS]', 'They', 'also', 'predict', 'that', 'the', 'ECB', 'will', 'cut', 'interest', 'rates', 'twice', 'during', 'the', 'course', 'of', '2008', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
['[CLS]', 'D', 'er', 'Welt', 'mark', 't', 'pre', 'is', 'f', 'ü', 'r', 'Ro', 'h', '[UNK]', 'l', 'st', 'ie', 'g', 'in', 'dies', 'em', 'J', 'ah', 'r', 'um', '52', 'Pro', 'z', 'ent', '-', 'im', 'ver', 'gang', 'en', 'en', 'Mon', 'at', 'er', 're', 'ich', 'te', 'der', 'Pre', 'is', 'pro', 'F', 'ass', 'des', 'sch', 'war', 'zen', 'G', 'old', 'es', 'na', 'he', 'z', 'u', '100', 'US', 'Dol', 'lar', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
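The long [PAD] runs in the output above come from the tokenizer's padding settings. Fixed-length batches need both padding and truncation; a minimal sketch (the sentence and the length of 50 are arbitrary choices for illustration):
``` python
# Sketch: pad *and* truncate so every encoding has exactly `length` ids.
# Without enable_truncation, sentences longer than `length` keep their full
# length and a batch can no longer be stacked into a single tensor.
tokenizer.enable_padding(pad_id=3, pad_token="[PAD]", length=50)
tokenizer.enable_truncation(max_length=50)

enc = tokenizer.encode("A short test sentence .")
print(len(enc.ids))   # 50

# reset, so the training cells below configure their own padding
tokenizer.no_padding()
tokenizer.no_truncation()
```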
%% Cell type:markdown id:9c0f853775a802ec tags:
## 2. Prepare the training data
%% Cell type:code id:2e4dc87ce98b6cdd tags:
``` python
# Prepare a training batch
def training_data(batch_size: int = 10, max_tokens: int = 50) -> tuple[torch.Tensor, torch.Tensor]:
    x_training_data = []
    y_training_data = []

    # select random sentences
    batch_indices = torch.randint(0, len(source), (batch_size, ))
    for idx in batch_indices:
        x_training_data.append(target[idx])
        y_training_data.append(source[idx])

    # tokenize data: pad *and* truncate to max_tokens, otherwise sentences
    # longer than max_tokens break torch.tensor() below
    tokenizer.enable_padding(pad_id=3, length=max_tokens)
    tokenizer.enable_truncation(max_length=max_tokens)
    x_training_data = tokenizer.encode_batch(x_training_data)
    y_training_data = tokenizer.encode_batch(y_training_data)

    # extract ids for every sequence
    for i in range(len(batch_indices)):
        x_training_data[i] = x_training_data[i].ids
        y_training_data[i] = y_training_data[i].ids

    # 'tensorfy' x data
    x_training_data = torch.tensor(x_training_data)
    # 'tensorfy' & one hot encode y data
    #y_training_data = F.one_hot(torch.tensor(y_training_data), num_classes=vocab_size)
    y_training_data = torch.tensor(y_training_data)
    return x_training_data, y_training_data


print(training_data())
```
%% Output
(tensor([[    1,   556,  9472,   344,   386,   346,   984,   362,   472,    18,
          ...]]), tensor([[   1, 6182,   63,  ...,    3,    3,    3],
        [   1,   54,  281,  ...,    3,    3,    3],
        [   1,   25,   22,  ...,    3,    3,    3],
        ...,
        [   1,   35,  211,  ...,    3,    3,    3],
        [   1,   35,  211,  ...,    3,    3,    3],
        [   1, 8389,   42,  ...,    3,    3,    3]]))
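A quick sanity check on the batch shapes (batch size and token count chosen arbitrarily here) makes the contract of `training_data` explicit: both tensors are `(batch_size, max_tokens)` id matrices.
``` python
# Sketch: verify that a batch comes back as two equally shaped id tensors.
x_batch, y_batch = training_data(batch_size=4, max_tokens=50)
print(x_batch.shape, y_batch.shape)   # torch.Size([4, 50]) torch.Size([4, 50])
print(x_batch.dtype)                  # torch.int64 token ids, ready for nn.Embedding
```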
%% Cell type:markdown id:689e2e565cce2845 tags:
## 3. Build the sequence2sequence RNN
%% Cell type:code id:e8d99510479108f4 tags:
``` python
embedding_dimension = 100
embedding_matrix_enc = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dimension)
embedding_matrix_dec = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dimension)


class Encoder(torch.nn.Module):
    def __init__(self, input_size: int, hidden_size: int, num_layers: int = 1, bidirectional: bool = False):
        super(Encoder, self).__init__()
        self._hidden_size = hidden_size
        self._num_layers = num_layers

        # lstm layer
        self._lstm = torch.nn.LSTM(input_size=input_size,
                                   hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   bidirectional=bidirectional,
                                   batch_first=True)
        self._dropout = torch.nn.Dropout(0.1)

    def forward(self, embedded_sequence: torch.Tensor):
        h_0 = torch.zeros(self._num_layers, embedded_sequence.size(0), self._hidden_size)  # hidden state WITH batches
        c_0 = torch.zeros(self._num_layers, embedded_sequence.size(0), self._hidden_size)  # internal state WITH batches
        #h_0 = torch.zeros(self._num_layers, self._hidden_size)  # hidden state WITHOUT batches
        #c_0 = torch.zeros(self._num_layers, self._hidden_size)  # internal state WITHOUT batches
        output, (hn, cn) = self._lstm(embedded_sequence, (h_0, c_0))
        return output, hn, cn


class Decoder(torch.nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_size: int,
                 num_layers: int = 1, bidirectional: bool = False,
                 max_tokens: int = 40):
        super(Decoder, self).__init__()
        self._hidden_size = hidden_size
        self._num_layers = num_layers
        self._max_tokens = max_tokens

        # embedding matrix
        self._embedding = embedding_matrix_dec

        # lstm layer
        self._lstm = torch.nn.LSTM(input_size=input_size,
                                   hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   bidirectional=bidirectional,
                                   batch_first=True)

        # output layer (fully connected linear layer)
        self._out = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # x is the encoder result: (outputs, hidden_state, cell_state)
        batch_size = x[0].size(0)
        hidden_state = x[1]
        cell_state = x[2]
        outputs = []

        # prepare start token ([CLS] has id 1)
        x_in = torch.empty(batch_size, 1, dtype=torch.long).fill_(1)
        for i in range(self._max_tokens):
            out, hidden_state, cell_state = self.forward_step(x_in, hidden_state, cell_state)
            outputs.append(out)
            # Without teacher forcing: use its own predictions as the next input
            _, topi = out.topk(1)
            x_in = topi.squeeze(-1).detach()  # detach from history as input

        # concatenate the per-step outputs of shape (batch, 1, vocab) along the
        # time axis -> (batch, max_tokens, vocab), then turn them into log-probabilities
        outputs = torch.cat(outputs, dim=1)
        outputs = F.log_softmax(outputs, dim=-1)
        return outputs, hidden_state, cell_state

    def forward_step(self, x_in, hidden_state, cell_state):
        output = self._embedding(x_in)
        output = F.relu(output)
        output, (h_t, c_t) = self._lstm(output, (hidden_state, cell_state))
        output = self._out(output)
        return output, h_t, c_t
```
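The decoder above always feeds its own greedy prediction back in. A common variant is teacher forcing, where the gold target ids are fed as the next input during training. Below is a minimal sketch of how such a step could look; the helper name, the `targets` tensor, and the mixing ratio are hypothetical and not part of the model above.
``` python
import random

def decode_with_teacher_forcing(decoder, encoder_state, targets, teacher_forcing_ratio=0.5):
    """Sketch: run the decoder, sometimes feeding the gold token instead of
    the model's own prediction. `targets` is a (batch, max_tokens) id tensor."""
    hidden_state, cell_state = encoder_state[1], encoder_state[2]
    batch_size = targets.size(0)
    x_in = torch.full((batch_size, 1), 1, dtype=torch.long)  # [CLS] start token
    outputs = []
    for t in range(targets.size(1)):
        out, hidden_state, cell_state = decoder.forward_step(x_in, hidden_state, cell_state)
        outputs.append(out)
        if random.random() < teacher_forcing_ratio:
            x_in = targets[:, t].unsqueeze(1)                   # feed the gold token
        else:
            x_in = out.topk(1).indices.squeeze(-1).detach()     # feed the model's own prediction
    return F.log_softmax(torch.cat(outputs, dim=1), dim=-1)
```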
%% Cell type:markdown id:535bc20b2f12f2da tags:
## 4. Train the model
%% Cell type:code id:1f8d3152359f6658 tags:
``` python
LSTM_hidden_size = 128
max_tokens_per_sequence = 70

model = nn.Sequential(
    embedding_matrix_enc,
    Encoder(input_size=embedding_dimension, hidden_size=LSTM_hidden_size),
    Decoder(input_size=embedding_dimension, hidden_size=LSTM_hidden_size,
            output_size=vocab_size, max_tokens=max_tokens_per_sequence)
)

num_epochs = 1000
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
loss_function = torch.nn.NLLLoss()

for i in range(1, num_epochs + 1):
    # reset gradients
    optimizer.zero_grad()

    # make prediction
    x_train, y_train = training_data(batch_size=32, max_tokens=max_tokens_per_sequence)
    predict = model(x_train)[0]

    # match dimensions of prediction & gold_label vector
    predict = predict.view(-1, predict.size(-1))
    y_train = y_train.view(-1)

    # calculate loss & propagate it backwards
    loss = loss_function(predict, y_train)
    loss.backward()
    optimizer.step()

    if i % 10 == 0:
        print("---- Iteration " + str(i) + " ----")
        print("loss: " + str(loss.item()))
```
%% Output
---- Iteration 10 ----
loss: 9.291775703430176
---- Iteration 20 ----
loss: 6.553857803344727
---- Iteration 30 ----
loss: 4.213151454925537
---- Iteration 40 ----
loss: 3.1044561862945557
---- Iteration 50 ----
loss: 3.47859263420105
---- Iteration 60 ----
loss: 3.166140079498291
---- Iteration 70 ----
loss: 3.3509914875030518
---- Iteration 80 ----
loss: 2.626647710800171
---- Iteration 90 ----
loss: 3.137316942214966
---- Iteration 100 ----
loss: 3.088139295578003
---- Iteration 110 ----
loss: 2.9085235595703125
---- Iteration 120 ----
loss: 2.8475253582000732
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[17], line 20
17 optimizer.zero_grad()
19 # make prediction
---> 20 x_train, y_train = training_data(batch_size=32, max_tokens=max_tokens_per_sequence)
21 predict = model(x_train)[0]
23 # match dimensions of prediction & gold_label vector
Cell In[8], line 28, in training_data(batch_size, max_tokens)
24 x_training_data = torch.tensor(x_training_data)
26 # 'tensorfy' & one hot encode y data
27 #y_training_data = F.one_hot(torch.tensor(y_training_data), num_classes=vocab_size)
---> 28 y_training_data = torch.tensor(y_training_data)
29 return x_training_data, y_training_data
ValueError: expected sequence of length 50 at dim 1 (got 62)
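One further enhancement worth noting (not applied above): with so many [PAD] targets in every batch, the loss is dominated by padding. `NLLLoss` can be told to skip those positions; a minimal sketch, assuming pad id 3 as configured for the tokenizer:
``` python
# Sketch: ignore [PAD] positions (id 3) when computing the loss, so the model
# is only trained on real target tokens.
loss_function = torch.nn.NLLLoss(ignore_index=3)

# usage in the training loop stays unchanged:
# loss = loss_function(predict.view(-1, predict.size(-1)), y_train.view(-1))
```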
%% Cell type:markdown id:44f9b74f91565a4a tags:
## 5. Sample from the model
%% Cell type:code id:b95fb365f686125d tags:
``` python
# translate a single test sentence with the (partially trained) model
test_sequence = "Ist dies der Weg, oder nicht?."
test_sequence_enc = tokenizer.encode(test_sequence)
print(test_sequence_enc.ids)

test_sequence_batched = torch.tensor(test_sequence_enc.ids).view(1, -1)
predict, _, _ = model(test_sequence_batched)
_, topi = predict.topk(1)
decoded_ids = topi.squeeze()
tokenizer.decode(decoded_ids.tolist())
```
%% Output
[1, 6264, 432, 352, 2398, 16, 886, 474, 49994, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
'Ich ist der der der'
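The raw greedy output keeps generating until `max_tokens` is reached. A small post-processing sketch (assuming [SEP] id 2, as defined for the tokenizer above) cuts the hypothesis at the first end token and drops special tokens when decoding:
``` python
# Sketch: stop the hypothesis at the first [SEP] (id 2) and skip special tokens.
ids = decoded_ids.tolist()
if 2 in ids:
    ids = ids[:ids.index(2)]
print(tokenizer.decode(ids, skip_special_tokens=True))
```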