Commit e9945170 authored by marvnsch

Push changes

parent 9e70f019
%% Cell type:code id:initial_id tags:
``` python
import torch
import torch.nn as nn
import torch.nn.functional as F
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib import Path
```
%% Cell type:markdown id:d8d7f32150682efd tags:
## 0. Prepare the data
%% Cell type:code id:f7c39c06ce3a14db tags:
``` python
def load_data() -> tuple[list[str], list[str]]:
    with open("data/training-data/eup/europarl-v7.de-en.de", "r", encoding="utf8") as f:
        data_de = [line.rstrip("\n") for line in f]
    with open("data/training-data/eup/europarl-v7.de-en.en", "r", encoding="utf8") as f:
        data_en = [line.rstrip("\n") for line in f]

    ltd = set()  # indices of lines to delete later
    for i in range(max(len(data_de), len(data_en))):
        # If a line is empty in one file, merge its counterpart into the next line
        if data_de[i] == "":
            data_en[i + 1] = data_en[i] + " " + data_en[i + 1]
            ltd.add(i)
        if data_en[i] == "":
            data_de[i + 1] = data_de[i] + " " + data_de[i + 1]
            ltd.add(i)
        # Remove lines where the word counts differ by more than 40%
        if abs(count_words(data_de[i]) - count_words(data_en[i])) / (max(count_words(data_de[i]), count_words(data_en[i])) + 1) > 0.4:
            ltd.add(i)
        # Remove lines with fewer than 3 or more than 25 words
        if max(count_words(data_de[i]), count_words(data_en[i])) < 3 or max(count_words(data_de[i]), count_words(data_en[i])) > 25:
            ltd.add(i)

    data_de = [l for i, l in enumerate(data_de) if i not in ltd]
    data_en = [l for i, l in enumerate(data_en) if i not in ltd]
    print(len(data_de), len(data_en))

    # Print 3 random sentence pairs
    ix = torch.randint(low=0, high=max(len(data_de), len(data_en)), size=(3,))
    for i in ix:
        print(f"Zeile: {i}\nDeutsch: {data_de[i]}\nEnglish: {data_en[i]}\n")

    print(f"\nNumber of lines: {len(data_de), len(data_en)}")
    return data_de, data_en


def count_words(string: str) -> int:
    return len(string.split())


source, target = load_data()
```
%% Output
1046809 1046809
Zeile: 993209
Deutsch: Aber es muß auch darum gehen, Anreize zu schaffen für einen umweltfreundlichen lokalen öffentlichen Nahverkehr.
English: But it is also necessary to create incentives for environmentally friendly local public transport.
Zeile: 459853
Deutsch: Vielleicht sollte er dramatisch verlangsamt werden?
English: Perhaps it should be slowed down dramatically?
Zeile: 605086
Deutsch: Die Prämien haben im Übrigen durchaus positive grenzüberschreitende Wirkungen.
English: The incentives have also had a positive cross-border impact.
Number of lines: (1046809, 1046809)
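The 40% rule above compares the word-count gap of a pair against the longer side (plus one, which avoids division by zero for empty lines). A minimal illustration of that criterion, using two made-up sentence pairs rather than corpus lines:

``` python
def length_ratio(de: str, en: str) -> float:
    # relative word-count difference, as used in the filter above
    w_de, w_en = len(de.split()), len(en.split())
    return abs(w_de - w_en) / (max(w_de, w_en) + 1)

# similar lengths -> kept (ratio 0.0)
print(length_ratio("Das ist ein Test .", "This is a test ."))
# very different lengths -> dropped (ratio 0.75 > 0.4)
print(length_ratio("Das ist ein sehr langer deutscher Beispielsatz mit vielen Worten .", "Yes ."))
```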
%% Cell type:markdown id:f2beddcc4122495a tags:
## 1. Text tokenization
%% Cell type:code id:d8ccbafa97fba573 tags:
``` python
# set up the tokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.processors import TemplateProcessing

# set the unknown token (e.g. for emojis)
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# add the special tokens
# [UNK]  : unknown word/token
# [CLS]  : start token (new sentence sequence)
# [SEP]  : separator for chaining multiple sentences
# [PAD]  : padding needed for the encoder input
# [MASK] : masked token (used for masked-language-model objectives; not needed here)
trainer = BpeTrainer(vocab_size=50000, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# set up the pre-tokenizer -> this ensures that no token spans more than one word
from tokenizers.pre_tokenizers import Whitespace
tokenizer.pre_tokenizer = Whitespace()
```
%% Cell type:code id:55cbac65a50a0199 tags:
``` python
tokenizer.train(["data/training-data/eup/europarl-v7.de-en.de", "data/training-data/eup/europarl-v7.de-en.en"], trainer)

# configure post-processing
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

vocab_size = tokenizer.get_vocab_size()
```
%% Output
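The training cell produces no printed output, so a quick check (on a made-up sentence, not a corpus line) helps confirm that the trained tokenizer and the template post-processor behave as intended: every encoding should start with [CLS], end with [SEP], and the special-token ids follow the order given to the trainer, so [PAD] gets id 3, which is used as pad_id below.

``` python
sample = tokenizer.encode("The European Parliament adopted the resolution.")
print(sample.tokens)                    # starts with '[CLS]', ends with '[SEP]'
print(sample.ids)                       # corresponding vocabulary ids
print(tokenizer.token_to_id("[PAD]"))   # expected: 3
```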
%% Cell type:markdown id:9c0f853775a802ec tags:
## 2. Prepare the training data
%% Cell type:code id:2e4dc87ce98b6cdd tags:
``` python
# Prepare a training batch
def training_data(batch_size: int = 10, max_tokens: int = 50) -> tuple[torch.Tensor, torch.Tensor]:
    x_training_data = []
    y_training_data = []

    # select random sentences
    batch_indices = torch.randint(0, len(source), (batch_size,))
    for idx in batch_indices:
        x_training_data.append(target[idx])
        y_training_data.append(source[idx])

    # tokenize the data: x is padded to the longest sequence in the batch,
    # y is padded to a fixed length of max_tokens
    tokenizer.enable_padding(pad_id=3)
    x_training_data = tokenizer.encode_batch(x_training_data)
    tokenizer.enable_padding(pad_id=3, length=max_tokens)
    y_training_data = tokenizer.encode_batch(y_training_data)

    # extract the ids for every sequence
    for i in range(len(batch_indices)):
        x_training_data[i] = x_training_data[i].ids
        y_training_data[i] = y_training_data[i].ids

    # convert the x data to a tensor
    x_training_data = torch.tensor(x_training_data)

    # convert the y data to a tensor (one-hot encoding is not needed for the loss used below)
    #y_training_data = F.one_hot(torch.tensor(y_training_data), num_classes=vocab_size)
    y_training_data = torch.tensor(y_training_data)

    return x_training_data, y_training_data


print(training_data())
```
%% Output
(tensor([[ 1, 556, 9472, 344, 386, 346, 984, 362, 472, 18,
2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3],
[ 1, 502, 565, 8649, 9315, 401, 346, 1625, 1566, 16999,
2138, 401, 10036, 346, 72, 17, 14392, 1849, 363, 15202,
17748, 335, 344, 18, 2, 3, 3],
[ 1, 721, 342, 963, 335, 459, 1522, 360, 3311, 363,
1008, 13528, 25339, 18, 2, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3],
[ 1, 2657, 440, 4595, 3001, 20519, 362, 346, 3019, 1561,
39484, 12799, 335, 3897, 17, 13314, 472, 18, 2, 3,
3, 3, 3, 3, 3, 3, 3],
[ 1, 10169, 525, 359, 632, 360, 15202, 68, 4380, 375,
37324, 477, 19158, 936, 914, 342, 13922, 360, 359, 346,
1445, 1403, 335, 2624, 18, 2, 3],
[ 1, 14207, 16, 346, 4097, 362, 346, 5604, 4193, 10550,
338, 6024, 6130, 1009, 363, 341, 22069, 4488, 29208, 18,
2, 3, 3, 3, 3, 3, 3],
[ 1, 721, 546, 2619, 1048, 401, 21666, 7450, 360, 359,
6691, 360, 4356, 4897, 8267, 18, 2, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3],
[ 1, 721, 342, 470, 3184, 401, 990, 1478, 668, 359,
21775, 9083, 37514, 362, 346, 8748, 401, 882, 7700, 18,
2, 3, 3, 3, 3, 3, 3],
[ 1, 721, 342, 4702, 4371, 401, 380, 486, 6034, 7872,
363, 401, 380, 659, 14235, 2583, 335, 1141, 360, 1258,
1313, 882, 486, 39184, 682, 18, 2],
[ 1, 19047, 16, 941, 987, 16, 882, 440, 360, 359,
764, 1251, 401, 5140, 2, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3]]), tensor([[ 1, 596, 578, 339, 435, 367, 956, 2403, 7978, 18,
2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 582, 619, 415, 12866, 2317, 5209, 16, 475, 352,
8180, 3296, 4473, 8201, 16, 367, 367, 1755, 510, 372,
12175, 24927, 10069, 369, 3211, 9613, 17730, 3142, 18, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 467, 848, 2154, 4491, 352, 2016, 369, 367, 3126,
442, 6647, 8666, 23685, 448, 2812, 2171, 18, 2, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 720, 586, 7415, 17167, 352, 1744, 16, 549, 618,
628, 381, 729, 17335, 1014, 4289, 3610, 852, 12465, 3018,
18, 2, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 720, 570, 1359, 636, 6085, 27733, 1771, 784, 471,
11607, 510, 367, 28935, 406, 9854, 1962, 335, 2624, 20901,
834, 18, 2, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 5097, 6837, 367, 23413, 10643, 501, 442, 352, 7203,
18377, 369, 442, 35812, 3579, 12270, 34707, 396, 18, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 802, 415, 1869, 5362, 16, 655, 352, 7618, 729,
549, 504, 474, 335, 352, 2014, 415, 16, 13502, 3055,
19139, 18, 2, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 802, 786, 474, 7207, 16, 475, 1208, 2829, 17233,
358, 22631, 500, 18, 2, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 802, 415, 6441, 12248, 16, 475, 416, 448, 7130,
9857, 4790, 369, 475, 416, 5613, 2122, 5719, 834, 16,
427, 383, 1639, 16, 25574, 628, 12850, 18, 2, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 2475, 642, 339, 981, 3171, 916, 824, 500, 5140,
2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3]]))
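Note that enable_padding(length=max_tokens) pads shorter target sentences up to max_tokens but does not shorten longer ones, so a batch containing an encoding longer than max_tokens would yield ragged lists and make torch.tensor() fail. A small guard for that case (a sketch using the tokenizers truncation API, not part of the function above) is to cap the sequence length as well:

``` python
# sketch: cap every encoded sequence at max_tokens so all rows have equal length
max_tokens = 70
tokenizer.enable_truncation(max_length=max_tokens)
tokenizer.enable_padding(pad_id=3, length=max_tokens)

batch = tokenizer.encode_batch(["A short example sentence .", "Another short example ."])
print({len(enc.ids) for enc in batch})  # expected: {70}
```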
%% Cell type:markdown id:689e2e565cce2845 tags:
## 3. Build the sequence2sequence LSTM
%% Cell type:code id:e8d99510479108f4 tags:
``` python
embedding_dimension = 500

embedding_matrix_enc = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dimension)
embedding_matrix_dec = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dimension)


class Encoder(torch.nn.Module):
    def __init__(self, input_size: int, hidden_size: int, num_layers: int = 1, bidirectional: bool = False):
        super(Encoder, self).__init__()
        self._hidden_size = hidden_size
        self._num_layers = num_layers

        # embedding matrix
        self._embedding = embedding_matrix_enc

        # lstm layer
        self._lstm = torch.nn.LSTM(input_size=input_size,
                                   hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   bidirectional=bidirectional,
                                   batch_first=True)

        self._dropout = torch.nn.Dropout(0.1)

    def forward(self, sequence: torch.Tensor):
        embedded_sequence = self._embedding(sequence)
        # initial hidden and cell state (one per layer, for the whole batch)
        h_0 = torch.zeros(self._num_layers, embedded_sequence.size(0), self._hidden_size)
        c_0 = torch.zeros(self._num_layers, embedded_sequence.size(0), self._hidden_size)
        output, (hn, cn) = self._lstm(embedded_sequence, (h_0, c_0))
        return output, hn, cn


class Decoder(torch.nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_size: int,
                 num_layers: int = 1, bidirectional: bool = False,
                 max_tokens: int = 40):
        super(Decoder, self).__init__()
        self._hidden_size = hidden_size
        self._num_layers = num_layers
        self._max_tokens = max_tokens

        # embedding matrix
        self._embedding = embedding_matrix_dec

        # lstm layer
        self._lstm = torch.nn.LSTM(input_size=input_size,
                                   hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   bidirectional=bidirectional,
                                   batch_first=True)

        # output layer (fully connected linear layer)
        self._out = nn.Linear(hidden_size, output_size)

    def forward(self, enc_out: torch.Tensor, hidden_state: torch.Tensor,
                cell_state: torch.Tensor, target_tensor: torch.Tensor = None):
        batch_size = enc_out.size(0)
        outputs = []

        # prepare the start token ([CLS] has id 1)
        x_in = torch.empty(batch_size, 1, dtype=torch.long).fill_(1)

        for i in range(self._max_tokens):
            out, hidden_state, cell_state = self.forward_step(x_in, hidden_state, cell_state)
            outputs.append(out)

            if target_tensor is not None:
                # Teacher forcing: feed the target as the next input
                x_in = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use the model's own prediction as the next input
                _, topi = out.topk(1)
                x_in = topi.squeeze(-1).detach()  # detach from history as input

        # concatenate the per-step outputs along the time dimension
        outputs = torch.cat(outputs, dim=1)
        #outputs = F.log_softmax(outputs, dim=-1)
        return outputs, hidden_state, cell_state

    def forward_step(self, x_in, hidden_state, cell_state):
        output = self._embedding(x_in)
        output = F.relu(output)
        output, (h_t, c_t) = self._lstm(output, (hidden_state, cell_state))
        output = self._out(output)
        return output, h_t, c_t
```
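Before wiring the two modules into the training loop, a quick shape check is useful. This is only a sketch with dummy token ids and a small hidden size; with batch_first=True the encoder should return (batch, src_len, hidden) and the decoder (batch, max_tokens, vocab_size):

``` python
# sketch: run a dummy batch through Encoder and Decoder and inspect the shapes
_enc = Encoder(input_size=embedding_dimension, hidden_size=128)
_dec = Decoder(input_size=embedding_dimension, hidden_size=128,
               output_size=vocab_size, max_tokens=5)

dummy_ids = torch.randint(0, vocab_size, (4, 12))   # (batch, src_len)
enc_out, h, c = _enc(dummy_ids)
print(enc_out.shape)                                # torch.Size([4, 12, 128])

dec_out, _, _ = _dec(enc_out, h, c)
print(dec_out.shape)                                # torch.Size([4, 5, vocab_size])
```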
%% Cell type:markdown id:535bc20b2f12f2da tags:
## 4. Train the model
%% Cell type:code id:1f8d3152359f6658 tags:
``` python
LSTM_hidden_size = 500
max_tokens_per_sequence = 70

# create encoder / decoder instances
encoder = Encoder(input_size=embedding_dimension, hidden_size=LSTM_hidden_size)
decoder = Decoder(input_size=embedding_dimension, hidden_size=LSTM_hidden_size,
                  output_size=vocab_size, max_tokens=max_tokens_per_sequence)

num_epochs = 30
optimizer = torch.optim.Adam(params=list(encoder.parameters()) + list(decoder.parameters()), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

for i in range(1, num_epochs + 1):
    # reset gradients
    optimizer.zero_grad()

    # get training data
    x_train, y_train = training_data(batch_size=32, max_tokens=max_tokens_per_sequence)

    # make prediction (teacher forcing with the target sequence)
    encoder_out, encoder_h, encoder_c = encoder(x_train)
    predict, _, _ = decoder(encoder_out, encoder_h, encoder_c, y_train)

    # match dimensions of prediction & gold-label vector
    predict = predict.view(-1, predict.size(-1))
    y_train = y_train.view(-1)

    # calculate loss & propagate it backwards
    loss = criterion(predict, y_train)
    loss.backward()
    optimizer.step()

    if i % 10 == 0:
        print("---- Iteration " + str(i) + " ----")
        print("loss: " + str(loss.item()))
```
%% Output
---- Iteration 10 ----
loss: -0.045227549970149994
---- Iteration 20 ----
loss: -0.1404053419828415
---- Iteration 30 ----
loss: -0.22252100706100464
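Since every target sequence is padded to max_tokens_per_sequence with pad id 3, a large share of the positions the loss averages over are padding tokens. A possible refinement (a sketch, not what the cell above does) is to tell the criterion to ignore the pad id; the flattened logits/targets layout stays the same:

``` python
# sketch: exclude [PAD] positions (id 3) from the loss average
criterion_masked = torch.nn.CrossEntropyLoss(ignore_index=3)

# dummy tensors with the same flattened layout as in the loop above
dummy_vocab = 50000
logits = torch.randn(4 * 70, dummy_vocab)            # (batch * seq_len, vocab_size)
targets = torch.randint(0, dummy_vocab, (4 * 70,))   # (batch * seq_len,)
targets[200:] = 3                                    # pretend the tail is padding
print(criterion_masked(logits, targets))
```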
%% Cell type:markdown id:44f9b74f91565a4a tags:
## 5. Sample from the model
%% Cell type:code id:b95fb365f686125d tags:
``` python
test_sequence = ("Ist dies der Weg, oder nicht?.")
test_sequence = ("Hi are you there")
test_sequence_enc = tokenizer.encode(test_sequence)
print(test_sequence_enc.ids)
print(test_sequence_enc.tokens)
test_sequence_batched = torch.tensor(test_sequence_enc.ids).view(1, -1)
predict, _, _ = model(test_sequence_batched)
encoder_out, encoder_h, encoder_c = encoder(test_sequence_batched)
predict, _, _ = decoder(encoder_out, encoder_h, encoder_c)
_, topi = predict.topk(1)
decoded_ids = topi.squeeze()
print(decoded_ids)
tokenizer.decode(list(decoded_ids))
```
%% Output
['[CLS]', 'H', 'i', 'are', 'you', 'there', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
tensor([ 1, 1, 1, 32460, 32460, 28617, 7251, 47089, 38628, 38628,
38628, 38495, 31040, 14290, 4593, 41094, 13045, 17103, 45127, 18564,
5320, 5320, 5320, 5320, 5320, 5320, 5320, 7784, 7784, 34640,
5320, 5320, 5320, 5320, 5320, 5320, 7784, 7784, 34640, 5320,
5320, 5320, 5320, 5320, 5320, 7784, 7784, 34640, 5320, 5320,
5320, 5320, 5320, 5320, 7784, 7784, 34640, 5320, 5320, 5320,
5320, 5320, 5320, 7784, 7784, 34640, 5320, 5320, 5320, 5320])
'Quantität Quantität drastischen committees Vermächtnis jana jana jana Aufrichtigkeit Fusionen ströme More Tellereisen Mod verschoben emaking auszubauen unch unch unch unch unch unch unch popul popul Bin unch unch unch unch unch unch popul popul Bin unch unch unch unch unch unch popul popul Bin unch unch unch unch unch unch popul popul Bin unch unch unch unch unch unch popul popul Bin unch unch unch unch'
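The decoding steps above can be bundled into a small greedy-translation helper. This is only a sketch built on the encoder, decoder and tokenizer defined earlier; translate is a hypothetical helper name, and it simply cuts the output at the first [SEP] (id 2):

``` python
def translate(sentence: str) -> str:
    # greedy decoding sketch: encode the source, run encoder/decoder, cut at [SEP]
    enc = tokenizer.encode(sentence)
    src = torch.tensor(enc.ids).view(1, -1)

    with torch.no_grad():
        enc_out, h, c = encoder(src)
        logits, _, _ = decoder(enc_out, h, c)

    ids = logits.topk(1)[1].squeeze().tolist()
    if 2 in ids:                      # [SEP] marks the end of the sequence
        ids = ids[:ids.index(2)]
    return tokenizer.decode(ids)

print(translate("Hi are you there"))
```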