Commit f37b839d
authored Jan 17, 2024 by marvnsch

Add copy cat notebook
parent f6b60e06
Showing 1 changed file: copycat.ipynb (+93 additions, −25 deletions)
...
...
@@ -14,6 +14,7 @@
"import torch.optim as optim\n",
"import random\n",
"\n",
"from tinycss2 import tokenizer\n",
"from tokenizers import Tokenizer\n",
"from tokenizers.models import BPE\n",
"from tokenizers.trainers import BpeTrainer\n",
...
...
@@ -130,14 +131,18 @@
"source": [
"def training_data(source: list[str],\n",
" target: list[str],\n",
" dataset_size: int,\n",
" batch_size: int = 64,\n",
" sort: bool = True) -> tuple[torch.tensor, torch.tensor]:\n",
" tokenizer_de.no_padding()\n",
" tokenizer_en.no_padding()\n",
" \n",
" if dataset_size > len(source):\n",
" raise IndexError(\"Dataset size is larger than the source data\")\n",
" \n",
" # sort the training data if true\n",
" if sort:\n",
" temp = ([list(a) for a in zip(source
, target
)])\n",
" temp = ([list(a) for a in zip(source
[:dataset_size], target[:dataset_size]
)])\n",
" temp.sort(key=lambda s: len(s[0]) + len(s[1]))\n",
" source, target = list(zip(*temp))\n",
"\n",
...
...
@@ -176,9 +181,7 @@
"outputs": [],
"source": [
"# data test cell\n",
"print(len(de)/64)\n",
"\n",
"for idx, _ in enumerate(training_data(source=de, target=en, batch_size=64)):\n",
"for idx, _ in enumerate(training_data(source=de, target=en, dataset_size=10000, batch_size=64)):\n",
" print(idx)"
],
"metadata": {
...
...
@@ -236,7 +239,7 @@
" self.fc = nn.Linear(hidden_size, output_size)\n",
" \n",
" def forward(self, x, hidden, cell):\n",
" x = x.
view
(1, -1)\n",
" x = x.
reshape
(1, -1)\n",
" # shape x : (1, batch_size)\n",
" embedding = self.dropout(self.embedding(x))\n",
" # embedding shape : (1, batch_size, embedding_size)\n",
...
...
@@ -244,7 +247,7 @@
" # shape output : (1, batch_size, hidden_size)\n",
" predictions = self.fc(output)\n",
" # shape predictions : (1, batch_size, vocab_len)\n",
" predictions = predictions.squeeze(
1
)\n",
" predictions = predictions.squeeze(
0
)\n",
" \n",
" return predictions, hidden, cell\n",
" \n",
...
...
@@ -254,14 +257,13 @@
" self.encoder = encoder\n",
" self.decoder = decoder\n",
" \n",
" def forward(self, source, target, teacher_forcing_ratio: float = 0.5):\n",
" def forward(self, source, target
= None
, teacher_forcing_ratio: float = 0.5):\n",
" batch_size = source.shape[1]\n",
" target_len = target.shape[0]\n",
" \n",
" outputs = torch.zeros(target_len, batch_size, target_vocab_size)\n",
" \n",
" hidden, cell = self.encoder(source)\n",
" \n",
" x = target[0]\n",
" \n",
" for t in range(1, target_len):\n",
...
...
@@ -269,20 +271,22 @@
" \n",
" outputs[t] = output\n",
" \n",
" best_guess = output.argmax(
2
)\n",
" \n",
" best_guess = output.argmax(
1
)\n",
"
if target is not None:
\n",
" x = target[t] if random.random() < teacher_forcing_ratio else best_guess\n",
" else:\n",
" x = best_guess\n",
" return outputs"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-1
4T22:07:32.406911
Z",
"start_time": "2024-01-1
4T22:07:32.401944
Z"
"end_time": "2024-01-1
7T15:15:48.836634
Z",
"start_time": "2024-01-1
7T15:15:48.834028
Z"
}
},
"id": "3b2c4dbc74a1f144",
"execution_count":
129
"execution_count":
76
},
{
"cell_type": "markdown",
...
...
@@ -296,12 +300,21 @@
},
{
"cell_type": "code",
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/3\n"
]
}
],
"source": [
"# training hyperparameters\n",
"num_epochs =
20
\n",
"num_epochs =
3
\n",
"learning_rate = 0.001\n",
"batch_size = 64\n",
"batch_size = 128\n",
"dataset_size = 5000\n",
"\n",
"# model hyperparameters\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
...
...
@@ -312,8 +325,8 @@
"decoder_embedding_size = 300\n",
"hidden_size = 1024\n",
"num_layers = 2\n",
"encoder_dropout = 0.
5
\n",
"decoder_dropout = 0.
5
\n",
"encoder_dropout = 0.
1
\n",
"decoder_dropout = 0.
1
\n",
"\n",
"encoder_net = Encoder(input_size=input_size_encoder, \n",
" embedding_size=encoder_embedding_size, \n",
...
...
@@ -335,28 +348,83 @@
"\n",
"for epoch in range(num_epochs):\n",
" print('Epoch {}/{}'.format(epoch + 1, num_epochs))\n",
" loss_value = 0\n",
" \n",
" for batch_idx, (x_train, y_train) in enumerate(training_data(source=en, \n",
" target=en)):\n",
" target=en,\n",
" dataset_size=dataset_size,\n",
" batch_size=batch_size)):\n",
" optimizer.zero_grad()\n",
" \n",
" output = model(x_train, y_train)\n",
" output_debug = output\n",
" output = output[1:].reshape(-1, output.shape[2])\n",
" y_train = y_train[1:].reshape(-1)\n",
" \n",
" optimizer.zero_grad()\n",
" loss = criterion(output, y_train)\n",
" \n",
" loss.backward()\n",
" \n",
" torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)\n",
" print(batch_idx)\n",
" loss_value += loss.item()\n",
" \n",
" #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)\n",
" \n",
" print(\"loss: \" + str(loss
.item(
)))
\n
"
" print(\"loss: \" + str(loss
_value / (dataset_size / batch_size
)))"
],
"metadata": {
"collapsed": false
"collapsed": false,
"is_executing": true,
"ExecuteTime": {
"start_time": "2024-01-17T15:18:52.707157Z"
}
},
"id": "ee166d65b3b975d",
"execution_count": null
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([8, 1])\n",
"mischung Tragödie ergänzen ergänzen Mitbürger daß daß\n"
]
}
],
"source": [
"# test the model\n",
"\n",
"test_sentence_en = \"This is the first point.\"\n",
"test_sentence_de = \"Das ist der erste Punkt.\"\n",
"\n",
"test_sentence_en_encoded = tokenizer_en.encode(test_sentence_en)\n",
"test_sentence_de_encoded = tokenizer_de.encode(test_sentence_de)\n",
"\n",
"target_vector = torch.zeros(len(test_sentence_de_encoded.ids), 1)\n",
"\n",
"model.eval()\n",
"x_test = torch.transpose(torch.tensor([test_sentence_en_encoded.ids]), 0, 1)\n",
"y_test = torch.transpose(torch.tensor([test_sentence_de_encoded.ids]), 0, 1)\n",
"print(y_test.shape)\n",
"prediction = model(x_test, y_test, teacher_forcing_ratio=0.0)\n",
"\n",
"logits = torch.nn.functional.softmax(prediction, dim=2)\n",
"\n",
"result_ids = logits.argmax(dim=2)\n",
"\n",
"print(tokenizer_de.decode(list(result_ids)))"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-17T15:18:46.448865Z",
"start_time": "2024-01-17T15:18:46.419022Z"
}
},
"id": "fa8a86342abe0a97",
"execution_count": 79
}
],
"metadata": {
...
...
%% Cell type:code id:initial_id tags:
``` python
import torch
import torch.nn as nn
import torch.optim as optim
import random

from tinycss2 import tokenizer  # unused; appears to be an accidental auto-import
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
```
%% Cell type:markdown id:2b9477923b668978 tags:
# Data Preparation
%% Cell type:code id:dbc5f26f27746098 tags:
``` python
def load_data() -> tuple[list[str], list[str]]:
    with open("data/training-data/eup/europarl-v7.de-en.de", "r", encoding="utf8") as f:
        data_de = [line.rstrip("\n") for line in f]
    with open("data/training-data/eup/europarl-v7.de-en.en", "r", encoding="utf8") as f:
        data_en = [line.rstrip("\n") for line in f]

    ltd = set()  # save lines to delete later

    for i in range(max(len(data_de), len(data_en))):
        # If a line is empty in one file, merge the other file's sentence into the next line
        if data_de[i] == "":
            data_en[i + 1] = data_en[i] + " " + data_en[i + 1]
            ltd.add(i)
        if data_en[i] == "":
            data_de[i + 1] = data_de[i] + " " + data_de[i + 1]
            ltd.add(i)

        # Remove lines where the difference in word count is > 40%
        if abs(count_words(data_de[i]) - count_words(data_en[i])) / (max(count_words(data_de[i]), count_words(data_en[i])) + 1) > 0.4:
            ltd.add(i)

        # Remove lines < 3 words or > 10 words
        if max(count_words(data_de[i]), count_words(data_en[i])) < 3 or max(count_words(data_de[i]), count_words(data_en[i])) > 10:
            ltd.add(i)

    temp_de = [l for i, l in enumerate(data_de) if i not in ltd]
    data_de = temp_de
    temp_en = [l for i, l in enumerate(data_en) if i not in ltd]
    data_en = temp_en

    print(len(data_de), len(data_en))

    # Print 3 random sentence pairs
    ix = torch.randint(low=0, high=max(len(data_de), len(data_en)), size=(3,))
    for i in ix:
        print(f"Zeile: {i}\nDeutsch: {data_de[i]}\nEnglish: {data_en[i]}\n")

    print(f"\nNumber of lines: {len(data_de), len(data_en)}")

    return data_de, data_en


def count_words(string: str) -> int:
    return len(string.split())


de, en = load_data()

# setting the unknown token (e.g. for emojis)
tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer_de = Tokenizer(BPE(unk_token="[UNK]"))

# adding special tokens
# [UNK] : unknown word/token
# [SOS] : start-of-sequence token
# [EOS] : end-of-sequence token
# [PAD] : padding needed for encoder input
trainer = BpeTrainer(vocab_size=10000, special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]"])

tokenizer_en.pre_tokenizer = Whitespace()
tokenizer_de.pre_tokenizer = Whitespace()

tokenizer_en.train(["data/training-data/eup/europarl-v7.de-en.en"], trainer)
tokenizer_de.train(["data/training-data/eup/europarl-v7.de-en.de"], trainer)

# configure post processing
tokenizer_en.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    special_tokens=[
        ("[SOS]", tokenizer_en.token_to_id("[SOS]")),
        ("[EOS]", tokenizer_en.token_to_id("[EOS]")),
    ],
)
tokenizer_de.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    special_tokens=[
        ("[SOS]", tokenizer_de.token_to_id("[SOS]")),
        ("[EOS]", tokenizer_de.token_to_id("[EOS]")),
    ],
)

target_vocab_size = tokenizer_de.get_vocab_size()
source_vocab_size = tokenizer_en.get_vocab_size()
```
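As a quick illustration of what the cell above sets up (this check is not part of the commit; the sample sentence is made up), the trained tokenizers wrap every input in [SOS] … [EOS] via the TemplateProcessing post-processor and map it to the ids that training_data() later feeds the model:

``` python
# Sanity check (not in the commit): encode and decode one made-up sentence
# with the English tokenizer trained above.
sample = tokenizer_en.encode("This is a test sentence.")
print(sample.tokens)                    # subword tokens, wrapped in [SOS] ... [EOS]
print(sample.ids)                       # corresponding vocabulary ids
print(tokenizer_en.decode(sample.ids))  # round-trip back to text, special tokens stripped
```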
%% Cell type:code id:8edfacb67dc8c527 tags:
``` python
def training_data(source: list[str],
                  target: list[str],
                  dataset_size: int,
                  batch_size: int = 64,
                  sort: bool = True) -> tuple[torch.tensor, torch.tensor]:
    tokenizer_de.no_padding()
    tokenizer_en.no_padding()

    if dataset_size > len(source):
        raise IndexError("Dataset size is larger than the source data")

    # sort the training data if true
    if sort:
        temp = ([list(a) for a in zip(source[:dataset_size], target[:dataset_size])])
        temp.sort(key=lambda s: len(s[0]) + len(s[1]))
        source, target = list(zip(*temp))

    # iterate over the data in consecutive batches
    for i in range(0, len(source) - batch_size, batch_size):
        x_training_data = source[i:i + batch_size]
        y_training_data = target[i:i + batch_size]

        # tokenize data
        tokenizer_en.enable_padding(pad_id=3)
        x_training_data = tokenizer_en.encode_batch(x_training_data)
        tokenizer_de.enable_padding(pad_id=3)
        y_training_data = tokenizer_de.encode_batch(y_training_data)

        # extract ids for every sequence
        for j in range(batch_size):
            x_training_data[j] = x_training_data[j].ids
            y_training_data[j] = y_training_data[j].ids

        # put data into tensor
        x_training_data = torch.tensor(x_training_data)
        y_training_data = torch.tensor(y_training_data)

        # transpose tensors to match input requirements for lstm
        x_training_data = torch.transpose(x_training_data, 0, 1)
        y_training_data = torch.transpose(y_training_data, 0, 1)

        yield x_training_data, y_training_data
```
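To make the generator's output concrete, here is a minimal usage sketch (not part of the commit, and assuming the filtered corpus from load_data() contains at least 1000 sentence pairs): after the final transpose each batch is laid out as (sequence_len, batch_size), the batch_first=False convention that the nn.LSTM layers below expect.

``` python
# Usage sketch (not in the commit): fetch a single batch and inspect its layout.
x_batch, y_batch = next(training_data(source=de, target=en, dataset_size=1000, batch_size=4))
print(x_batch.shape)  # (padded source length, 4)
print(y_batch.shape)  # (padded target length, 4)
print(x_batch[:, 0])  # ids of the first source sentence, padded with id 3 ([PAD]) where needed
```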
%% Cell type:code id:524195fe40653308 tags:
``` python
# data test cell
for idx, _ in enumerate(training_data(source=de, target=en, dataset_size=10000, batch_size=64)):
    print(idx)
```
%% Cell type:markdown id:ca6d3d436fd31e33 tags:
### Model Definition
%% Cell type:code id:3b2c4dbc74a1f144 tags:
``` python
# Prepare model
class Encoder(nn.Module):
    def __init__(self, input_size: int, embedding_size: int, hidden_size: int,
                 num_layers: int, dropout_prob: float):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout_prob)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size,
                           num_layers=num_layers, dropout=dropout_prob)

    def forward(self, x):
        # shape x : (sequence_len, batch_size)
        embedding = self.dropout(self.embedding(x))
        # shape embedding : (sequence_len, batch_size, embedding_size)
        output, (hidden, cell) = self.rnn(embedding)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, input_size: int, embedding_size: int, hidden_size: int,
                 num_layers: int, output_size: int, dropout_prob: float):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout_prob)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size,
                           num_layers=num_layers, dropout=dropout_prob)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        x = x.reshape(1, -1)
        # shape x : (1, batch_size)
        embedding = self.dropout(self.embedding(x))
        # embedding shape : (1, batch_size, embedding_size)
        output, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # shape output : (1, batch_size, hidden_size)
        predictions = self.fc(output)
        # shape predictions : (1, batch_size, vocab_len)
        predictions = predictions.squeeze(0)

        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target=None, teacher_forcing_ratio: float = 0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]

        outputs = torch.zeros(target_len, batch_size, target_vocab_size)

        hidden, cell = self.encoder(source)

        x = target[0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)

            outputs[t] = output

            best_guess = output.argmax(1)

            if target is not None:
                x = target[t] if random.random() < teacher_forcing_ratio else best_guess
            else:
                x = best_guess

        return outputs
```
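Before training, a throwaway shape check can confirm the wiring (not part of the commit; the small layer sizes are arbitrary, and it assumes source_vocab_size and target_vocab_size from the tokenizer cell exist): Seq2Seq should return one row of scores per target position, i.e. a tensor of shape (target_len, batch_size, target_vocab_size), with position 0 left at zero because the loop starts at t = 1.

``` python
# Shape check (not in the commit): random token ids through an untrained model.
_enc = Encoder(input_size=source_vocab_size, embedding_size=32,
               hidden_size=64, num_layers=2, dropout_prob=0.1)
_dec = Decoder(input_size=target_vocab_size, embedding_size=32,
               hidden_size=64, num_layers=2, output_size=target_vocab_size,
               dropout_prob=0.1)
_model = Seq2Seq(encoder=_enc, decoder=_dec)

src = torch.randint(0, source_vocab_size, (12, 4))  # (source_len, batch_size)
tgt = torch.randint(0, target_vocab_size, (9, 4))   # (target_len, batch_size)
out = _model(src, tgt)
print(out.shape)  # torch.Size([9, 4, target_vocab_size])
```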
%% Cell type:markdown id:9854eaee8392caa1 tags:
### Model Training
%% Cell type:code id:ee166d65b3b975d tags:
``` python
# training hyperparameters
num_epochs = 3
learning_rate = 0.001
batch_size = 128
dataset_size = 5000

# model hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = source_vocab_size
input_size_decoder = target_vocab_size
output_size_decoder = target_vocab_size
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
encoder_dropout = 0.1
decoder_dropout = 0.1

encoder_net = Encoder(input_size=input_size_encoder,
                      embedding_size=encoder_embedding_size,
                      hidden_size=hidden_size,
                      num_layers=num_layers,
                      dropout_prob=encoder_dropout)

decoder_net = Decoder(input_size=input_size_decoder,
                      embedding_size=decoder_embedding_size,
                      hidden_size=hidden_size,
                      num_layers=num_layers,
                      dropout_prob=decoder_dropout,
                      output_size=output_size_decoder)

model = Seq2Seq(encoder=encoder_net, decoder=decoder_net)
criterion = nn.CrossEntropyLoss(ignore_index=3)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch + 1, num_epochs))
    loss_value = 0

    for batch_idx, (x_train, y_train) in enumerate(training_data(source=en,
                                                                  target=en,
                                                                  dataset_size=dataset_size,
                                                                  batch_size=batch_size)):
        optimizer.zero_grad()

        output = model(x_train, y_train)
        output_debug = output
        output = output[1:].reshape(-1, output.shape[2])
        y_train = y_train[1:].reshape(-1)

        loss = criterion(output, y_train)

        loss.backward()

        print(batch_idx)
        loss_value += loss.item()

        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    print("loss: " + str(loss_value / (dataset_size / batch_size)))
```
%% Output
Epoch 1/3
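Note that, at least in the code shown, the loop backpropagates but never calls optimizer.step(), so the weights would stay at their random initialization; this is consistent with the noise the test cell below produces. For comparison, here is a minimal sketch of one complete update step reusing the objects defined above (not part of the commit):

``` python
# One full optimization step (sketch, not in the commit), reusing model,
# criterion, optimizer and the hyperparameters from the training cell.
x_train, y_train = next(training_data(source=en, target=en,
                                      dataset_size=dataset_size,
                                      batch_size=batch_size))
optimizer.zero_grad()
output = model(x_train, y_train)                           # (target_len, batch, vocab)
loss = criterion(output[1:].reshape(-1, output.shape[2]),  # position 0 is never predicted
                 y_train[1:].reshape(-1))
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
optimizer.step()                                           # the call the loop above omits
print(loss.item())
```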
%% Cell type:code id:fa8a86342abe0a97 tags:
``` python
# test the model

test_sentence_en = "This is the first point."
test_sentence_de = "Das ist der erste Punkt."

test_sentence_en_encoded = tokenizer_en.encode(test_sentence_en)
test_sentence_de_encoded = tokenizer_de.encode(test_sentence_de)

target_vector = torch.zeros(len(test_sentence_de_encoded.ids), 1)

model.eval()
x_test = torch.transpose(torch.tensor([test_sentence_en_encoded.ids]), 0, 1)
y_test = torch.transpose(torch.tensor([test_sentence_de_encoded.ids]), 0, 1)
print(y_test.shape)
prediction = model(x_test, y_test, teacher_forcing_ratio=0.0)

logits = torch.nn.functional.softmax(prediction, dim=2)

result_ids = logits.argmax(dim=2)

print(tokenizer_de.decode(list(result_ids)))
```
%% Output
torch.Size([8, 1])
mischung Tragödie ergänzen ergänzen Mitbürger daß daß
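Two small notes on the inference cell, with a hedged cleanup sketch (not part of the commit): the variable named logits actually holds softmax probabilities, and since softmax is monotonic the argmax can be taken directly on the raw scores; tokenizer_de.decode() is documented to take a plain list of ids, so flattening the result tensor to ints first is a bit more robust than passing list(result_ids), which yields one-element tensors.

``` python
# Tidier decoding (sketch, not in the commit): argmax on the raw scores and
# flatten to plain ints before decoding.
result_ids = prediction.argmax(dim=2).squeeze(1).tolist()  # (target_len, 1) -> list[int]
print(tokenizer_de.decode(result_ids))
```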