Commit c415e5c6 authored Jan 19, 2024 by marvnsch

Make functions universal

parent d72a2c64

Showing 3 changed files with 147 additions and 3 deletions:

- LSTM_without_attention.ipynb: 44 additions, 3 deletions
- data/preprocessing.py: 93 additions, 0 deletions
- models/RNN_no_attention_unidirectional.py: 10 additions, 0 deletions
LSTM_without_attention.ipynb  +44 −3
The notebook's source diff contains two hunks:

- `@@ -33,7 +33,40 @@` — the data-preparation cell's empty `"outputs": []` is replaced by the captured execution output: a stdout stream with the corpus size, the three sample sentence pairs and the line count shown in the rendered output below, followed by a `NameError: name '__file__' is not defined` traceback raised at `workdir = Path(__file__).parent.absolute()` (tokenizer-saving code that now lives in `data/preprocessing.py`).
- `@@ -115,14 +148,22 @@` — the same cell's metadata gains an `ExecuteTime` block (start `2024-01-19T13:27:00`, end `2024-01-19T13:27:50`) and its `execution_count` changes from `null` to `7`.
%% Cell type:code id:initial_id tags:
```python
import torch
import torch.nn as nn
import torch.optim as optim
import random

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
```
%% Cell type:markdown id:2b9477923b668978 tags:
# Data Preparation
%% Cell type:code id:dbc5f26f27746098 tags:
```python
def load_data() -> tuple[list[str], list[str]]:
    with open("data/training-data/eup/europarl-v7.de-en.de", "r", encoding="utf8") as f:
        data_de = [line.rstrip("\n") for line in f]
    with open("data/training-data/eup/europarl-v7.de-en.en", "r", encoding="utf8") as f:
        data_en = [line.rstrip("\n") for line in f]

    ltd = set()  # save lines to delete later
    for i in range(max(len(data_de), len(data_en))):
        # If a line is empty in one file, merge the other file's sentence into the next line
        if data_de[i] == "":
            data_en[i + 1] = data_en[i] + " " + data_en[i + 1]
            ltd.add(i)
        if data_en[i] == "":
            data_de[i + 1] = data_de[i] + " " + data_de[i + 1]
            ltd.add(i)
        # Remove lines where the difference in word count is > 40%
        if abs(count_words(data_de[i]) - count_words(data_en[i])) / (
                max(count_words(data_de[i]), count_words(data_en[i])) + 1) > 0.4:
            ltd.add(i)
        # Remove lines with < 3 words or > 10 words
        if (max(count_words(data_de[i]), count_words(data_en[i])) < 3
                or max(count_words(data_de[i]), count_words(data_en[i])) > 10):
            ltd.add(i)

    temp_de = [l for i, l in enumerate(data_de) if i not in ltd]
    data_de = temp_de
    temp_en = [l for i, l in enumerate(data_en) if i not in ltd]
    data_en = temp_en

    print(len(data_de), len(data_en))

    # Print 3 random sentence pairs
    ix = torch.randint(low=0, high=max(len(data_de), len(data_en)), size=(3,))
    for i in ix:
        print(f"Zeile: {i}\nDeutsch: {data_de[i]}\nEnglish: {data_en[i]}\n")
    print(f"\nNumber of lines: {len(data_de), len(data_en)}")

    return data_de, data_en


def count_words(string: str) -> int:
    return len(string.split())


de, en = load_data()

# setting the unknown token (e.g. for emojis)
tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer_de = Tokenizer(BPE(unk_token="[UNK]"))

# adding special tokens
# [UNK] : unknown word/token
# [SOS] : start-of-sentence token
# [EOS] : end-of-sentence token
# [PAD] : padding needed for encoder input
trainer = BpeTrainer(vocab_size=10000, special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]"])

tokenizer_en.pre_tokenizer = Whitespace()
tokenizer_de.pre_tokenizer = Whitespace()

tokenizer_en.train(["data/training-data/eup/europarl-v7.de-en.en"], trainer)
tokenizer_de.train(["data/training-data/eup/europarl-v7.de-en.de"], trainer)

# configure post processing
tokenizer_en.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    special_tokens=[
        ("[SOS]", tokenizer_en.token_to_id("[SOS]")),
        ("[EOS]", tokenizer_en.token_to_id("[EOS]")),
    ],
)
tokenizer_de.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    special_tokens=[
        ("[SOS]", tokenizer_de.token_to_id("[SOS]")),
        ("[EOS]", tokenizer_de.token_to_id("[EOS]")),
    ],
)

target_vocab_size = tokenizer_de.get_vocab_size()
source_vocab_size = tokenizer_en.get_vocab_size()
```
%% Output
219633 219633
Zeile: 48925
Deutsch: Die Sicherheit sollte dabei einen hohen Stellenwert haben.
English: Safety should assume an important place.
Zeile: 183033
Deutsch: Aber der Bericht Azzolini bringt neue und wertvolle Denkanstöße.
English: But the Azzolini report contributes new and worthwhile approaches.
Zeile: 210873
Deutsch: Wir sind da also tief in Widersprüche verstrickt.
English: We are therefore overwhelmed with contradictions.
Number of lines: (219633, 219633)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[7], line 84
73 tokenizer_de.post_processor = TemplateProcessing(
74 single="[SOS] $A [EOS]",
75 special_tokens=[
(...)
78 ],
79 )
83 from pathlib import Path
---> 84 workdir = Path(__file__).parent.absolute()
86 tokenizer_de.save(str(workdir / "tokenizer_de.json"))
88 target_vocab_size = tokenizer_de.get_vocab_size()
NameError: name '__file__' is not defined
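The `NameError` above comes from running the cell while it still contained the tokenizer-saving code that this commit moves into `data/preprocessing.py`: `__file__` is only defined for scripts and modules, not inside a notebook kernel. A small sketch of a fallback that works in both contexts (the `workdir` name mirrors the code in the traceback):

```python
from pathlib import Path

# Sketch: resolve a working directory whether the code runs as a module
# (where __file__ exists) or inside a notebook kernel (where it does not).
try:
    workdir = Path(__file__).parent.absolute()
except NameError:          # no __file__ in an interactive/notebook session
    workdir = Path.cwd()   # fall back to the current working directory

# e.g. tokenizer_de.save(str(workdir / "tokenizer_de.json"))
```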
%% Cell type:code id:dac2a6b0b10d6bdf tags:
```python
# Define Device
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("device: cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("device: mps")
else:
    device = torch.device("cpu")
    print("device: cpu")
```
%% Cell type:code id:8edfacb67dc8c527 tags:
```python
def training_data(source: list[str], target: list[str], dataset_size: int,
                  batch_size: int = 64, sort: bool = True) -> tuple[torch.tensor, torch.tensor]:
    tokenizer_de.no_padding()
    tokenizer_en.no_padding()

    if dataset_size > len(source):
        raise IndexError("Dataset size is larger than the source data")

    # sort the training data if true
    if sort:
        temp = [list(a) for a in zip(source[:dataset_size], target[:dataset_size])]
        temp.sort(key=lambda s: len(s[0]) + len(s[1]))
        source, target = list(zip(*temp))

    # iterate over the (sorted) sentence pairs in batches
    for i in range(0, len(source) - batch_size, batch_size):
        x_training_data = source[i:i + batch_size]
        y_training_data = target[i:i + batch_size]

        # tokenize data
        tokenizer_en.enable_padding(pad_id=3)
        x_training_data = tokenizer_en.encode_batch(x_training_data)
        tokenizer_de.enable_padding(pad_id=3)
        y_training_data = tokenizer_de.encode_batch(y_training_data)

        # extract ids for every sequence
        for j in range(batch_size):
            x_training_data[j] = x_training_data[j].ids
            y_training_data[j] = y_training_data[j].ids

        # put data into tensors
        x_training_data = torch.tensor(x_training_data, device=device)
        y_training_data = torch.tensor(y_training_data, device=device)

        # transpose tensors to match input requirements for the lstm
        x_training_data = torch.transpose(x_training_data, 0, 1)
        y_training_data = torch.transpose(y_training_data, 0, 1)

        yield x_training_data, y_training_data
```
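The generator pads with `pad_id=3`, which matches `[PAD]` being the fourth entry in the trainer's `special_tokens` list, and yields tensors already transposed to `(sequence_len, batch_size)`, the layout `nn.LSTM` expects with `batch_first` left at its default. A minimal usage sketch, assuming the `en`/`de` lists, both tokenizers and `device` from the cells above are in scope (the sizes are arbitrary illustration values):

```python
# Inspect a single batch produced by the generator (sketch; sizes are arbitrary).
for x_batch, y_batch in training_data(source=en, target=de, dataset_size=1000, batch_size=8):
    print(x_batch.shape, x_batch.dtype)  # torch.Size([src_seq_len, 8]) torch.int64
    print(y_batch.shape, y_batch.dtype)  # torch.Size([tgt_seq_len, 8]) torch.int64
    break  # only look at the first batch
```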
%% Cell type:markdown id:ca6d3d436fd31e33 tags:
### Model Definition
%% Cell type:code id:3b2c4dbc74a1f144 tags:
```python
# Prepare model
class Encoder(nn.Module):
    def __init__(self, input_size: int, embedding_size: int, hidden_size: int, num_layers: int):
        super(Encoder, self).__init__()
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size, device=device)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size,
                           num_layers=num_layers, device=device)

    def forward(self, x):
        # shape x : (sequence_len, batch_size)
        embedding = self.embedding(x)
        # shape embedding : (sequence_len, batch_size, embedding_size)
        output, (hidden, cell) = self.rnn(embedding)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, input_size: int, embedding_size: int, hidden_size: int,
                 num_layers: int, output_size: int):
        super(Decoder, self).__init__()
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size, device=device)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size,
                           num_layers=num_layers, device=device)
        self.fc = nn.Linear(hidden_size, output_size, device=device)

    def forward(self, x, hidden, cell):
        x = x.reshape(1, -1)
        # shape x : (1, batch_size)
        embedding = self.embedding(x)
        # embedding shape : (1, batch_size, embedding_size)
        dec_output, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # shape output : (1, batch_size, hidden_size)
        predictions = self.fc(dec_output)
        # shape predictions : (1, batch_size, vocab_len)
        predictions = predictions.squeeze(0)
        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target=None, teacher_forcing_ratio: float = 0.5):
        dec_batch_size = source.shape[1]
        target_len = target.shape[0]

        outputs = torch.zeros(target_len, dec_batch_size, target_vocab_size, device=device)

        hidden, cell = self.encoder(source)

        x = target[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            if target is not None:
                x = target[t] if random.random() < teacher_forcing_ratio else best_guess
            else:
                x = best_guess

        return outputs


# DEBUG

# training hyperparameters
num_epochs = 50
learning_rate = 0.001
batch_size = 64
dataset_size = 100000

# model hyperparameters
input_size_encoder = source_vocab_size
input_size_decoder = target_vocab_size
output_size_decoder = target_vocab_size
encoder_embedding_size = 300
decoder_embedding_size = 300
model_hidden_size = 1024
model_num_layers = 2

encoder_net = Encoder(input_size=input_size_encoder,
                      embedding_size=encoder_embedding_size,
                      hidden_size=model_hidden_size,
                      num_layers=model_num_layers)

decoder_net = Decoder(input_size=input_size_decoder,
                      embedding_size=decoder_embedding_size,
                      hidden_size=model_hidden_size,
                      num_layers=model_num_layers,
                      output_size=output_size_decoder)

model = Seq2Seq(encoder=encoder_net, decoder=decoder_net)
model.train()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch + 1, num_epochs))
    loss_value = 0
    for batch_idx, (x_train, y_train) in enumerate(training_data(source=en, target=de,
                                                                 dataset_size=dataset_size,
                                                                 batch_size=batch_size)):
        optimizer.zero_grad()
        predict = model(x_train, y_train)
        predict = predict[1:].reshape(-1, predict.shape[2])
        y_train = y_train[1:].reshape(-1)
        #predict = predict.reshape(-1, predict.shape[2])
        #y_train = y_train.reshape(-1)
        loss = criterion(predict, y_train)
        loss.backward()
        optimizer.step()
        loss_value += loss.item()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    print("loss: " + str(loss_value / (dataset_size / batch_size)))
```
%% Output
Epoch 1/50
loss: 3.251950553474426
Epoch 2/50
loss: 2.5886474338912966
Epoch 3/50
loss: 2.2590867259407044
Epoch 4/50
loss: 1.9914781001472472
Epoch 5/50
loss: 1.7620788085746766
Epoch 6/50
loss: 1.526157800474167
Epoch 7/50
loss: 1.3238139695024491
Epoch 8/50
loss: 1.1473209317588806
Epoch 9/50
loss: 0.9837613027739525
Epoch 10/50
loss: 0.8556473323225975
Epoch 11/50
loss: 0.7408420387899876
Epoch 12/50
loss: 0.6378299428713322
Epoch 13/50
loss: 0.5609761751067638
Epoch 14/50
loss: 0.5038776994109154
Epoch 15/50
loss: 0.4588668634200096
Epoch 16/50
loss: 0.41355706704318523
Epoch 17/50
loss: 0.3838378005027771
Epoch 18/50
loss: 0.3672341006922722
Epoch 19/50
loss: 0.3412713519346714
Epoch 20/50
loss: 0.3302113852745295
Epoch 21/50
loss: 0.32545686868429186
Epoch 22/50
loss: 0.31084906596302986
Epoch 23/50
loss: 0.30543883962869645
Epoch 24/50
loss: 0.3089831199032068
Epoch 25/50
loss: 0.29750473050653936
Epoch 26/50
loss: 0.2859128334259987
Epoch 27/50
loss: 0.2887848159578443
Epoch 28/50
loss: 0.2810410741758347
Epoch 29/50
loss: 0.28322928114712237
Epoch 30/50
%% Cell type:markdown id:9854eaee8392caa1 tags:
### Model Parameters
%% Cell type:code id:a0d73467f967ecd9 tags:
```python
from prettytable import PrettyTable


def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params


count_parameters(model)
```
%% Cell type:markdown id:ea6107f129162137 tags:
### Test the model
%% Cell type:code id:fa8a86342abe0a97 tags:
```python
# test the model
test_sentence_en = "Can you reach out to me?"
test_sentence_de = "Kannst du mit mir in Kontakt treten."

test_sentence_en_encoded = tokenizer_en.encode(test_sentence_en)
test_sentence_de_encoded = tokenizer_de.encode(test_sentence_de)

target_vector = torch.zeros(len(test_sentence_de_encoded.ids), 1)

model.eval()

x_test = torch.transpose(torch.tensor([test_sentence_en_encoded.ids], device=device), 0, 1)
y_test = torch.transpose(torch.tensor([test_sentence_de_encoded.ids], device=device), 0, 1)
print(y_test.shape)

prediction = model(x_test, y_test, teacher_forcing_ratio=0.0)
logits = torch.nn.functional.softmax(prediction, dim=2)
result_ids = logits.argmax(dim=2)

print(tokenizer_de.decode(list(result_ids)))
```
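Note that `result_ids` coming out of `argmax(dim=2)` still has shape `(target_len, 1)`, so `list(result_ids)` hands one-element tensors rather than plain token ids to the decoder. A hedged variant of the final decode step that flattens to Python ints first, assuming the variables from the cell above (`skip_special_tokens` is the tokenizers-library flag for dropping tokens such as `[SOS]`/`[EOS]`/`[PAD]`):

```python
# Flatten (target_len, 1) -> list[int] before decoding (sketch).
ids = result_ids.squeeze(1).tolist()
print(tokenizer_de.decode(ids, skip_special_tokens=True))
```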
...
...
data/preprocessing.py  (new file, mode 100644)  +93 −0
```python
from pathlib import Path

import torch
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing


def get_prepared_data(source_data_path: str, target_data_path: str) -> tuple[list[str], list[str]]:
    with open(source_data_path, "r", encoding="utf8") as f:
        source_data = [line.rstrip("\n") for line in f]
    with open(target_data_path, "r", encoding="utf8") as f:
        target_data = [line.rstrip("\n") for line in f]

    ltd = set()  # save lines to delete later
    for i in range(max(len(source_data), len(target_data))):
        # If a line is empty in one file, merge the other file's sentence into the next line
        if source_data[i] == "":
            target_data[i + 1] = target_data[i] + " " + target_data[i + 1]
            ltd.add(i)
        if target_data[i] == "":
            source_data[i + 1] = source_data[i] + " " + source_data[i + 1]
            ltd.add(i)
        # Remove lines where the difference in word count is > 40%
        if abs(count_words(source_data[i]) - count_words(target_data[i])) / (
                max(count_words(source_data[i]), count_words(target_data[i])) + 1) > 0.4:
            ltd.add(i)
        # Remove lines with < 3 words or > 10 words
        if (max(count_words(source_data[i]), count_words(target_data[i])) < 3
                or max(count_words(source_data[i]), count_words(target_data[i])) > 10):
            ltd.add(i)

    temp_source = [l for i, l in enumerate(source_data) if i not in ltd]
    source_data = temp_source
    temp_target = [l for i, l in enumerate(target_data) if i not in ltd]
    target_data = temp_target

    print(len(source_data), len(target_data))

    # Print 3 random sentence pairs
    ix = torch.randint(low=0, high=max(len(source_data), len(target_data)), size=(3,))
    for i in ix:
        print(f"Zeile: {i}\nDeutsch: {source_data[i]}\nEnglish: {target_data[i]}\n")
    print(f"\nNumber of lines: {len(source_data), len(target_data)}")

    return source_data, target_data


def create_tokenizers(source_data_path: str, target_data_path: str,
                      source_language: str, target_language: str):
    # setting the unknown token (e.g. for emojis)
    tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer_de = Tokenizer(BPE(unk_token="[UNK]"))

    # adding special tokens
    # [UNK] : unknown word/token
    # [SOS] : start-of-sentence token
    # [EOS] : end-of-sentence token
    # [PAD] : padding needed for encoder input
    trainer = BpeTrainer(vocab_size=10000, special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]"])

    tokenizer_en.pre_tokenizer = Whitespace()
    tokenizer_de.pre_tokenizer = Whitespace()

    tokenizer_en.train(["data/training-data/eup/europarl-v7.de-en.en"], trainer)
    tokenizer_de.train(["data/training-data/eup/europarl-v7.de-en.de"], trainer)

    # configure post processing
    tokenizer_en.post_processor = TemplateProcessing(
        single="[SOS] $A [EOS]",
        special_tokens=[
            ("[SOS]", tokenizer_en.token_to_id("[SOS]")),
            ("[EOS]", tokenizer_en.token_to_id("[EOS]")),
        ],
    )
    tokenizer_de.post_processor = TemplateProcessing(
        single="[SOS] $A [EOS]",
        special_tokens=[
            ("[SOS]", tokenizer_de.token_to_id("[SOS]")),
            ("[EOS]", tokenizer_de.token_to_id("[EOS]")),
        ],
    )

    workdir = Path(__file__).parent.absolute()
    tokenizer_de.save(str(workdir / "tokenizer_de.json"))


def count_words(string: str) -> int:
    return len(string.split())
```
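The new module bundles the notebook's data cleaning and tokenizer training into reusable functions. As committed here, `create_tokenizers` still trains on the hard-coded Europarl paths and only saves `tokenizer_de.json`, so its path and language arguments are not used yet. A hypothetical usage sketch (the `from data import preprocessing` import path and the source/target assignment are assumptions based on the function signatures; the file paths are the ones used in this repository):

```python
# Hypothetical usage of the new helpers in data/preprocessing.py.
from data import preprocessing

de_sentences, en_sentences = preprocessing.get_prepared_data(
    source_data_path="data/training-data/eup/europarl-v7.de-en.de",
    target_data_path="data/training-data/eup/europarl-v7.de-en.en",
)
preprocessing.create_tokenizers(
    source_data_path="data/training-data/eup/europarl-v7.de-en.en",
    target_data_path="data/training-data/eup/europarl-v7.de-en.de",
    source_language="en",
    target_language="de",
)
```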
models/RNN_no_attention_unidirectional.py  +10 −0
```python
import torch
import torch.nn as nn
import torch.optim as optim
import random

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
```
\ No newline at end of file