Commit c415e5c6 authored by marvnsch's avatar marvnsch

Make functions universal

parent d72a2c64
%% Cell type:code id:initial_id tags:
``` python
import torch
import torch.nn as nn
import torch.optim as optim
import random
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
```
%% Cell type:markdown id:2b9477923b668978 tags:
# Data Preparation
%% Cell type:code id:dbc5f26f27746098 tags:
``` python
def load_data() -> tuple[list[str], list[str]]:
    with open("data/training-data/eup/europarl-v7.de-en.de", "r", encoding="utf8") as f:
        data_de = [line.rstrip("\n") for line in f]
    with open("data/training-data/eup/europarl-v7.de-en.en", "r", encoding="utf8") as f:
        data_en = [line.rstrip("\n") for line in f]

    ltd = set()  # indices of lines to delete later
    for i in range(min(len(data_de), len(data_en)) - 1):
        # If a line is empty in one file, the sentence in the other file spills
        # over; merge it into the next line and mark this line for removal
        if data_de[i] == "":
            data_en[i + 1] = data_en[i] + " " + data_en[i + 1]
            ltd.add(i)
        if data_en[i] == "":
            data_de[i + 1] = data_de[i] + " " + data_de[i + 1]
            ltd.add(i)
        # Remove pairs whose word counts differ by more than 40%
        if abs(count_words(data_de[i]) - count_words(data_en[i])) / (max(count_words(data_de[i]), count_words(data_en[i])) + 1) > 0.4:
            ltd.add(i)
        # Remove pairs with fewer than 3 or more than 10 words
        if not 3 <= max(count_words(data_de[i]), count_words(data_en[i])) <= 10:
            ltd.add(i)

    data_de = [l for i, l in enumerate(data_de) if i not in ltd]
    data_en = [l for i, l in enumerate(data_en) if i not in ltd]
    print(len(data_de), len(data_en))

    # Print 3 random sentence pairs
    ix = torch.randint(low=0, high=len(data_de), size=(3,))
    for i in ix:
        print(f"Zeile: {i}\nDeutsch: {data_de[i]}\nEnglish: {data_en[i]}\n")
    print(f"\nNumber of lines: {len(data_de), len(data_en)}")
    return data_de, data_en


def count_words(string: str) -> int:
    return len(string.split())


de, en = load_data()

# set the unknown token (e.g. for emojis)
tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer_de = Tokenizer(BPE(unk_token="[UNK]"))

# special tokens used by the trainer:
# [UNK] : unknown word/token
# [SOS] : start-of-sequence token
# [EOS] : end-of-sequence token
# [PAD] : padding, needed for batched encoder input
trainer = BpeTrainer(vocab_size=10000,
                     special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]"])
tokenizer_en.pre_tokenizer = Whitespace()
tokenizer_de.pre_tokenizer = Whitespace()
tokenizer_en.train(["data/training-data/eup/europarl-v7.de-en.en"], trainer)
tokenizer_de.train(["data/training-data/eup/europarl-v7.de-en.de"], trainer)

# configure post processing: wrap every sequence in [SOS] ... [EOS]
tokenizer_en.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    special_tokens=[
        ("[SOS]", tokenizer_en.token_to_id("[SOS]")),
        ("[EOS]", tokenizer_en.token_to_id("[EOS]")),
    ],
)
tokenizer_de.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    special_tokens=[
        ("[SOS]", tokenizer_de.token_to_id("[SOS]")),
        ("[EOS]", tokenizer_de.token_to_id("[EOS]")),
    ],
)

target_vocab_size = tokenizer_de.get_vocab_size()
source_vocab_size = tokenizer_en.get_vocab_size()
```
%% Output
219633 219633
Zeile: 48925
Deutsch: Die Sicherheit sollte dabei einen hohen Stellenwert haben.
English: Safety should assume an important place.
Zeile: 183033
Deutsch: Aber der Bericht Azzolini bringt neue und wertvolle Denkanstöße.
English: But the Azzolini report contributes new and worthwhile approaches.
Zeile: 210873
Deutsch: Wir sind da also tief in Widersprüche verstrickt.
English: We are therefore overwhelmed with contradictions.
Number of lines: (219633, 219633)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[7], line 84
73 tokenizer_de.post_processor = TemplateProcessing(
74 single="[SOS] $A [EOS]",
75 special_tokens=[
(...)
78 ],
79 )
83 from pathlib import Path
---> 84 workdir = Path(__file__).parent.absolute()
86 tokenizer_de.save(str(workdir / "tokenizer_de.json"))
88 target_vocab_size = tokenizer_de.get_vocab_size()
NameError: name '__file__' is not defined
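%% Cell type:markdown tags:
The post-processor should now wrap every encoding in [SOS] … [EOS]. A minimal sanity check, assuming the tokenizers above trained successfully (the sample sentence is illustrative):
%% Cell type:code tags:
``` python
# Encode a sample sentence and inspect the tokens and ids
sample = tokenizer_en.encode("The Commission has approved the report.")
print(sample.tokens)  # should start with '[SOS]' and end with '[EOS]'
print(sample.ids)     # ids within the 10,000-token BPE vocabulary
```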
%% Cell type:code id:dac2a6b0b10d6bdf tags:
``` python
# Define Device
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("device: cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("device: mps")
else:
    device = torch.device("cpu")
    print("device: cpu")
```
%% Cell type:code id:8edfacb67dc8c527 tags:
``` python
from collections.abc import Iterator


def training_data(source: list[str],
                  target: list[str],
                  dataset_size: int,
                  batch_size: int = 64,
                  sort: bool = True) -> Iterator[tuple[torch.Tensor, torch.Tensor]]:
    tokenizer_de.no_padding()
    tokenizer_en.no_padding()
    if dataset_size > len(source):
        raise IndexError("Dataset size is larger than the source data")
    # sort the training data by combined length so sentences in a batch
    # need similar amounts of padding
    if sort:
        temp = [list(a) for a in zip(source[:dataset_size], target[:dataset_size])]
        temp.sort(key=lambda s: len(s[0]) + len(s[1]))
        source, target = list(zip(*temp))
    # yield the data batch by batch
    for i in range(0, len(source) - batch_size, batch_size):
        x_training_data = source[i:i + batch_size]
        y_training_data = target[i:i + batch_size]
        # tokenize the batch; pad_id=3 is the index of "[PAD]" in the
        # trainer's special_tokens list
        tokenizer_en.enable_padding(pad_id=3)
        x_training_data = tokenizer_en.encode_batch(x_training_data)
        tokenizer_de.enable_padding(pad_id=3)
        y_training_data = tokenizer_de.encode_batch(y_training_data)
        # extract the ids of every sequence
        for j in range(batch_size):
            x_training_data[j] = x_training_data[j].ids
            y_training_data[j] = y_training_data[j].ids
        # put the data into tensors
        x_training_data = torch.tensor(x_training_data, device=device)
        y_training_data = torch.tensor(y_training_data, device=device)
        # transpose to (sequence_len, batch_size) to match the LSTM input layout
        x_training_data = torch.transpose(x_training_data, 0, 1)
        y_training_data = torch.transpose(y_training_data, 0, 1)
        yield x_training_data, y_training_data
```
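%% Cell type:markdown tags:
Because `training_data` is a generator, a single `next()` call is enough to sanity-check the batch shapes. A minimal sketch, assuming `en`, `de`, and `device` from the cells above are in scope:
%% Cell type:code tags:
``` python
# Tensors come out time-major, i.e. (sequence_len, batch_size), as the LSTM expects
x_batch, y_batch = next(training_data(source=en, target=de,
                                      dataset_size=1000, batch_size=4))
print(x_batch.shape, y_batch.shape)  # e.g. torch.Size([12, 4]) torch.Size([13, 4])
```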
%% Cell type:markdown id:ca6d3d436fd31e33 tags:
### Model Definition
%% Cell type:code id:3b2c4dbc74a1f144 tags:
``` python
# Prepare model
class Encoder(nn.Module):
    def __init__(self, input_size: int, embedding_size: int,
                 hidden_size: int, num_layers: int):
        super(Encoder, self).__init__()
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size, device=device)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size,
                           num_layers=num_layers, device=device)

    def forward(self, x):
        # shape x : (sequence_len, batch_size)
        embedding = self.embedding(x)
        # shape embedding : (sequence_len, batch_size, embedding_size)
        output, (hidden, cell) = self.rnn(embedding)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, input_size: int, embedding_size: int,
                 hidden_size: int, num_layers: int, output_size: int):
        super(Decoder, self).__init__()
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size, device=device)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size,
                           num_layers=num_layers, device=device)
        self.fc = nn.Linear(hidden_size, output_size, device=device)

    def forward(self, x, hidden, cell):
        x = x.reshape(1, -1)
        # shape x : (1, batch_size)
        embedding = self.embedding(x)
        # shape embedding : (1, batch_size, embedding_size)
        dec_output, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # shape dec_output : (1, batch_size, hidden_size)
        predictions = self.fc(dec_output)
        # shape predictions : (1, batch_size, vocab_len)
        predictions = predictions.squeeze(0)
        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_forcing_ratio: float = 0.5):
        # target is always required: it seeds the decoder with the [SOS] token
        # and fixes the output length; set teacher_forcing_ratio=0.0 to decode
        # purely from the model's own predictions
        dec_batch_size = source.shape[1]
        target_len = target.shape[0]
        outputs = torch.zeros(target_len, dec_batch_size, target_vocab_size, device=device)
        hidden, cell = self.encoder(source)
        x = target[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_forcing_ratio else best_guess
        return outputs


# training hyperparameters
num_epochs = 50
learning_rate = 0.001
batch_size = 64
dataset_size = 100000

# model hyperparameters
input_size_encoder = source_vocab_size
input_size_decoder = target_vocab_size
output_size_decoder = target_vocab_size
encoder_embedding_size = 300
decoder_embedding_size = 300
model_hidden_size = 1024
model_num_layers = 2

encoder_net = Encoder(input_size=input_size_encoder,
                      embedding_size=encoder_embedding_size,
                      hidden_size=model_hidden_size,
                      num_layers=model_num_layers)
decoder_net = Decoder(input_size=input_size_decoder,
                      embedding_size=decoder_embedding_size,
                      hidden_size=model_hidden_size,
                      num_layers=model_num_layers,
                      output_size=output_size_decoder)
model = Seq2Seq(encoder=encoder_net, decoder=decoder_net)
model.train()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    loss_value = 0
    for batch_idx, (x_train, y_train) in enumerate(training_data(source=en,
                                                                 target=de,
                                                                 dataset_size=dataset_size,
                                                                 batch_size=batch_size)):
        optimizer.zero_grad()
        predict = model(x_train, y_train)
        # drop time step 0 (it is never predicted) and flatten to
        # (sequence_len * batch_size, vocab_len) for cross-entropy
        predict = predict[1:].reshape(-1, predict.shape[2])
        y_train = y_train[1:].reshape(-1)
        loss = criterion(predict, y_train)
        loss.backward()
        # clip gradients before the optimizer step, not after it
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        loss_value += loss.item()
    # average over the number of batches actually seen
    print(f"loss: {loss_value / (batch_idx + 1)}")
```
%% Output
Epoch 1/50
loss: 3.251950553474426
Epoch 2/50
loss: 2.5886474338912966
Epoch 3/50
loss: 2.2590867259407044
Epoch 4/50
loss: 1.9914781001472472
Epoch 5/50
loss: 1.7620788085746766
Epoch 6/50
loss: 1.526157800474167
Epoch 7/50
loss: 1.3238139695024491
Epoch 8/50
loss: 1.1473209317588806
Epoch 9/50
loss: 0.9837613027739525
Epoch 10/50
loss: 0.8556473323225975
Epoch 11/50
loss: 0.7408420387899876
Epoch 12/50
loss: 0.6378299428713322
Epoch 13/50
loss: 0.5609761751067638
Epoch 14/50
loss: 0.5038776994109154
Epoch 15/50
loss: 0.4588668634200096
Epoch 16/50
loss: 0.41355706704318523
Epoch 17/50
loss: 0.3838378005027771
Epoch 18/50
loss: 0.3672341006922722
Epoch 19/50
loss: 0.3412713519346714
Epoch 20/50
loss: 0.3302113852745295
Epoch 21/50
loss: 0.32545686868429186
Epoch 22/50
loss: 0.31084906596302986
Epoch 23/50
loss: 0.30543883962869645
Epoch 24/50
loss: 0.3089831199032068
Epoch 25/50
loss: 0.29750473050653936
Epoch 26/50
loss: 0.2859128334259987
Epoch 27/50
loss: 0.2887848159578443
Epoch 28/50
loss: 0.2810410741758347
Epoch 29/50
loss: 0.28322928114712237
Epoch 30/50
%% Cell type:markdown id:9854eaee8392caa1 tags:
### Model Parameters
%% Cell type:code id:a0d73467f967ecd9 tags:
``` python
from prettytable import PrettyTable


def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params


count_parameters(model)
```
%% Cell type:markdown id:ea6107f129162137 tags:
### Test the model
%% Cell type:code id:fa8a86342abe0a97 tags:
``` python
# test the model
test_sentence_en = "Can you reach out to me?"
test_sentence_de = "Kannst du mit mir in Kontakt treten."

test_sentence_en_encoded = tokenizer_en.encode(test_sentence_en)
test_sentence_de_encoded = tokenizer_de.encode(test_sentence_de)

model.eval()
x_test = torch.transpose(torch.tensor([test_sentence_en_encoded.ids], device=device), 0, 1)
y_test = torch.transpose(torch.tensor([test_sentence_de_encoded.ids], device=device), 0, 1)
print(y_test.shape)

# with teacher_forcing_ratio=0.0 the decoder feeds back its own predictions;
# the reference translation only seeds the [SOS] token and sets the output length
prediction = model(x_test, y_test, teacher_forcing_ratio=0.0)
probs = torch.nn.functional.softmax(prediction, dim=2)
result_ids = probs.argmax(dim=2)  # shape: (sequence_len, 1)
print(tokenizer_de.decode(result_ids.squeeze(1).tolist()))
```
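%% Cell type:markdown tags:
Decoding with `teacher_forcing_ratio=0.0` still needs a reference translation to seed the decoder and fix the output length. A standalone greedy decoder avoids that; the helper below is a hypothetical sketch, assuming the trained `model`, both tokenizers, and `device` are in scope:
%% Cell type:code tags:
``` python
def greedy_translate(sentence: str, max_len: int = 20) -> str:
    """Translate a single sentence with greedy decoding (illustrative sketch)."""
    model.eval()
    with torch.no_grad():
        # encode the source sentence as a (sequence_len, 1) tensor
        src = torch.transpose(torch.tensor([tokenizer_en.encode(sentence).ids],
                                           device=device), 0, 1)
        hidden, cell = model.encoder(src)
        # start decoding from the [SOS] token
        x = torch.tensor([tokenizer_de.token_to_id("[SOS]")], device=device)
        out_ids = []
        for _ in range(max_len):
            output, hidden, cell = model.decoder(x, hidden, cell)
            x = output.argmax(1)
            if x.item() == tokenizer_de.token_to_id("[EOS]"):
                break
            out_ids.append(x.item())
    return tokenizer_de.decode(out_ids)


print(greedy_translate(test_sentence_en))
```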
......
from pathlib import Path
import torch
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
def get_prepared_data(source_data_path: str, target_data_path: str) -> tuple[list[str], list[str]]:
    with open(source_data_path, "r", encoding="utf8") as f:
        source_data = [line.rstrip("\n") for line in f]
    with open(target_data_path, "r", encoding="utf8") as f:
        target_data = [line.rstrip("\n") for line in f]

    ltd = set()  # indices of lines to delete later
    for i in range(min(len(source_data), len(target_data)) - 1):
        # If a line is empty in one file, the sentence in the other file spills
        # over; merge it into the next line and mark this line for removal
        if source_data[i] == "":
            target_data[i + 1] = target_data[i] + " " + target_data[i + 1]
            ltd.add(i)
        if target_data[i] == "":
            source_data[i + 1] = source_data[i] + " " + source_data[i + 1]
            ltd.add(i)
        # Remove pairs whose word counts differ by more than 40%
        if abs(count_words(source_data[i]) - count_words(target_data[i])) / (max(count_words(source_data[i]), count_words(target_data[i])) + 1) > 0.4:
            ltd.add(i)
        # Remove pairs with fewer than 3 or more than 10 words
        if not 3 <= max(count_words(source_data[i]), count_words(target_data[i])) <= 10:
            ltd.add(i)

    source_data = [l for i, l in enumerate(source_data) if i not in ltd]
    target_data = [l for i, l in enumerate(target_data) if i not in ltd]
    print(len(source_data), len(target_data))

    # Print 3 random sentence pairs
    ix = torch.randint(low=0, high=len(source_data), size=(3,))
    for i in ix:
        print(f"Line: {i}\nSource: {source_data[i]}\nTarget: {target_data[i]}\n")
    print(f"\nNumber of lines: {len(source_data), len(target_data)}")
    return source_data, target_data


def create_tokenizers(source_data_path: str, target_data_path: str, source_language: str, target_language: str):
    # set the unknown token (e.g. for emojis)
    tokenizer_source = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer_target = Tokenizer(BPE(unk_token="[UNK]"))

    # special tokens used by the trainer:
    # [UNK] : unknown word/token
    # [SOS] : start-of-sequence token
    # [EOS] : end-of-sequence token
    # [PAD] : padding, needed for batched encoder input
    trainer = BpeTrainer(vocab_size=10000,
                         special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]"])
    tokenizer_source.pre_tokenizer = Whitespace()
    tokenizer_target.pre_tokenizer = Whitespace()
    # train on the files passed in, not on hard-coded paths
    tokenizer_source.train([source_data_path], trainer)
    tokenizer_target.train([target_data_path], trainer)

    # configure post processing: wrap every sequence in [SOS] ... [EOS]
    tokenizer_source.post_processor = TemplateProcessing(
        single="[SOS] $A [EOS]",
        special_tokens=[
            ("[SOS]", tokenizer_source.token_to_id("[SOS]")),
            ("[EOS]", tokenizer_source.token_to_id("[EOS]")),
        ],
    )
    tokenizer_target.post_processor = TemplateProcessing(
        single="[SOS] $A [EOS]",
        special_tokens=[
            ("[SOS]", tokenizer_target.token_to_id("[SOS]")),
            ("[EOS]", tokenizer_target.token_to_id("[EOS]")),
        ],
    )

    # save both tokenizers next to this file, named after their language
    workdir = Path(__file__).parent.absolute()
    tokenizer_source.save(str(workdir / f"tokenizer_{source_language}.json"))
    tokenizer_target.save(str(workdir / f"tokenizer_{target_language}.json"))
    return tokenizer_source, tokenizer_target


def count_words(string: str) -> int:
    return len(string.split())
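

# Illustrative usage of the universal helpers above (a sketch, not part of the
# module's API): the paths match the Europarl files used in the notebook; the
# language tags "en"/"de" are assumptions for the saved tokenizer file names.
if __name__ == "__main__":
    source_data, target_data = get_prepared_data(
        "data/training-data/eup/europarl-v7.de-en.en",
        "data/training-data/eup/europarl-v7.de-en.de",
    )
    tokenizer_en, tokenizer_de = create_tokenizers(
        "data/training-data/eup/europarl-v7.de-en.en",
        "data/training-data/eup/europarl-v7.de-en.de",
        source_language="en",
        target_language="de",
    )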