Commit d5779d6a authored by marvnsch

Add some stuff :)

parent 0dfe5c89
@@ -8,6 +8,8 @@ from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
work_dir = Path(__file__).parent.absolute()
def get_prepared_data(source_data_path: str,
                      target_data_path: str,
@@ -51,7 +53,6 @@ def get_prepared_data(source_data_path: str,
    source_data = temp_source
    temp_target = [l for i, l in enumerate(target_data) if i not in ltd]
    target_data = temp_target
    print(len(source_data), len(target_data))
    if debug:
        # Print 3 random sentence pairs
@@ -67,44 +68,63 @@ def count_words(string: str) -> int:
    return len(string.split())
def create_tokenizers(source_data_path: str, target_data_path: str, vocab_size: int, special_tokens=None):
    # define tokenizer names
    tokenizer_source_save_name = (f"tk-{source_data_path.split('/')[-1].replace('.', '-')}-"
                                  f"sptk-{special_tokens if special_tokens is None else ''.join(special_tokens)}-"
                                  f"vcsz-{str(vocab_size)}")
    tokenizer_target_save_name = (f"tk-{target_data_path.split('/')[-1].replace('.', '-')}-"
                                  f"sptk-{special_tokens if special_tokens is None else ''.join(special_tokens)}-"
                                  f"vcsz-{str(vocab_size)}")
    # setting the special tokens & checking for [UNK] token
    if special_tokens is None:
        special_tokens = ["[UNK]", "[SOS]", "[EOS]", "[PAD]"]
    elif "[UNK]" not in special_tokens:
        raise ValueError("The [UNK] token is required and was not found within the special token list!")
def create_tokenizers(source_data_path: str, target_data_path: str, source_language: str, target_language: str):
    # setting the unknown token (e.g. for emojis)
    tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer_de = Tokenizer(BPE(unk_token="[UNK]"))
    # check if a matching tokenizer already exists on disk (saved version)
    try:
        # Tokenizer.from_file is a static constructor; use its return value
        # (it raises if the file does not exist)
        tokenizer_source = Tokenizer.from_file(str(work_dir / f"tokenizer/{tokenizer_source_save_name}"))
        tokenizer_target = Tokenizer.from_file(str(work_dir / f"tokenizer/{tokenizer_target_save_name}"))
        return tokenizer_source, tokenizer_target
    except Exception:
        print("No matching tokenizer found on disk - a new tokenizer will be trained (and saved to disk)")
    # adding special tokens
    # [UNK] : unknown word/token
    # [SOS] : start-of-sequence token
    # [EOS] : end-of-sequence token
    # [PAD] : padding needed for encoder input
    trainer = BpeTrainer(vocab_size=10000,
                         special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]"])
    tokenizer_source = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer_target = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer_en.pre_tokenizer = Whitespace()
    tokenizer_de.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(vocab_size=vocab_size,
                         special_tokens=special_tokens)
    tokenizer_en.train(["data/training-data/eup/europarl-v7.de-en.en"], trainer)
    tokenizer_de.train(["data/training-data/eup/europarl-v7.de-en.de"], trainer)
    tokenizer_source.pre_tokenizer = Whitespace()
    tokenizer_target.pre_tokenizer = Whitespace()
    tokenizer_source.train([source_data_path], trainer)
    tokenizer_target.train([target_data_path], trainer)
    # configure post processing
    tokenizer_en.post_processor = TemplateProcessing(
    tokenizer_source.post_processor = TemplateProcessing(
        single="[SOS] $A [EOS]",
        special_tokens=[
            ("[SOS]", tokenizer_en.token_to_id("[SOS]")),
            ("[EOS]", tokenizer_en.token_to_id("[EOS]")),
            ("[SOS]", tokenizer_source.token_to_id("[SOS]")),
            ("[EOS]", tokenizer_source.token_to_id("[EOS]")),
        ],
    )
    tokenizer_de.post_processor = TemplateProcessing(
    tokenizer_target.post_processor = TemplateProcessing(
        single="[SOS] $A [EOS]",
        special_tokens=[
            ("[SOS]", tokenizer_de.token_to_id("[SOS]")),
            ("[EOS]", tokenizer_de.token_to_id("[EOS]")),
            ("[SOS]", tokenizer_target.token_to_id("[SOS]")),
            ("[EOS]", tokenizer_target.token_to_id("[EOS]")),
        ],
    )
    workdir = Path(__file__).parent.absolute()
    tokenizer_de.save(str(workdir / "tokenizer_de.json"))
    tokenizer_source.save(str(work_dir / f"tokenizer/{tokenizer_source_save_name}"))
    tokenizer_target.save(str(work_dir / f"tokenizer/{tokenizer_target_save_name}"))
    return tokenizer_source, tokenizer_target
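For quick reference, here is a minimal usage sketch of the reworked function; the paths, vocab size and sample sentence are illustrative and not part of the commit. Note that when special_tokens is left as None, the cache name is built before the defaults are filled in, so it literally contains "None" (e.g. tk-news-commentary-v11-en-sptk-None-vcsz-10000).

source_tok, target_tok = create_tokenizers(
    source_data_path="data/tokenizer-data/news-commentary-v11.en",  # assumed path, matches the training script below
    target_data_path="data/tokenizer-data/news-commentary-v11.de",
    vocab_size=10000,
)
enc = source_tok.encode("This is a test.")
print(enc.tokens)  # e.g. ['[SOS]', 'This', 'is', 'a', 'test', '.', '[EOS]'] - exact splits depend on the learned merges
print(enc.ids)     # corresponding ids, with [SOS]/[EOS] added by the TemplateProcessing post-processor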
def data_loader(source: list[str],
@@ -118,7 +138,7 @@ def data_loader(source: list[str],
                sort: bool = True):
    if sum(data_split) != 1.0:
        raise ValueError(f"The data split must add up to one")
        raise ValueError("The data split must add up to one")
    if dataset_size > len(source):
        raise IndexError("Dataset size is larger than the source data")
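One caveat about the split check above: with float splits such as (0.7, 0.2, 0.1), sum(data_split) evaluates to 0.9999999999999999, so the strict != 1.0 comparison rejects a perfectly reasonable split. A tolerance-based check is a possible alternative; the helper below is a sketch, not part of the commit, and assumes data_split is a sequence of floats.

import math

def validate_split(data_split) -> None:
    # Hypothetical helper (not in the repository): accept splits that sum to 1.0
    # within floating-point tolerance instead of requiring exact equality.
    if not math.isclose(sum(data_split), 1.0, abs_tol=1e-9):
        raise ValueError("The data split must add up to one")

validate_split((0.7, 0.2, 0.1))  # passes, even though sum((0.7, 0.2, 0.1)) == 0.9999999999999999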
......
@@ -8,6 +8,7 @@ import torch.optim as optim
import utils.pytorch
import utils.training
import data.preprocessing
project_root = Path(__file__).parent.parent.parent.absolute()
work_dir = Path(__file__).parent.absolute()
@@ -92,8 +93,8 @@ class Seq2Seq(nn.Module):
# setup environment
source_data_path = str(project_root / "data/training-data/news-commentary-v11.en")
target_data_path = str(project_root / "data/training-data/news-commentary-v11.de")
source_data_path = str(project_root / "data/tokenizer-data/news-commentary-v11.en")
target_data_path = str(project_root / "data/tokenizer-data/news-commentary-v11.de")
device = utils.pytorch.get_available_device()
date_time_now = datetime.now().strftime("%m%d%Y_%H%M")
model_output_path = str(work_dir / f"./checkpoints/{date_time_now}RNN_no_attention_unidirectional")
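For reference, the strftime pattern used in model_output_path produces checkpoint names of the following form (the timestamp below is an arbitrary example, not taken from the repository):

from datetime import datetime

# Illustrative only: a fixed timestamp to show the resulting checkpoint name.
stamp = datetime(2024, 6, 15, 9, 5).strftime("%m%d%Y_%H%M")
print(stamp)  # -> 06152024_0905
# model_output_path then ends in "checkpoints/06152024_0905RNN_no_attention_unidirectional"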
@@ -108,10 +109,10 @@ decoder_embedding_size = 300
model_hidden_size = 1024
model_num_layers = 2
num_epochs = 50
num_epochs = 1
learning_rate = 0.001
batch_size = 64
dataset_size = 100000
dataset_size = 1000
# create model
encoder_net = Encoder(input_size=input_size_encoder,
@@ -136,11 +137,23 @@ criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# get training data
source_data, target_data = data.preprocessing.get_prepared_data(source_data_path=source_data_path,
                                                                 target_data_path=target_data_path)
source_tokenizer, target_tokenizer = data.preprocessing.create_tokenizers(source_data_path=source_data_path,
                                                                           target_data_path=target_data_path,
                                                                           vocab_size=vocab_size)
training_loader, develop_loader, test_loader = data.preprocessing.data_loader(source=source_data,
                                                                               target=target_data,
                                                                               batch_size=batch_size,
                                                                               source_tokenizer=source_tokenizer,
                                                                               target_tokenizer=target_tokenizer,
                                                                               dataset_size=dataset_size,
                                                                               torch_device=device)
# train the model
utils.training.train(model=model,
                     train_loader=training_loader,
                     val_loader=validation_loader,
                     val_loader=test_loader,
                     criterion=criterion,
                     optimizer=optimizer,
                     num_epochs=num_epochs,
......