Skip to content
Snippets Groups Projects
Commit 0a94d40e authored by marvnsch's avatar marvnsch
Browse files

Push changes

parent d1a3b770
No related branches found
No related tags found
No related merge requests found
...@@ -8,7 +8,7 @@ import torch.nn as nn ...@@ -8,7 +8,7 @@ import torch.nn as nn
import torch.optim as optim import torch.optim as optim
import utils.pytorch import utils.pytorch
import utils.training import utils.operation
import data.preprocessing import data.preprocessing
project_root = Path(__file__).parent.parent.parent.absolute() project_root = Path(__file__).parent.parent.parent.absolute()
...@@ -99,7 +99,7 @@ source_data_path = str(project_root / "data/tokenizer-data/news-commentary-v11.e ...@@ -99,7 +99,7 @@ source_data_path = str(project_root / "data/tokenizer-data/news-commentary-v11.e
target_data_path = str(project_root / "data/tokenizer-data/news-commentary-v11.de") target_data_path = str(project_root / "data/tokenizer-data/news-commentary-v11.de")
device = utils.pytorch.get_available_device() device = utils.pytorch.get_available_device()
date_time_now = datetime.now().strftime("%m%d%Y_%H%M") date_time_now = datetime.now().strftime("%m%d%Y_%H%M")
model_output_path = str(work_dir / f"./checkpoints/{date_time_now}_RNN_no_attention_unidirectional.pt") model_output_path = str(work_dir / f"./checkpoints/{date_time_now}_RNN_no_attention_bidirectional.pt")
# define hyperparameters # define hyperparameters
vocab_size = 10000 vocab_size = 10000
......
...@@ -8,7 +8,7 @@ import torch.nn as nn ...@@ -8,7 +8,7 @@ import torch.nn as nn
import torch.optim as optim import torch.optim as optim
import utils.pytorch import utils.pytorch
import utils.training import utils.operation
import data.preprocessing import data.preprocessing
project_root = Path(__file__).parent.parent.parent.absolute() project_root = Path(__file__).parent.parent.parent.absolute()
...@@ -114,7 +114,7 @@ model_num_layers = 2 ...@@ -114,7 +114,7 @@ model_num_layers = 2
num_epochs = 50 num_epochs = 50
learning_rate = 0.001 learning_rate = 0.001
batch_size = 64 batch_size = 64
dataset_size = 100000 dataset_size = 10000
train_dev_val_split = (.8, .1, .1) train_dev_val_split = (.8, .1, .1)
train_batches_count = int(train_dev_val_split[0] * dataset_size // batch_size) train_batches_count = int(train_dev_val_split[0] * dataset_size // batch_size)
...@@ -168,7 +168,7 @@ except FileExistsError: ...@@ -168,7 +168,7 @@ except FileExistsError:
# train the model # train the model
utils.training.train(model=model, utils.operation.train(model=model,
data_loader=data_loader, data_loader=data_loader,
criterion=criterion, criterion=criterion,
optimizer=optimizer, optimizer=optimizer,
...@@ -176,3 +176,12 @@ utils.training.train(model=model, ...@@ -176,3 +176,12 @@ utils.training.train(model=model,
num_of_batches_per_epoch=train_batches_count, num_of_batches_per_epoch=train_batches_count,
saving_interval=500, saving_interval=500,
model_output_path=model_output_path) model_output_path=model_output_path)
_, _, val_data_loader = data_loader()
# evaluate the model
utils.operation.evaluate(model=model,
val_data_loader=val_data_loader,
source_tokenizer=source_tokenizer,
target_tokenizer=target_tokenizer,
print_num_examples=10)
...@@ -6,15 +6,40 @@ from datetime import datetime ...@@ -6,15 +6,40 @@ from datetime import datetime
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.optim as optim import torch.optim as optim
from tinycss2 import tokenizer
import utils.pytorch import utils.pytorch
import utils.training import utils.operation
import data.preprocessing import data.preprocessing
project_root = Path(__file__).parent.parent.parent.absolute() project_root = Path(__file__).parent.parent.parent.absolute()
work_dir = Path(__file__).parent.absolute() work_dir = Path(__file__).parent.absolute()
class BahdanauAttention(nn.Module):
    """Additive (Bahdanau-style) attention over a sequence of encoder outputs.

    Scores each encoder position against the decoder's current hidden state,
    softmax-normalizes the scores, and returns the weighted sum of the
    encoder outputs (the context vector) plus the attention weights.
    """

    def __init__(self, hidden_size: int, bidirectional: bool, num_layers: int,
                 torch_device: torch.device):
        super().__init__()
        num_directions = 2 if bidirectional else 1
        # Projects the flattened decoder hidden state (all layers/directions).
        self.Wa = nn.Linear(hidden_size * num_layers * num_directions,
                            hidden_size, device=torch_device)
        # Projects each encoder output ("key") into the same space.
        self.Ua = nn.Linear(hidden_size * num_directions, hidden_size,
                            device=torch_device)
        # Collapses the combined projection to one scalar score per position.
        self.Va = nn.Linear(hidden_size, 1, device=torch_device)

    def forward(self, query, keys):
        """Return (context, weights) for one decoding step.

        query: (batch, hidden * layers * directions) flattened decoder state.
        keys:  (seq, batch, enc_feature_dim) encoder outputs.
        context: (1, batch, enc_feature_dim); weights: (batch, 1, seq).
        """
        # Wa(query) broadcasts over the seq dimension of Ua(keys).
        energies = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        # (seq, batch, 1) -> (batch, 1, seq) so softmax runs over seq.
        energies = energies.permute(1, 0, 2).squeeze(2).unsqueeze(1)
        weights = torch.nn.functional.softmax(energies, dim=-1)
        # Batched weighted sum: (batch, 1, seq) @ (batch, seq, feat).
        context = torch.bmm(weights, keys.permute(1, 0, 2))
        # Restore the (seq=1, batch, feat) layout the decoder expects.
        return context.permute(1, 0, 2), weights
class Encoder(nn.Module): class Encoder(nn.Module):
def __init__(self, input_size: int, embedding_size: int, def __init__(self, input_size: int, embedding_size: int,
hidden_size: int, num_layers: int, torch_device: torch.device, hidden_size: int, num_layers: int, torch_device: torch.device,
...@@ -31,7 +56,7 @@ class Encoder(nn.Module): ...@@ -31,7 +56,7 @@ class Encoder(nn.Module):
embedding = self.embedding(x) embedding = self.embedding(x)
# shape embedding : sequence_len, batch_size, embedding_size) # shape embedding : sequence_len, batch_size, embedding_size)
output, (hidden, cell) = self.rnn(embedding) output, (hidden, cell) = self.rnn(embedding)
return hidden, cell return output, hidden, cell
class Decoder(nn.Module): class Decoder(nn.Module):
...@@ -40,24 +65,32 @@ class Decoder(nn.Module): ...@@ -40,24 +65,32 @@ class Decoder(nn.Module):
torch_device: torch.device, bidirectional: bool = False): torch_device: torch.device, bidirectional: bool = False):
super(Decoder, self).__init__() super(Decoder, self).__init__()
self.num_layers = num_layers self.num_layers = num_layers
self.lstm_input_size = hidden_size * 2 if bidirectional else hidden_size
self.embedding = nn.Embedding(input_size, embedding_size, device=torch_device) self.embedding = nn.Embedding(input_size, embedding_size, device=torch_device)
self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, self.attention = BahdanauAttention(hidden_size=hidden_size, bidirectional=bidirectional,
torch_device=torch_device, num_layers=num_layers)
self.rnn = nn.LSTM(input_size=embedding_size + self.lstm_input_size,
hidden_size=hidden_size,
num_layers=num_layers, device=torch_device, bidirectional=bidirectional) num_layers=num_layers, device=torch_device, bidirectional=bidirectional)
self.fc = nn.Linear(hidden_size, output_size, device=torch_device) self.fc = nn.Linear(self.lstm_input_size, output_size, device=torch_device)
def forward(self, x, hidden, cell): def forward(self, x, hidden, cell, encoder_outputs):
x = x.reshape(1, -1) x = x.reshape(1, -1)
# shape x : (1, batch_size) # shape x : (1, batch_size)
embedding = self.embedding(x) embedding = self.embedding(x)
# embedding shape : (1, batch_size, embedding_size) # embedding shape : (1, batch_size, embedding_size)
dec_output, (hidden, cell) = self.rnn(embedding, (hidden, cell)) query = hidden.reshape(hidden.shape[1], -1)
context, attn_weights = self.attention(query, encoder_outputs)
contextualized_embedding = torch.cat((embedding, context), dim=2)
# contextualized_embedding shape : (1, batch_size, embedding_size + context_size)
dec_output, (hidden, cell) = self.rnn(contextualized_embedding, (hidden, cell))
# shape output : (1, batch_size, hidden_size) # shape output : (1, batch_size, hidden_size)
predictions = self.fc(dec_output) predictions = self.fc(dec_output)
# shape predictions : (1, batch_size, vocab_len) # shape predictions : (1, batch_size, vocab_len)
predictions = predictions.squeeze(0) predictions = predictions.squeeze(0)
return predictions, hidden, cell return predictions, hidden, cell, attn_weights
class Seq2Seq(nn.Module): class Seq2Seq(nn.Module):
...@@ -76,13 +109,16 @@ class Seq2Seq(nn.Module): ...@@ -76,13 +109,16 @@ class Seq2Seq(nn.Module):
dec_batch_size = source.shape[1] dec_batch_size = source.shape[1]
target_len = target.shape[0] target_len = target.shape[0]
outputs = torch.zeros(target_len, dec_batch_size, self.target_vocab_size, device=self.torch_device) attentions = []
outputs = torch.zeros(target_len, dec_batch_size, self.target_vocab_size,
device=self.torch_device)
hidden, cell = self.encoder(source) enc_out, hidden, cell = self.encoder(source)
x = target[0] x = target[0]
for t in range(1, target_len): for t in range(1, target_len):
output, hidden, cell = self.decoder(x, hidden, cell) output, hidden, cell, attention = self.decoder(x, hidden, cell, enc_out)
outputs[t] = output outputs[t] = output
...@@ -99,22 +135,22 @@ source_data_path = str(project_root / "data/tokenizer-data/news-commentary-v11.e ...@@ -99,22 +135,22 @@ source_data_path = str(project_root / "data/tokenizer-data/news-commentary-v11.e
target_data_path = str(project_root / "data/tokenizer-data/news-commentary-v11.de") target_data_path = str(project_root / "data/tokenizer-data/news-commentary-v11.de")
device = utils.pytorch.get_available_device() device = utils.pytorch.get_available_device()
date_time_now = datetime.now().strftime("%m%d%Y_%H%M") date_time_now = datetime.now().strftime("%m%d%Y_%H%M")
model_output_path = str(work_dir / f"./checkpoints/{date_time_now}_RNN_no_attention_unidirectional.pt") model_output_path = str(work_dir / f"./checkpoints/{date_time_now}_RNN_attention_bidirectional.pt")
# define hyperparameters # define hyperparameters
vocab_size = 10000 vocab_size = 10000
input_size_encoder = vocab_size input_size_encoder = vocab_size
input_size_decoder = vocab_size input_size_decoder = vocab_size
output_size_decoder = vocab_size output_size_decoder = vocab_size
encoder_embedding_size = 300 encoder_embedding_size = 150
decoder_embedding_size = 300 decoder_embedding_size = 150
model_hidden_size = 1024 model_hidden_size = 512
model_num_layers = 2 model_num_layers = 1
num_epochs = 50 num_epochs = 1
learning_rate = 0.001 learning_rate = 0.001
batch_size = 64 batch_size = 64
dataset_size = 100000 dataset_size = 1000
train_dev_val_split = (.8, .1, .1) train_dev_val_split = (.8, .1, .1)
train_batches_count = int(train_dev_val_split[0] * dataset_size // batch_size) train_batches_count = int(train_dev_val_split[0] * dataset_size // batch_size)
...@@ -124,27 +160,31 @@ encoder_net = Encoder(input_size=input_size_encoder, ...@@ -124,27 +160,31 @@ encoder_net = Encoder(input_size=input_size_encoder,
embedding_size=encoder_embedding_size, embedding_size=encoder_embedding_size,
hidden_size=model_hidden_size, hidden_size=model_hidden_size,
num_layers=model_num_layers, num_layers=model_num_layers,
torch_device=device) torch_device=device,
bidirectional=True)
decoder_net = Decoder(input_size=input_size_decoder, decoder_net = Decoder(input_size=input_size_decoder,
embedding_size=decoder_embedding_size, embedding_size=decoder_embedding_size,
hidden_size=model_hidden_size, hidden_size=model_hidden_size,
num_layers=model_num_layers, num_layers=model_num_layers,
output_size=output_size_decoder, output_size=output_size_decoder,
torch_device=device) torch_device=device,
bidirectional=True)
model = Seq2Seq(encoder=encoder_net, decoder=decoder_net, torch_device=device, target_vocab_size=vocab_size) model = Seq2Seq(encoder=encoder_net, decoder=decoder_net, torch_device=device,
target_vocab_size=vocab_size)
model.train() model.train()
# prepare training run # prepare training run
criterion = nn.CrossEntropyLoss() criterion = nn.CrossEntropyLoss(ignore_index=3)
optimizer = optim.Adam(model.parameters(), lr=learning_rate) optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# get training data # get training data
source_data, target_data = data.preprocessing.get_prepared_data(source_data_path=source_data_path, source_data, target_data = data.preprocessing.get_prepared_data(source_data_path=source_data_path,
target_data_path=target_data_path) target_data_path=target_data_path)
source_tokenizer, target_tokenizer = data.preprocessing.create_tokenizers(source_data_path=source_data_path, source_tokenizer, target_tokenizer = data.preprocessing.create_tokenizers(
source_data_path=source_data_path,
target_data_path=target_data_path, target_data_path=target_data_path,
vocab_size=vocab_size) vocab_size=vocab_size)
...@@ -166,9 +206,8 @@ try: ...@@ -166,9 +206,8 @@ try:
except FileExistsError: except FileExistsError:
pass pass
# train the model # train the model
utils.training.train(model=model, utils.operation.train(model=model,
data_loader=data_loader, data_loader=data_loader,
criterion=criterion, criterion=criterion,
optimizer=optimizer, optimizer=optimizer,
...@@ -176,3 +215,8 @@ utils.training.train(model=model, ...@@ -176,3 +215,8 @@ utils.training.train(model=model,
num_of_batches_per_epoch=train_batches_count, num_of_batches_per_epoch=train_batches_count,
saving_interval=500, saving_interval=500,
model_output_path=model_output_path) model_output_path=model_output_path)
_, _, val_data = data_loader()
...@@ -2,7 +2,8 @@ import torch ...@@ -2,7 +2,8 @@ import torch
import progressbar import progressbar
from utils.pytorch import print_model_parameters from utils.pytorch import print_model_parameters
from torchtext.data.metrics import bleu_score
from tokenizers import Tokenizer
def train(model, data_loader, num_of_batches_per_epoch: int, def train(model, data_loader, num_of_batches_per_epoch: int,
criterion: torch.nn.modules.loss, optimizer: torch.optim, criterion: torch.nn.modules.loss, optimizer: torch.optim,
...@@ -31,15 +32,20 @@ def train(model, data_loader, num_of_batches_per_epoch: int, ...@@ -31,15 +32,20 @@ def train(model, data_loader, num_of_batches_per_epoch: int,
print('----- Epoch {}/{} -----'.format(epoch + 1, num_epochs)) print('----- Epoch {}/{} -----'.format(epoch + 1, num_epochs))
# get data generators # get data generators
train_loader, _, val_loader = data_loader() train_loader, dev_loader, _ = data_loader()
# reset progress bar value # reset progress bar value
progress = 0 progress = 0
with progressbar.ProgressBar(max_value=num_of_batches_per_epoch) as bar: # progressbar widgets
widgets = [progressbar.Counter(format='%(value)3d/%(max_value)d '),
progressbar.PercentageLabelBar(), ' ',
progressbar.Timer(), ' ',
progressbar.Variable('loss', width=2, precision=4)]
with progressbar.ProgressBar(widgets=widgets, max_value=num_of_batches_per_epoch) as bar:
for batch_idx, (x_train, y_train) in enumerate(train_loader): for batch_idx, (x_train, y_train) in enumerate(train_loader):
optimizer.zero_grad() optimizer.zero_grad()
predict = model(x_train, y_train) predict = model(x_train, y_train)
predict = predict[1:].reshape(-1, predict.shape[2]) predict = predict[1:].reshape(-1, predict.shape[2])
y_train = y_train[1:].reshape(-1) y_train = y_train[1:].reshape(-1)
...@@ -57,7 +63,7 @@ def train(model, data_loader, num_of_batches_per_epoch: int, ...@@ -57,7 +63,7 @@ def train(model, data_loader, num_of_batches_per_epoch: int,
torch.save(model.state_dict(), model_output_path) torch.save(model.state_dict(), model_output_path)
# update the progress bar (and the counter) # update the progress bar (and the counter)
bar.update(progress) bar.update(progress, loss=round(loss.item(), 3))
progress += 1 progress += 1
save_counter += 1 save_counter += 1
...@@ -65,16 +71,50 @@ def train(model, data_loader, num_of_batches_per_epoch: int, ...@@ -65,16 +71,50 @@ def train(model, data_loader, num_of_batches_per_epoch: int,
loss_value = 0 loss_value = 0
val_batch_count = 0 val_batch_count = 0
for batch_idx, (x_val, y_val) in enumerate(val_loader): for batch_idx, (x_dev, y_dev) in enumerate(dev_loader):
with torch.no_grad(): with torch.no_grad():
predict = model(x_val, y_val) predict = model(x_dev, y_dev)
predict = predict[1:].reshape(-1, predict.shape[2]) predict = predict[1:].reshape(-1, predict.shape[2])
y_val = y_val[1:].reshape(-1) y_dev = y_dev[1:].reshape(-1)
loss = criterion(predict, y_val) loss = criterion(predict, y_dev)
loss_value += loss.item() loss_value += loss.item()
val_batch_count += 1 val_batch_count += 1
print("loss : " + str(loss_value / val_batch_count)) print("loss : " + str(loss_value / val_batch_count))
# final model save
torch.save(model.state_dict(), str(model_output_path + "h"))
def evaluate(model, val_data_loader,
             source_tokenizer: Tokenizer,
             target_tokenizer: Tokenizer,
             print_num_examples: int = 0):
    """Greedy-decode the validation set, report corpus BLEU, and optionally
    print a few random (source, reference, candidate) example triples.

    model: seq2seq module called as model(x, y); returns (seq, batch, vocab)
           logits (teacher-forced decode — assumes y is fed as decoder input).
    val_data_loader: iterable of (x_val, y_val) batches shaped (seq, batch).
    source_tokenizer / target_tokenizer: tokenizers whose .decode maps a
           list of ids back to a sentence string.
    print_num_examples: how many random example triples to print (0 = none).
    """
    model.eval()
    source_corpus = []
    candidate_corpus = []
    reference_corpus = []
    # Inference only: disable autograd so no graph is built per batch.
    with torch.no_grad():
        for x_val, y_val in val_data_loader:
            for reference in torch.transpose(y_val, 0, 1):
                reference_corpus.append(target_tokenizer.decode(list(reference)))
            for source in torch.transpose(x_val, 0, 1):
                source_corpus.append(source_tokenizer.decode(list(source)))
            predictions = model(x_val, y_val)
            # softmax is monotonic, so argmax over the raw logits gives the
            # same ids — no need to normalize first.
            result_ids = predictions.argmax(dim=2)
            for sentence in torch.transpose(result_ids, 0, 1):
                candidate_corpus.append(target_tokenizer.decode(list(sentence)))
    if not candidate_corpus:
        # Empty loader: nothing to score, and torch.randint(high=0) raises.
        print("BLEU score result: no validation examples")
        return
    print_num_examples = min(len(candidate_corpus), print_num_examples)
    for idx in torch.randint(low=0, high=len(candidate_corpus),
                             size=(print_num_examples,)):
        print(f"> {source_corpus[idx]}")
        print(f"= {reference_corpus[idx]}")
        print(f"< {candidate_corpus[idx]}")
    # torchtext's bleu_score expects tokenized candidates and, for each
    # candidate, a *list* of tokenized references — not raw strings.
    bleu = bleu_score([candidate.split() for candidate in candidate_corpus],
                      [[reference.split()] for reference in reference_corpus])
    print(f"BLEU score result: {bleu}")
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment