Marvin Schaefers / nlp-machine-translation-project

Commit f6b60e06
Authored Jan 14, 2024 by marvnsch

Add copy cat notebook
Parent: 17adcf9e
1 changed file: copycat.ipynb (new file, mode 100644), 383 additions, 0 deletions
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "initial_id",
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import random\n",
"\n",
"from tokenizers import Tokenizer\n",
"from tokenizers.models import BPE\n",
"from tokenizers.trainers import BpeTrainer\n",
"from tokenizers.pre_tokenizers import Whitespace\n",
"from tokenizers.processors import TemplateProcessing"
]
},
{
"cell_type": "markdown",
"source": [
"# Data Preparation"
],
"metadata": {
"collapsed": false
},
"id": "2b9477923b668978"
},
{
"cell_type": "code",
"outputs": [],
"source": [
"def load_data() -> tuple[list[str], list[str]]:\n",
" with open(\"data/training-data/eup/europarl-v7.de-en.de\", \"r\", encoding=\"utf8\") as f:\n",
" data_de = [line.rstrip(\"\\n\") for line in f]\n",
" with open(\"data/training-data/eup/europarl-v7.de-en.en\", \"r\", encoding=\"utf8\") as f:\n",
" data_en = [line.rstrip(\"\\n\") for line in f]\n",
"\n",
" ltd = set() # save lines to delete later\n",
"\n",
" for i in range(max(len(data_de), len(data_en))):\n",
" # Move sentence to next line if line is empty other file\n",
" if data_de[i] == \"\":\n",
" data_en[i+1] = data_en[i] + \" \" + data_en[i+1]\n",
" ltd.add(i)\n",
" if data_en[i] == \"\":\n",
" data_de[i+1] = data_de[i] + \" \" + data_de[i+1]\n",
" ltd.add(i)\n",
"\n",
" # Remove lines, where difference in words is > 40%\n",
" if abs(count_words(data_de[i]) - count_words(data_en[i])) / (max(count_words(data_de[i]), count_words(data_en[i])) + 1) > 0.4:\n",
" ltd.add(i)\n",
"\n",
" # Remove lines < 3 words or > 10 words\n",
" if max(count_words(data_de[i]), count_words(data_en[i])) < 3 or max(count_words(data_de[i]), count_words(data_en[i])) > 10:\n",
" ltd.add(i)\n",
"\n",
" temp_de = [l for i, l in enumerate(data_de) if i not in ltd]\n",
" data_de = temp_de\n",
" temp_en = [l for i, l in enumerate(data_en) if i not in ltd]\n",
" data_en = temp_en\n",
" print(len(data_de),len(data_en))\n",
"\n",
" # Print 3 random sentence pairs\n",
" ix = torch.randint(low=0, high=max(len(data_de), len(data_en)), size=(3, ))\n",
" for i in ix:\n",
" print(f\"Zeile: {i}\\nDeutsch: {data_de[i]}\\nEnglish: {data_en[i]}\\n\")\n",
"\n",
" print(f\"\\nNumber of lines: {len(data_de), len(data_en)}\")\n",
"\n",
" return data_de, data_en\n",
"\n",
"def count_words(string: str) -> int:\n",
" return len(string.split())\n",
"\n",
"\n",
"de, en = load_data()\n",
"\n",
"# setting the unknown token (e.g. for emojis)\n",
"tokenizer_en = Tokenizer(BPE(unk_token=\"[UNK]\"))\n",
"tokenizer_de = Tokenizer(BPE(unk_token=\"[UNK]\"))\n",
"\n",
"# adding special tokens\n",
"# [UNK] : unknown word/token\n",
"# [CLS] : starting token (new sentence sequence)\n",
"# [SEP] : separator for chaining multiple sentences\n",
"# [PAD] : padding needed for encoder input\n",
"trainer = BpeTrainer(vocab_size=10000, \n",
" special_tokens=[\"[UNK]\", \"[SOS]\", \"[EOS]\", \"[PAD]\"])\n",
"\n",
"tokenizer_en.pre_tokenizer = Whitespace()\n",
"tokenizer_de.pre_tokenizer = Whitespace()\n",
"\n",
"tokenizer_en.train([\"data/training-data/eup/europarl-v7.de-en.en\"], trainer)\n",
"tokenizer_de.train([\"data/training-data/eup/europarl-v7.de-en.de\"], trainer)\n",
"\n",
"# configure post processing\n",
"tokenizer_en.post_processor = TemplateProcessing(\n",
" single=\"[SOS] $A [EOS]\",\n",
" special_tokens=[\n",
" (\"[SOS]\", tokenizer_en.token_to_id(\"[SOS]\")),\n",
" (\"[EOS]\", tokenizer_en.token_to_id(\"[EOS]\")),\n",
" ],\n",
")\n",
"tokenizer_de.post_processor = TemplateProcessing(\n",
" single=\"[SOS] $A [EOS]\",\n",
" special_tokens=[\n",
" (\"[SOS]\", tokenizer_de.token_to_id(\"[SOS]\")),\n",
" (\"[EOS]\", tokenizer_de.token_to_id(\"[EOS]\")),\n",
" ],\n",
")\n",
"\n",
"target_vocab_size = tokenizer_de.get_vocab_size()\n",
"source_vocab_size = tokenizer_en.get_vocab_size()"
],
"metadata": {
"collapsed": false
},
"id": "dbc5f26f27746098",
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"def training_data(source: list[str],\n",
" target: list[str],\n",
" batch_size: int = 64,\n",
" sort: bool = True) -> tuple[torch.tensor, torch.tensor]:\n",
" tokenizer_de.no_padding()\n",
" tokenizer_en.no_padding()\n",
" \n",
" # sort the training data if true\n",
" if sort:\n",
" temp = ([list(a) for a in zip(source, target)])\n",
" temp.sort(key=lambda s: len(s[0]) + len(s[1]))\n",
" source, target = list(zip(*temp))\n",
"\n",
" # select random sentences\n",
" for i in range(0, len(source) - batch_size, batch_size):\n",
" x_training_data = source[i:i + batch_size]\n",
" y_training_data = target[i:i + batch_size]\n",
"\n",
" # tokenize data\n",
" tokenizer_en.enable_padding(pad_id=3)\n",
" x_training_data = tokenizer_en.encode_batch(x_training_data)\n",
" tokenizer_de.enable_padding(pad_id=3)\n",
" y_training_data = tokenizer_de.encode_batch(y_training_data)\n",
"\n",
" # extract ids for every sequence\n",
" for j in range(batch_size):\n",
" x_training_data[j] = x_training_data[j].ids\n",
" y_training_data[j] = y_training_data[j].ids\n",
"\n",
" # put data into tensor\n",
" x_training_data = torch.tensor(x_training_data)\n",
" y_training_data = torch.tensor(y_training_data)\n",
" # transpose tensors to match input requirements for lstm\n",
" x_training_data = torch.transpose(x_training_data, 0, 1)\n",
" y_training_data = torch.transpose(y_training_data, 0, 1)\n",
" yield x_training_data, y_training_data"
],
"metadata": {
"collapsed": false
},
"id": "8edfacb67dc8c527",
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"# data test cell\n",
"print(len(de)/64)\n",
"\n",
"for idx, _ in enumerate(training_data(source=de, target=en, batch_size=64)):\n",
" print(idx)"
],
"metadata": {
"collapsed": false
},
"id": "524195fe40653308",
"execution_count": null
},
{
"cell_type": "markdown",
"source": [
"### Model Definition"
],
"metadata": {
"collapsed": false
},
"id": "ca6d3d436fd31e33"
},
{
"cell_type": "code",
"outputs": [],
"source": [
"# Prepare model\n",
"class Encoder(nn.Module):\n",
" def __init__(self, input_size: int, embedding_size: int, \n",
" hidden_size: int, num_layers: int, dropout_prob: float):\n",
" super(Encoder, self).__init__()\n",
" self.hidden_size = hidden_size\n",
" self.num_layers = num_layers\n",
" \n",
" self.dropout = nn.Dropout(dropout_prob)\n",
" self.embedding = nn.Embedding(input_size, embedding_size)\n",
" self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, \n",
" num_layers=num_layers, dropout=dropout_prob)\n",
" \n",
" def forward(self, x):\n",
" # shape x : (sequence_len, batch_size)\n",
" embedding = self.dropout(self.embedding(x))\n",
" # shape embedding : sequence_len, batch_size, embedding_size)\n",
" output, (hidden, cell) = self.rnn(embedding)\n",
" return hidden, cell\n",
" \n",
"class Decoder(nn.Module):\n",
" def __init__(self, input_size: int, embedding_size: int, \n",
" hidden_size: int, num_layers: int, output_size: int, \n",
" dropout_prob: float):\n",
" super(Decoder, self).__init__()\n",
" self.hidden_size = hidden_size\n",
" self.num_layers = num_layers\n",
" \n",
" self.dropout = nn.Dropout(dropout_prob)\n",
" self.embedding = nn.Embedding(input_size, embedding_size)\n",
" self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, \n",
" num_layers=num_layers, dropout=dropout_prob)\n",
" self.fc = nn.Linear(hidden_size, output_size)\n",
" \n",
" def forward(self, x, hidden, cell):\n",
" x = x.view(1, -1)\n",
" # shape x : (1, batch_size)\n",
" embedding = self.dropout(self.embedding(x))\n",
" # embedding shape : (1, batch_size, embedding_size)\n",
" output, (hidden, cell) = self.rnn(embedding, (hidden, cell))\n",
" # shape output : (1, batch_size, hidden_size)\n",
" predictions = self.fc(output)\n",
" # shape predictions : (1, batch_size, vocab_len)\n",
" predictions = predictions.squeeze(1)\n",
" \n",
" return predictions, hidden, cell\n",
" \n",
"class Seq2Seq(nn.Module):\n",
" def __init__(self, encoder: Encoder, decoder: Decoder):\n",
" super(Seq2Seq, self).__init__()\n",
" self.encoder = encoder\n",
" self.decoder = decoder\n",
" \n",
" def forward(self, source, target, teacher_forcing_ratio: float = 0.5):\n",
" batch_size = source.shape[1]\n",
" target_len = target.shape[0]\n",
" \n",
" outputs = torch.zeros(target_len, batch_size, target_vocab_size)\n",
" \n",
" hidden, cell = self.encoder(source)\n",
" \n",
" x = target[0]\n",
" \n",
" for t in range(1, target_len):\n",
" output, hidden, cell = self.decoder(x, hidden, cell)\n",
" \n",
" outputs[t] = output\n",
" \n",
" best_guess = output.argmax(2)\n",
" \n",
" x = target[t] if random.random() < teacher_forcing_ratio else best_guess\n",
" return outputs"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-14T22:07:32.406911Z",
"start_time": "2024-01-14T22:07:32.401944Z"
}
},
"id": "3b2c4dbc74a1f144",
"execution_count": 129
},
{
"cell_type": "markdown",
"source": [
"### Model Training"
],
"metadata": {
"collapsed": false
},
"id": "9854eaee8392caa1"
},
{
"cell_type": "code",
"outputs": [],
"source": [
"# training hyperparameters\n",
"num_epochs = 20\n",
"learning_rate = 0.001\n",
"batch_size = 64\n",
"\n",
"# model hyperparameters\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"input_size_encoder = source_vocab_size\n",
"input_size_decoder = target_vocab_size\n",
"output_size_decoder = target_vocab_size\n",
"encoder_embedding_size = 300\n",
"decoder_embedding_size = 300\n",
"hidden_size = 1024\n",
"num_layers = 2\n",
"encoder_dropout = 0.5\n",
"decoder_dropout = 0.5\n",
"\n",
"encoder_net = Encoder(input_size=input_size_encoder, \n",
" embedding_size=encoder_embedding_size, \n",
" hidden_size=hidden_size, \n",
" num_layers=num_layers,\n",
" dropout_prob=encoder_dropout)\n",
"\n",
"decoder_net = Decoder(input_size=input_size_decoder, \n",
" embedding_size=decoder_embedding_size, \n",
" hidden_size=hidden_size, \n",
" num_layers=num_layers,\n",
" dropout_prob=decoder_dropout,\n",
" output_size=output_size_decoder)\n",
"\n",
"model = Seq2Seq(encoder=encoder_net, decoder=decoder_net)\n",
"\n",
"criterion = nn.CrossEntropyLoss(ignore_index=3)\n",
"optimizer = optim.Adam(model.parameters(), lr=learning_rate)\n",
"\n",
"for epoch in range(num_epochs):\n",
" print('Epoch {}/{}'.format(epoch + 1, num_epochs))\n",
" \n",
" for batch_idx, (x_train, y_train) in enumerate(training_data(source=en, \n",
" target=en)):\n",
" output = model(x_train, y_train)\n",
" output = output[1:].reshape(-1, output.shape[2])\n",
" y_train = y_train[1:].reshape(-1)\n",
" \n",
" optimizer.zero_grad()\n",
" loss = criterion(output, y_train)\n",
" \n",
" loss.backward()\n",
" \n",
" torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)\n",
" print(batch_idx)\n",
" \n",
" print(\"loss: \" + str(loss.item()))\n"
],
"metadata": {
"collapsed": false
},
"id": "ee166d65b3b975d",
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
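
A note on the hard-coded padding id: `enable_padding(pad_id=3)` and `CrossEntropyLoss(ignore_index=3)` only work because `[PAD]` happens to be the fourth entry in the trainer's `special_tokens` list. Below is a minimal sketch (not part of the commit), assuming `tokenizer_en` and `tokenizer_de` have been trained as in the notebook above, that looks the ids up instead of hard-coding them:

```python
import torch.nn as nn

# Resolve the special-token ids from the trained tokenizers
# (assumes tokenizer_en and tokenizer_de from the notebook are in scope).
pad_id_en = tokenizer_en.token_to_id("[PAD]")
pad_id_de = tokenizer_de.token_to_id("[PAD]")

# Pad batches with the looked-up id; pad_token must match that id.
tokenizer_en.enable_padding(pad_id=pad_id_en, pad_token="[PAD]")
tokenizer_de.enable_padding(pad_id=pad_id_de, pad_token="[PAD]")

# Ignore padded target positions in the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_id_de)
```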
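The notebook covers training only. As an illustration of how the trained `Seq2Seq` model could be used at inference time, here is a greedy-decoding sketch; the `translate_sentence` helper and its `max_len` parameter are illustrative assumptions, not part of the commit, and the snippet reuses `model`, `tokenizer_en`, and `tokenizer_de` from the notebook:

```python
import torch

@torch.no_grad()
def translate_sentence(model, sentence: str, max_len: int = 50) -> str:
    """Greedy decoding with the encoder/decoder defined in the notebook."""
    model.eval()
    device = next(model.parameters()).device
    # Encode the English source; the post-processor adds [SOS] ... [EOS].
    src_ids = tokenizer_en.encode(sentence).ids
    src = torch.tensor(src_ids, device=device).unsqueeze(1)   # (src_len, 1)
    hidden, cell = model.encoder(src)

    sos_id = tokenizer_de.token_to_id("[SOS]")
    eos_id = tokenizer_de.token_to_id("[EOS]")
    out_ids = []
    x = torch.tensor([sos_id], device=device)                 # first decoder input
    for _ in range(max_len):
        # The decoder consumes one token per step and carries its own state.
        logits, hidden, cell = model.decoder(x, hidden, cell)
        next_id = int(logits.argmax(-1).item())
        if next_id == eos_id:
            break
        out_ids.append(next_id)
        x = torch.tensor([next_id], device=device)
    return tokenizer_de.decode(out_ids)

# Hypothetical usage:
# print(translate_sentence(model, "The debate is closed."))
```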