nlp-machine-translation-project
Commit e9945170, authored Jan 8, 2024 by marvnsch
Parent: 9e70f019
Changes: 1 changed file

exploration.ipynb (+65 additions, -75 deletions)
@@ -341,15 +341,15 @@
    "collapsed": false
   },
   "source": [
-   "## 3. Build the sequence2sequence RNN"
+   "## 3. Build the sequence2sequence LSTM"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 11,
+  "execution_count": 62,
   "outputs": [],
   "source": [
-   "embedding_dimension = 100\n",
+   "embedding_dimension = 500\n",
    "\n",
    "embedding_matrix_enc = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dimension)\n",
    "embedding_matrix_dec = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dimension)\n",
@@ -361,6 +361,9 @@
    "        self._hidden_size = hidden_size\n",
    "        self._num_layers = num_layers\n",
    "\n",
+   "        # embedding matrix\n",
+   "        self._embedding = embedding_matrix_enc\n",
+   "        \n",
    "        # lstm layer\n",
    "        self._lstm = torch.nn.LSTM(input_size=input_size,\n",
    "                                   hidden_size=hidden_size,\n",
@@ -368,16 +371,12 @@
    "                                   bidirectional=bidirectional,\n",
    "                                   batch_first=True)\n",
    "        \n",
-   "        self._dropout = torch.nn.Dropout(0.1)\n",
-   "        \n",
-   "        \n",
-   "        \n",
-   "    def forward(self, embedded_sequence: torch.Tensor):\n",
-   "        h_0 = torch.zeros(self._num_layers, embedded_sequence.size(0), self._hidden_size) #hidden state WITH batches\n",
-   "        c_0 = torch.zeros(self._num_layers, embedded_sequence.size(0), self._hidden_size) #internal state WITH batches\n",
+   "    def forward(self, sequence: torch.Tensor):\n",
+   "        embedded_sequence = self._embedding(sequence)\n",
+   "        h_0 = torch.zeros(self._num_layers, embedded_sequence.size(0), self._hidden_size)\n",
+   "        c_0 = torch.zeros(self._num_layers, embedded_sequence.size(0), self._hidden_size)\n",
    "        \n",
    "        output, (hn, cn) = self._lstm(embedded_sequence, (h_0, c_0))\n",
    "        \n",
    "        return output, hn, cn\n",
    "        \n",
    "\n",
@@ -404,10 +403,9 @@
    "        # output layer (fully connected linear layer)\n",
    "        self._out = nn.Linear(hidden_size, output_size)\n",
    "        \n",
-   "    def forward(self, x):\n",
-   "        batch_size = x[0].size(0)\n",
-   "        hidden_state = x[1]\n",
-   "        cell_state = x[2]\n",
+   "    def forward(self, enc_out: torch.tensor, hidden_state: torch.tensor,\n",
+   "                cell_state: torch.tensor, target_tensor: torch.tensor = None):\n",
+   "        batch_size = enc_out.size(0)\n",
    "        outputs = []\n",
    "        \n",
    "        # prepare start token\n",
@@ -417,12 +415,16 @@
    "            out, hidden_state, cell_state = self.forward_step(x_in, hidden_state, cell_state)\n",
    "            outputs.append(out)\n",
    "\n",
-   "            # Without teacher forcing: use its own predictions as the next input\n",
-   "            _, topi = out.topk(1)\n",
-   "            x_in = topi.squeeze(-1).detach()  # detach from history as input\n",
+   "            if target_tensor is not None:\n",
+   "                # Teacher forcing: Feed the target as the next input\n",
+   "                x_in = target_tensor[:, i].unsqueeze(1)  # Teacher forcing\n",
+   "            else:\n",
+   "                # Without teacher forcing: use its own predictions as the next input\n",
+   "                _, topi = out.topk(1)\n",
+   "                x_in = topi.squeeze(-1).detach()  # detach from history as input\n",
    "\n",
-   "        outputs = torch.cat(outputs, dim=1)  # WTF is happening here!? -> TODO: Understand the code\n",
-   "        outputs = F.log_softmax(outputs, dim=-1)\n",
+   "        outputs = torch.cat(outputs, dim=1)\n",
+   "        # outputs = F.log_softmax(outputs, dim=-1)\n",
    "        return outputs, hidden_state, cell_state\n",
    "        \n",
    "    def forward_step(self, x_in, hidden_state, cell_state):\n",
@@ -435,8 +437,8 @@
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
-    "end_time": "2024-01-06T16:15:23.365173Z",
-    "start_time": "2024-01-06T16:15:23.251387Z"
+    "end_time": "2024-01-08T17:04:59.155420Z",
+    "start_time": "2024-01-08T17:04:58.607834Z"
    }
   },
   "id": "e8d99510479108f4"
@@ -453,80 +455,58 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 17,
+  "execution_count": 63,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "---- Iteration 10 ----\n",
-     "loss: 9.291775703430176\n",
+     "loss: -0.045227549970149994\n",
      "---- Iteration 20 ----\n",
-     "loss: 6.553857803344727\n",
+     "loss: -0.1404053419828415\n",
      "---- Iteration 30 ----\n",
-     "loss: 4.213151454925537\n",
-     "---- Iteration 40 ----\n",
-     "loss: 3.1044561862945557\n",
-     "---- Iteration 50 ----\n",
-     "loss: 3.47859263420105\n",
-     "---- Iteration 60 ----\n",
-     "loss: 3.166140079498291\n",
-     "---- Iteration 70 ----\n",
-     "loss: 3.3509914875030518\n",
-     "---- Iteration 80 ----\n",
-     "loss: 2.626647710800171\n",
-     "---- Iteration 90 ----\n",
-     "loss: 3.137316942214966\n",
-     "---- Iteration 100 ----\n",
-     "loss: 3.088139295578003\n",
-     "---- Iteration 110 ----\n",
-     "loss: 2.9085235595703125\n",
-     "---- Iteration 120 ----\n",
-     "loss: 2.8475253582000732\n"
+     "loss: -0.22252100706100464\n"
     ]
-   },
-   {
-    "ename": "ValueError",
-    "evalue": "expected sequence of length 50 at dim 1 (got 62)",
-    "output_type": "error",
-    "traceback": [
-     "---------------------------------------------------------------------------",
-     "ValueError                                Traceback (most recent call last)",
-     "Cell In[17], line 20\n     17 optimizer.zero_grad()\n     19 # make prediction\n---> 20 x_train, y_train = training_data(batch_size=32, max_tokens=max_tokens_per_sequence)\n     21 predict = model(x_train)[0]\n     23 # match dimensions of prediction & gold_label vector\n",
-     "Cell In[8], line 28, in training_data(batch_size, max_tokens)\n     24 x_training_data = torch.tensor(x_training_data)\n     26 # 'tensorfy' & one hot encode y data\n     27 #y_training_data = F.one_hot(torch.tensor(y_training_data), num_classes=vocab_size)\n---> 28 y_training_data = torch.tensor(y_training_data)\n     29 return x_training_data, y_training_data\n",
-     "ValueError: expected sequence of length 50 at dim 1 (got 62)"
-    ]
-   }
+   }
   ],
   "source": [
-   "LSTM_hidden_size = 128\n",
+   "LSTM_hidden_size = 500\n",
    "max_tokens_per_sequence = 70\n",
    "\n",
    "model = nn.Sequential(\n",
    "    embedding_matrix_enc,\n",
    "    Encoder(input_size=embedding_dimension, hidden_size=LSTM_hidden_size),\n",
    "    Decoder(input_size=embedding_dimension, hidden_size=LSTM_hidden_size,\n",
    "            output_size=vocab_size, max_tokens=max_tokens_per_sequence)\n",
    ")\n",
    "\n",
-   "num_epochs = 1000\n",
-   "optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)\n",
-   "loss_function = torch.nn.NLLLoss()\n",
+   "num_epochs = 30\n",
+   "optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)\n",
+   "criterion = torch.nn.CrossEntropyLoss()\n",
    "\n",
+   "# create encoder / decoder instances\n",
+   "encoder = Encoder(input_size=embedding_dimension, hidden_size=LSTM_hidden_size)\n",
+   "decoder = Decoder(input_size=embedding_dimension, hidden_size=LSTM_hidden_size,\n",
+   "                  output_size=vocab_size, max_tokens=max_tokens_per_sequence)\n",
+   "\n",
    "for i in range(1, num_epochs + 1):\n",
    "    # reset gradients\n",
    "    optimizer.zero_grad()\n",
    "\n",
-   "    # make prediction\n",
+   "    # get training data\n",
    "    x_train, y_train = training_data(batch_size=32, max_tokens=max_tokens_per_sequence)\n",
-   "    predict = model(x_train)[0]\n",
    "\n",
+   "    # make prediction\n",
+   "    encoder_out, encoder_h, encoder_c = encoder(x_train)\n",
+   "    predict = decoder(encoder_out, encoder_h, encoder_c, y_train)\n",
+   "    predict = predict[0]\n",
+   "\n",
    "    # match dimensions of prediction & gold_label vector\n",
    "    predict = predict.view(-1, predict.size(-1))\n",
    "    y_train = y_train.view(-1)\n",
    "\n",
    "    # calculate loss & propagate it backwards\n",
-   "    loss = loss_function(predict, y_train)\n",
+   "    loss = criterion(predict, y_train)\n",
    "    loss.backward()\n",
    "\n",
    "    optimizer.step()\n",
@@ -537,8 +517,8 @@
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
-    "end_time": "2024-01-06T16:19:42.036848Z",
-    "start_time": "2024-01-06T16:18:33.291526Z"
+    "end_time": "2024-01-08T17:06:02.509807Z",
+    "start_time": "2024-01-08T17:05:01.822284Z"
    }
   },
   "id": "1f8d3152359f6658"
@@ -555,43 +535,53 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 16,
+  "execution_count": 61,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "[1, 6264, 432, 352, 2398, 16, 886, 474, 49994, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]\n"
+     "['[CLS]', 'H', 'i', 'are', 'you', 'there', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']\n",
+     "tensor([    1,     1,     1, 32460, 32460, 28617,  7251, 47089, 38628, 38628,\n",
+     "        38628, 38495, 31040, 14290,  4593, 41094, 13045, 17103, 45127, 18564,\n",
+     "         5320,  5320,  5320,  5320,  5320,  5320,  5320,  7784,  7784, 34640,\n",
+     "         5320,  5320,  5320,  5320,  5320,  5320,  7784,  7784, 34640,  5320,\n",
+     "         5320,  5320,  5320,  5320,  5320,  7784,  7784, 34640,  5320,  5320,\n",
+     "         5320,  5320,  5320,  5320,  7784,  7784, 34640,  5320,  5320,  5320,\n",
+     "         5320,  5320,  5320,  7784,  7784, 34640,  5320,  5320,  5320,  5320])\n"
     ]
    },
    {
     "data": {
-     "text/plain": "'Ich ist der der der'"
+     "text/plain": "'Quantität Quantität drastischen committees Vermächtnis jana jana jana Aufrichtigkeit Fusionen ströme More Tellereisen Mod verschoben emaking auszubauen unch unch unch unch unch unch unch popul popul Bin unch unch unch unch unch unch popul popul Bin unch unch unch unch unch unch popul popul Bin unch unch unch unch unch unch popul popul Bin unch unch unch unch unch unch popul popul Bin unch unch unch unch'"
     },
-    "execution_count": 16,
+    "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-   "test_sequence = (\"Ist dies der Weg, oder nicht?.\")\n",
+   "test_sequence = (\"Hi are you there\")\n",
    "\n",
    "test_sequence_enc = tokenizer.encode(test_sequence)\n",
    "\n",
-   "print(test_sequence_enc.ids)\n",
+   "print(test_sequence_enc.tokens)\n",
    "\n",
    "test_sequence_batched = torch.tensor(test_sequence_enc.ids).view(1, -1)\n",
    "\n",
-   "predict, _, _ = model(test_sequence_batched)\n",
+   "encoder_out, encoder_h, encoder_c = encoder(test_sequence_batched)\n",
+   "predict, _, _ = decoder(encoder_out, encoder_h, encoder_c)\n",
    "\n",
    "_, topi = predict.topk(1)\n",
    "decoded_ids = topi.squeeze()\n",
    "print(decoded_ids)\n",
    "tokenizer.decode(list(decoded_ids))"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
-    "end_time": "2024-01-06T16:17:11.259876Z",
-    "start_time": "2024-01-06T16:17:11.211111Z"
+    "end_time": "2024-01-08T17:04:27.406472Z",
+    "start_time": "2024-01-08T17:04:27.078634Z"
    }
   },
   "id": "b95fb365f686125d"
%% Cell type:code id:initial_id tags:
``` python
import torch
import torch.nn as nn
import torch.nn.functional as F
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib import Path
```
%% Cell type:markdown id:d8d7f32150682efd tags:
## 0. Prepare the data
%% Cell type:code id:f7c39c06ce3a14db tags:
``` python
def load_data() -> list[str]:
    with open("data/training-data/eup/europarl-v7.de-en.de", "r", encoding="utf8") as f:
        data_de = [line.rstrip("\n") for line in f]
    with open("data/training-data/eup/europarl-v7.de-en.en", "r", encoding="utf8") as f:
        data_en = [line.rstrip("\n") for line in f]

    ltd = set()  # save lines to delete later

    for i in range(max(len(data_de), len(data_en))):
        # If a line is empty on one side, merge the other side's sentence into the
        # following line (line i is dropped later)
        if data_de[i] == "":
            data_en[i + 1] = data_en[i] + " " + data_en[i + 1]
            ltd.add(i)
        if data_en[i] == "":
            data_de[i + 1] = data_de[i] + " " + data_de[i + 1]
            ltd.add(i)

        # Remove lines where the difference in word count is > 40%
        if abs(count_words(data_de[i]) - count_words(data_en[i])) / (max(count_words(data_de[i]), count_words(data_en[i])) + 1) > 0.4:
            ltd.add(i)

        # Remove lines with < 3 words or > 25 words
        if max(count_words(data_de[i]), count_words(data_en[i])) < 3 or max(count_words(data_de[i]), count_words(data_en[i])) > 25:
            ltd.add(i)

    temp_de = [l for i, l in enumerate(data_de) if i not in ltd]
    data_de = temp_de
    temp_en = [l for i, l in enumerate(data_en) if i not in ltd]
    data_en = temp_en

    print(len(data_de), len(data_en))

    # Print 3 random sentence pairs
    ix = torch.randint(low=0, high=max(len(data_de), len(data_en)), size=(3,))
    for i in ix:
        print(f"Zeile: {i}\nDeutsch: {data_de[i]}\nEnglish: {data_en[i]}\n")

    print(f"\nNumber of lines: {len(data_de), len(data_en)}")

    return data_de, data_en


def count_words(string: str) -> int:
    return len(string.split())


source, target = load_data()
```
%% Output
1046809 1046809
Zeile: 993209
Deutsch: Aber es muß auch darum gehen, Anreize zu schaffen für einen umweltfreundlichen lokalen öffentlichen Nahverkehr.
English: But it is also necessary to create incentives for environmentally friendly local public transport.
Zeile: 459853
Deutsch: Vielleicht sollte er dramatisch verlangsamt werden?
English: Perhaps it should be slowed down dramatically?
Zeile: 605086
Deutsch: Die Prämien haben im Übrigen durchaus positive grenzüberschreitende Wirkungen.
English: The incentives have also had a positive cross-border impact.
Number of lines: (1046809, 1046809)
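The relative-length filter above is easiest to see on a concrete pair. Below is a small worked example of the >40% word-count criterion; the sentences are made up for illustration and are not taken from the corpus.

``` python
# Hypothetical pair to illustrate the relative length filter in load_data()
de = "Das Parlament hat den Vorschlag gestern abgelehnt"   # 7 words
en = "Parliament rejected the proposal yesterday"          # 5 words

ratio = abs(count_words(de) - count_words(en)) / (max(count_words(de), count_words(en)) + 1)
print(ratio)         # |7 - 5| / (7 + 1) = 0.25
print(ratio > 0.4)   # False -> the pair is kept

# A 20-word sentence paired with an 8-word one gives 12 / 21 ≈ 0.57 > 0.4 and is dropped.
```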
%% Cell type:markdown id:f2beddcc4122495a tags:
## 1. Text tokenization
%% Cell type:code id:d8ccbafa97fba573 tags:
``` python
# set up the tokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.processors import TemplateProcessing

# setting the unknown token (e.g. for emojis)
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# adding special tokens
# [UNK]  : unknown word/token
# [CLS]  : starting token (new sentence sequence)
# [SEP]  : separator for chaining multiple sentences
# [PAD]  : padding needed for encoder input
# [MASK] : placeholder for masked-out tokens (used for masked language modelling, not needed here)
trainer = BpeTrainer(vocab_size=50000,
                     special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# set up the pre-tokenizer -> ensures that no token spans more than one word
from tokenizers.pre_tokenizers import Whitespace

tokenizer.pre_tokenizer = Whitespace()
```
%% Cell type:code id:55cbac65a50a0199 tags:
``` python
tokenizer.train(["data/training-data/eup/europarl-v7.de-en.de",
                 "data/training-data/eup/europarl-v7.de-en.en"], trainer)

# configure post processing
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

vocab_size = tokenizer.get_vocab_size()
```
%% Output
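As a sanity check, the post-processing template configured above should wrap every encoded sentence in [CLS] ... [SEP], and the trainer's special-token order fixes the ids ([UNK]=0, [CLS]=1, [SEP]=2, [PAD]=3), which matches the id patterns visible in the outputs further down. A minimal sketch, assuming the cells above have been run; the sentence is made up:

``` python
# Hypothetical check of the [CLS] ... [SEP] template on a made-up sentence
enc = tokenizer.encode("Das ist ein kurzer Testsatz.")
print(enc.tokens)   # starts with '[CLS]' and ends with '[SEP]' because of the TemplateProcessing
print(enc.ids)      # the matching vocabulary ids; 1 and 2 are [CLS] and [SEP]
```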
%% Cell type:markdown id:9c0f853775a802ec tags:
## 2. Prepare the training data
%% Cell type:code id:2e4dc87ce98b6cdd tags:
``` python
# Prepare training batch
def training_data(batch_size: int = 10, max_tokens: int = 50) -> tuple[torch.tensor, torch.tensor]:
    x_training_data = []
    y_training_data = []

    # select random sentences
    batch_indices = torch.randint(0, len(source), (batch_size,))
    for idx in batch_indices:
        x_training_data.append(target[idx])
        y_training_data.append(source[idx])

    # tokenize data
    tokenizer.enable_padding(pad_id=3)
    x_training_data = tokenizer.encode_batch(x_training_data)
    tokenizer.enable_padding(pad_id=3, length=max_tokens)
    y_training_data = tokenizer.encode_batch(y_training_data)

    # extract ids for every sequence
    for i in range(len(batch_indices)):
        x_training_data[i] = x_training_data[i].ids
        y_training_data[i] = y_training_data[i].ids

    # 'tensorfy' x data
    x_training_data = torch.tensor(x_training_data)

    # 'tensorfy' & one hot encode y data
    #y_training_data = F.one_hot(torch.tensor(y_training_data), num_classes=vocab_size)
    y_training_data = torch.tensor(y_training_data)
    return x_training_data, y_training_data


print(training_data())
```
%% Output
(tensor([[ 1, 556, 9472, 344, 386, 346, 984, 362, 472, 18,
2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3],
[ 1, 502, 565, 8649, 9315, 401, 346, 1625, 1566, 16999,
2138, 401, 10036, 346, 72, 17, 14392, 1849, 363, 15202,
17748, 335, 344, 18, 2, 3, 3],
[ 1, 721, 342, 963, 335, 459, 1522, 360, 3311, 363,
1008, 13528, 25339, 18, 2, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3],
[ 1, 2657, 440, 4595, 3001, 20519, 362, 346, 3019, 1561,
39484, 12799, 335, 3897, 17, 13314, 472, 18, 2, 3,
3, 3, 3, 3, 3, 3, 3],
[ 1, 10169, 525, 359, 632, 360, 15202, 68, 4380, 375,
37324, 477, 19158, 936, 914, 342, 13922, 360, 359, 346,
1445, 1403, 335, 2624, 18, 2, 3],
[ 1, 14207, 16, 346, 4097, 362, 346, 5604, 4193, 10550,
338, 6024, 6130, 1009, 363, 341, 22069, 4488, 29208, 18,
2, 3, 3, 3, 3, 3, 3],
[ 1, 721, 546, 2619, 1048, 401, 21666, 7450, 360, 359,
6691, 360, 4356, 4897, 8267, 18, 2, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3],
[ 1, 721, 342, 470, 3184, 401, 990, 1478, 668, 359,
21775, 9083, 37514, 362, 346, 8748, 401, 882, 7700, 18,
2, 3, 3, 3, 3, 3, 3],
[ 1, 721, 342, 4702, 4371, 401, 380, 486, 6034, 7872,
363, 401, 380, 659, 14235, 2583, 335, 1141, 360, 1258,
1313, 882, 486, 39184, 682, 18, 2],
[ 1, 19047, 16, 941, 987, 16, 882, 440, 360, 359,
764, 1251, 401, 5140, 2, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3]]), tensor([[ 1, 596, 578, 339, 435, 367, 956, 2403, 7978, 18,
2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 582, 619, 415, 12866, 2317, 5209, 16, 475, 352,
8180, 3296, 4473, 8201, 16, 367, 367, 1755, 510, 372,
12175, 24927, 10069, 369, 3211, 9613, 17730, 3142, 18, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 467, 848, 2154, 4491, 352, 2016, 369, 367, 3126,
442, 6647, 8666, 23685, 448, 2812, 2171, 18, 2, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 720, 586, 7415, 17167, 352, 1744, 16, 549, 618,
628, 381, 729, 17335, 1014, 4289, 3610, 852, 12465, 3018,
18, 2, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 720, 570, 1359, 636, 6085, 27733, 1771, 784, 471,
11607, 510, 367, 28935, 406, 9854, 1962, 335, 2624, 20901,
834, 18, 2, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 5097, 6837, 367, 23413, 10643, 501, 442, 352, 7203,
18377, 369, 442, 35812, 3579, 12270, 34707, 396, 18, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 802, 415, 1869, 5362, 16, 655, 352, 7618, 729,
549, 504, 474, 335, 352, 2014, 415, 16, 13502, 3055,
19139, 18, 2, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 802, 786, 474, 7207, 16, 475, 1208, 2829, 17233,
358, 22631, 500, 18, 2, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 802, 415, 6441, 12248, 16, 475, 416, 448, 7130,
9857, 4790, 369, 475, 416, 5613, 2122, 5719, 834, 16,
427, 383, 1639, 16, 25574, 628, 12850, 18, 2, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
[ 1, 2475, 642, 339, 981, 3171, 916, 824, 500, 5140,
2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3]]))
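One detail worth noting in training_data: the target side is padded to a fixed length (length=max_tokens) before torch.tensor is called, because torch.tensor cannot build a tensor from lists of unequal length. The ValueError recorded in the old output of this commit's diff ("expected sequence of length 50 at dim 1 (got 62)") came from exactly this step, when the encoded target sequences in a batch did not all end up with the same length; note that enable_padding pads shorter sequences up to the given length but does not truncate longer ones. A toy illustration with made-up id lists:

``` python
# Toy illustration: lists of unequal length cannot be 'tensorfied' directly
ragged = [[1, 5, 2], [1, 7, 8, 9, 2]]
try:
    torch.tensor(ragged)
except ValueError as e:
    print(e)   # expected sequence of length 3 at dim 1 (got 5)

# Padding to a common length (pad id 3) fixes it
padded = [[1, 5, 2, 3, 3], [1, 7, 8, 9, 2]]
print(torch.tensor(padded).shape)   # torch.Size([2, 5])
```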
%% Cell type:markdown id:689e2e565cce2845 tags:
## 3. Build the sequence2sequence LSTM
%% Cell type:code id:e8d99510479108f4 tags:
``` python
embedding_dimension = 500

embedding_matrix_enc = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dimension)
embedding_matrix_dec = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dimension)


class Encoder(torch.nn.Module):
    def __init__(self, input_size: int, hidden_size: int, num_layers: int = 1,
                 bidirectional: bool = False):
        super(Encoder, self).__init__()
        self._hidden_size = hidden_size
        self._num_layers = num_layers

        # embedding matrix
        self._embedding = embedding_matrix_enc

        # lstm layer
        self._lstm = torch.nn.LSTM(input_size=input_size,
                                   hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   bidirectional=bidirectional,
                                   batch_first=True)

    def forward(self, sequence: torch.Tensor):
        embedded_sequence = self._embedding(sequence)
        h_0 = torch.zeros(self._num_layers, embedded_sequence.size(0), self._hidden_size)
        c_0 = torch.zeros(self._num_layers, embedded_sequence.size(0), self._hidden_size)

        output, (hn, cn) = self._lstm(embedded_sequence, (h_0, c_0))

        return output, hn, cn


class Decoder(torch.nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_size: int, num_layers: int = 1,
                 bidirectional: bool = False, max_tokens: int = 40):
        super(Decoder, self).__init__()
        self._hidden_size = hidden_size
        self._num_layers = num_layers
        self._max_tokens = max_tokens

        # embedding matrix
        self._embedding = embedding_matrix_dec

        # lstm layer
        self._lstm = torch.nn.LSTM(input_size=input_size,
                                   hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   bidirectional=bidirectional,
                                   batch_first=True)

        # output layer (fully connected linear layer)
        self._out = nn.Linear(hidden_size, output_size)

    def forward(self, enc_out: torch.tensor, hidden_state: torch.tensor,
                cell_state: torch.tensor, target_tensor: torch.tensor = None):
        batch_size = enc_out.size(0)
        outputs = []

        # prepare start token
        x_in = torch.empty(batch_size, 1, dtype=torch.long).fill_(1)

        for i in range(self._max_tokens):
            out, hidden_state, cell_state = self.forward_step(x_in, hidden_state, cell_state)
            outputs.append(out)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                x_in = target_tensor[:, i].unsqueeze(1)  # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = out.topk(1)
                x_in = topi.squeeze(-1).detach()  # detach from history as input

        outputs = torch.cat(outputs, dim=1)
        # outputs = F.log_softmax(outputs, dim=-1)
        return outputs, hidden_state, cell_state

    def forward_step(self, x_in, hidden_state, cell_state):
        output = self._embedding(x_in)
        output = F.relu(output)
        output, (h_t, c_t) = self._lstm(output, (hidden_state, cell_state))
        output = self._out(output)
        return output, h_t, c_t
```
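A quick way to see how the two modules fit together is a shape check on a dummy batch. A minimal sketch, assuming the cells above have been executed; the sizes below are arbitrary and not the ones used for training:

``` python
# Hypothetical shape check: 4 dummy sequences of 12 random token ids
dummy_ids = torch.randint(0, vocab_size, (4, 12))

enc = Encoder(input_size=embedding_dimension, hidden_size=64)
dec = Decoder(input_size=embedding_dimension, hidden_size=64,
              output_size=vocab_size, max_tokens=10)

enc_out, h, c = enc(dummy_ids)
print(enc_out.shape)        # torch.Size([4, 12, 64]): one hidden vector per source token
print(h.shape, c.shape)     # torch.Size([1, 4, 64]) each: final states handed to the decoder

# Without target_tensor the decoder feeds back its own argmax (no teacher forcing)
logits, h_n, c_n = dec(enc_out, h, c)
print(logits.shape)         # torch.Size([4, 10, vocab_size]): unnormalised scores per step

# With target_tensor the gold ids are used as decoder inputs (teacher forcing)
dummy_targets = torch.randint(0, vocab_size, (4, 10))
logits_tf, _, _ = dec(enc_out, h, c, target_tensor=dummy_targets)
```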
%% Cell type:markdown id:535bc20b2f12f2da tags:
## 4. Train the model
%% Cell type:code id:1f8d3152359f6658 tags:
``` python
LSTM_hidden_size = 500
max_tokens_per_sequence = 70

model = nn.Sequential(
    embedding_matrix_enc,
    Encoder(input_size=embedding_dimension, hidden_size=LSTM_hidden_size),
    Decoder(input_size=embedding_dimension, hidden_size=LSTM_hidden_size,
            output_size=vocab_size, max_tokens=max_tokens_per_sequence)
)

num_epochs = 30
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# create encoder / decoder instances
encoder = Encoder(input_size=embedding_dimension, hidden_size=LSTM_hidden_size)
decoder = Decoder(input_size=embedding_dimension, hidden_size=LSTM_hidden_size,
                  output_size=vocab_size, max_tokens=max_tokens_per_sequence)

for i in range(1, num_epochs + 1):
    # reset gradients
    optimizer.zero_grad()

    # get training data
    x_train, y_train = training_data(batch_size=32, max_tokens=max_tokens_per_sequence)

    # make prediction
    encoder_out, encoder_h, encoder_c = encoder(x_train)
    predict = decoder(encoder_out, encoder_h, encoder_c, y_train)
    predict = predict[0]

    # match dimensions of prediction & gold_label vector
    predict = predict.view(-1, predict.size(-1))
    y_train = y_train.view(-1)

    # calculate loss & propagate it backwards
    loss = criterion(predict, y_train)
    loss.backward()

    optimizer.step()

    if i % 10 == 0:
        print("---- Iteration " + str(i) + " ----")
        print("loss: " + str(loss.item()))
```
%% Output

---- Iteration 10 ----
loss: -0.045227549970149994
---- Iteration 20 ----
loss: -0.1404053419828415
---- Iteration 30 ----
loss: -0.22252100706100464
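The switch from torch.nn.NLLLoss to torch.nn.CrossEntropyLoss in this commit goes together with commenting out the F.log_softmax call in the decoder: CrossEntropyLoss expects raw logits and applies log-softmax internally, so the two setups compute the same value, and keeping the explicit log_softmax would apply the normalisation twice. A minimal check with random numbers:

``` python
# CrossEntropyLoss on logits == NLLLoss on log-softmaxed logits
logits = torch.randn(6, 10)              # 6 token positions, 10 classes
targets = torch.randint(0, 10, (6,))

ce = torch.nn.CrossEntropyLoss()(logits, targets)
nll = torch.nn.NLLLoss()(F.log_softmax(logits, dim=-1), targets)
print(torch.allclose(ce, nll))           # True
```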
%% Cell type:markdown id:44f9b74f91565a4a tags:
## 5. Sample from the model
%% Cell type:code id:b95fb365f686125d tags:
``` python
test_sequence = ("Hi are you there")

test_sequence_enc = tokenizer.encode(test_sequence)

print(test_sequence_enc.tokens)

test_sequence_batched = torch.tensor(test_sequence_enc.ids).view(1, -1)

encoder_out, encoder_h, encoder_c = encoder(test_sequence_batched)
predict, _, _ = decoder(encoder_out, encoder_h, encoder_c)

_, topi = predict.topk(1)
decoded_ids = topi.squeeze()
print(decoded_ids)
tokenizer.decode(list(decoded_ids))
```
%% Output
['[CLS]', 'H', 'i', 'are', 'you', 'there', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
tensor([ 1, 1, 1, 32460, 32460, 28617, 7251, 47089, 38628, 38628,
38628, 38495, 31040, 14290, 4593, 41094, 13045, 17103, 45127, 18564,
5320, 5320, 5320, 5320, 5320, 5320, 5320, 7784, 7784, 34640,
5320, 5320, 5320, 5320, 5320, 5320, 7784, 7784, 34640, 5320,
5320, 5320, 5320, 5320, 5320, 7784, 7784, 34640, 5320, 5320,
5320, 5320, 5320, 5320, 7784, 7784, 34640, 5320, 5320, 5320,
5320, 5320, 5320, 7784, 7784, 34640, 5320, 5320, 5320, 5320])
'Quantität Quantität drastischen committees Vermächtnis jana jana jana Aufrichtigkeit Fusionen ströme More Tellereisen Mod verschoben emaking auszubauen unch unch unch unch unch unch unch popul popul Bin unch unch unch unch unch unch popul popul Bin unch unch unch unch unch unch popul popul Bin unch unch unch unch unch unch popul popul Bin unch unch unch unch unch unch popul popul Bin unch unch unch unch'