Commit c415e5c6 authored Jan 19, 2024 by marvnsch

Make functions universal

parent d72a2c64

Showing 3 changed files with 147 additions and 3 deletions:

- LSTM_without_attention.ipynb: 44 additions, 3 deletions
- data/preprocessing.py: 93 additions, 0 deletions
- models/RNN_no_attention_unidirectional.py: 10 additions, 0 deletions
LSTM_without_attention.ipynb  +44 −3
The notebook's source diff contains two hunks:

- `@@ -33,7 +33,40 @@` — the data-preparation cell's empty `"outputs": []` is replaced by the captured execution output: a stdout stream with the corpus size, the three sample sentence pairs and the line count shown in the rendered output below, followed by a `NameError: name '__file__' is not defined` traceback raised at `workdir = Path(__file__).parent.absolute()` (tokenizer-saving code that now lives in `data/preprocessing.py`).
- `@@ -115,14 +148,22 @@` — the same cell's metadata gains an `ExecuteTime` block (start `2024-01-19T13:27:00`, end `2024-01-19T13:27:50`) and its `execution_count` changes from `null` to `7`.
%% Cell type:code id:initial_id tags:
```python
import torch
import torch.nn as nn
import torch.optim as optim
import random

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
```
%% Cell type:markdown id:2b9477923b668978 tags:
# Data Preparation
%% Cell type:code id:dbc5f26f27746098 tags:
```python
def load_data() -> tuple[list[str], list[str]]:
    with open("data/training-data/eup/europarl-v7.de-en.de", "r", encoding="utf8") as f:
        data_de = [line.rstrip("\n") for line in f]
    with open("data/training-data/eup/europarl-v7.de-en.en", "r", encoding="utf8") as f:
        data_en = [line.rstrip("\n") for line in f]

    ltd = set()  # save lines to delete later
    for i in range(max(len(data_de), len(data_en))):
        # If a line is empty in one file, merge the other file's sentence into the next line
        if data_de[i] == "":
            data_en[i + 1] = data_en[i] + " " + data_en[i + 1]
            ltd.add(i)
        if data_en[i] == "":
            data_de[i + 1] = data_de[i] + " " + data_de[i + 1]
            ltd.add(i)
        # Remove lines where the difference in word count is > 40%
        if abs(count_words(data_de[i]) - count_words(data_en[i])) / (
                max(count_words(data_de[i]), count_words(data_en[i])) + 1) > 0.4:
            ltd.add(i)
        # Remove lines with < 3 words or > 10 words
        if (max(count_words(data_de[i]), count_words(data_en[i])) < 3
                or max(count_words(data_de[i]), count_words(data_en[i])) > 10):
            ltd.add(i)

    temp_de = [l for i, l in enumerate(data_de) if i not in ltd]
    data_de = temp_de
    temp_en = [l for i, l in enumerate(data_en) if i not in ltd]
    data_en = temp_en

    print(len(data_de), len(data_en))

    # Print 3 random sentence pairs
    ix = torch.randint(low=0, high=max(len(data_de), len(data_en)), size=(3,))
    for i in ix:
        print(f"Zeile: {i}\nDeutsch: {data_de[i]}\nEnglish: {data_en[i]}\n")
    print(f"\nNumber of lines: {len(data_de), len(data_en)}")

    return data_de, data_en


def count_words(string: str) -> int:
    return len(string.split())


de, en = load_data()

# setting the unknown token (e.g. for emojis)
tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer_de = Tokenizer(BPE(unk_token="[UNK]"))

# adding special tokens
# [UNK] : unknown word/token
# [SOS] : start-of-sentence token
# [EOS] : end-of-sentence token
# [PAD] : padding needed for encoder input
trainer = BpeTrainer(vocab_size=10000, special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]"])

tokenizer_en.pre_tokenizer = Whitespace()
tokenizer_de.pre_tokenizer = Whitespace()

tokenizer_en.train(["data/training-data/eup/europarl-v7.de-en.en"], trainer)
tokenizer_de.train(["data/training-data/eup/europarl-v7.de-en.de"], trainer)

# configure post processing
tokenizer_en.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    special_tokens=[
        ("[SOS]", tokenizer_en.token_to_id("[SOS]")),
        ("[EOS]", tokenizer_en.token_to_id("[EOS]")),
    ],
)
tokenizer_de.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    special_tokens=[
        ("[SOS]", tokenizer_de.token_to_id("[SOS]")),
        ("[EOS]", tokenizer_de.token_to_id("[EOS]")),
    ],
)

target_vocab_size = tokenizer_de.get_vocab_size()
source_vocab_size = tokenizer_en.get_vocab_size()
```
%% Output
219633 219633
Zeile: 48925
Deutsch: Die Sicherheit sollte dabei einen hohen Stellenwert haben.
English: Safety should assume an important place.
Zeile: 183033
Deutsch: Aber der Bericht Azzolini bringt neue und wertvolle Denkanstöße.
English: But the Azzolini report contributes new and worthwhile approaches.
Zeile: 210873
Deutsch: Wir sind da also tief in Widersprüche verstrickt.
English: We are therefore overwhelmed with contradictions.
Number of lines: (219633, 219633)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[7], line 84
73 tokenizer_de.post_processor = TemplateProcessing(
74 single="[SOS] $A [EOS]",
75 special_tokens=[
(...)
78 ],
79 )
83 from pathlib import Path
---> 84 workdir = Path(__file__).parent.absolute()
86 tokenizer_de.save(str(workdir / "tokenizer_de.json"))
88 target_vocab_size = tokenizer_de.get_vocab_size()
NameError: name '__file__' is not defined
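The `NameError` above comes from running the cell while it still contained the tokenizer-saving code that this commit moves into `data/preprocessing.py`: `__file__` is only defined for scripts and modules, not inside a notebook kernel. A small sketch of a fallback that works in both contexts (the `workdir` name mirrors the code in the traceback):

```python
from pathlib import Path

# Sketch: resolve a working directory whether the code runs as a module
# (where __file__ exists) or inside a notebook kernel (where it does not).
try:
    workdir = Path(__file__).parent.absolute()
except NameError:          # no __file__ in an interactive/notebook session
    workdir = Path.cwd()   # fall back to the current working directory

# e.g. tokenizer_de.save(str(workdir / "tokenizer_de.json"))
```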
%% Cell type:code id:dac2a6b0b10d6bdf tags:
```python
# Define Device
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("device: cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("device: mps")
else:
    device = torch.device("cpu")
    print("device: cpu")
```
%% Cell type:code id:8edfacb67dc8c527 tags:
```python
def training_data(source: list[str], target: list[str], dataset_size: int,
                  batch_size: int = 64, sort: bool = True) -> tuple[torch.tensor, torch.tensor]:
    tokenizer_de.no_padding()
    tokenizer_en.no_padding()

    if dataset_size > len(source):
        raise IndexError("Dataset size is larger than the source data")

    # sort the training data if true
    if sort:
        temp = [list(a) for a in zip(source[:dataset_size], target[:dataset_size])]
        temp.sort(key=lambda s: len(s[0]) + len(s[1]))
        source, target = list(zip(*temp))

    # iterate over the (sorted) sentence pairs in batches
    for i in range(0, len(source) - batch_size, batch_size):
        x_training_data = source[i:i + batch_size]
        y_training_data = target[i:i + batch_size]

        # tokenize data
        tokenizer_en.enable_padding(pad_id=3)
        x_training_data = tokenizer_en.encode_batch(x_training_data)
        tokenizer_de.enable_padding(pad_id=3)
        y_training_data = tokenizer_de.encode_batch(y_training_data)

        # extract ids for every sequence
        for j in range(batch_size):
            x_training_data[j] = x_training_data[j].ids
            y_training_data[j] = y_training_data[j].ids

        # put data into tensors
        x_training_data = torch.tensor(x_training_data, device=device)
        y_training_data = torch.tensor(y_training_data, device=device)

        # transpose tensors to match input requirements for the lstm
        x_training_data = torch.transpose(x_training_data, 0, 1)
        y_training_data = torch.transpose(y_training_data, 0, 1)

        yield x_training_data, y_training_data
```
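The generator pads with `pad_id=3`, which matches `[PAD]` being the fourth entry in the trainer's `special_tokens` list, and yields tensors already transposed to `(sequence_len, batch_size)`, the layout `nn.LSTM` expects with `batch_first` left at its default. A minimal usage sketch, assuming the `en`/`de` lists, both tokenizers and `device` from the cells above are in scope (the sizes are arbitrary illustration values):

```python
# Inspect a single batch produced by the generator (sketch; sizes are arbitrary).
for x_batch, y_batch in training_data(source=en, target=de, dataset_size=1000, batch_size=8):
    print(x_batch.shape, x_batch.dtype)  # torch.Size([src_seq_len, 8]) torch.int64
    print(y_batch.shape, y_batch.dtype)  # torch.Size([tgt_seq_len, 8]) torch.int64
    break  # only look at the first batch
```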
%% Cell type:markdown id:ca6d3d436fd31e33 tags:
### Model Definition
%% Cell type:code id:3b2c4dbc74a1f144 tags:
```python
# Prepare model
class Encoder(nn.Module):
    def __init__(self, input_size: int, embedding_size: int, hidden_size: int, num_layers: int):
        super(Encoder, self).__init__()
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size, device=device)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size,
                           num_layers=num_layers, device=device)

    def forward(self, x):
        # shape x : (sequence_len, batch_size)
        embedding = self.embedding(x)
        # shape embedding : (sequence_len, batch_size, embedding_size)
        output, (hidden, cell) = self.rnn(embedding)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, input_size: int, embedding_size: int, hidden_size: int,
                 num_layers: int, output_size: int):
        super(Decoder, self).__init__()
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size, device=device)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size,
                           num_layers=num_layers, device=device)
        self.fc = nn.Linear(hidden_size, output_size, device=device)

    def forward(self, x, hidden, cell):
        x = x.reshape(1, -1)
        # shape x : (1, batch_size)
        embedding = self.embedding(x)
        # embedding shape : (1, batch_size, embedding_size)
        dec_output, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # shape output : (1, batch_size, hidden_size)
        predictions = self.fc(dec_output)
        # shape predictions : (1, batch_size, vocab_len)
        predictions = predictions.squeeze(0)
        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target=None, teacher_forcing_ratio: float = 0.5):
        dec_batch_size = source.shape[1]
        target_len = target.shape[0]

        outputs = torch.zeros(target_len, dec_batch_size, target_vocab_size, device=device)

        hidden, cell = self.encoder(source)

        x = target[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            if target is not None:
                x = target[t] if random.random() < teacher_forcing_ratio else best_guess
            else:
                x = best_guess

        return outputs


# DEBUG

# training hyperparameters
num_epochs = 50
learning_rate = 0.001
batch_size = 64
dataset_size = 100000

# model hyperparameters
input_size_encoder = source_vocab_size
input_size_decoder = target_vocab_size
output_size_decoder = target_vocab_size
encoder_embedding_size = 300
decoder_embedding_size = 300
model_hidden_size = 1024
model_num_layers = 2

encoder_net = Encoder(input_size=input_size_encoder,
                      embedding_size=encoder_embedding_size,
                      hidden_size=model_hidden_size,
                      num_layers=model_num_layers)

decoder_net = Decoder(input_size=input_size_decoder,
                      embedding_size=decoder_embedding_size,
                      hidden_size=model_hidden_size,
                      num_layers=model_num_layers,
                      output_size=output_size_decoder)

model = Seq2Seq(encoder=encoder_net, decoder=decoder_net)
model.train()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch + 1, num_epochs))
    loss_value = 0
    for batch_idx, (x_train, y_train) in enumerate(training_data(source=en, target=de,
                                                                 dataset_size=dataset_size,
                                                                 batch_size=batch_size)):
        optimizer.zero_grad()
        predict = model(x_train, y_train)
        predict = predict[1:].reshape(-1, predict.shape[2])
        y_train = y_train[1:].reshape(-1)
        #predict = predict.reshape(-1, predict.shape[2])
        #y_train = y_train.reshape(-1)
        loss = criterion(predict, y_train)
        loss.backward()
        optimizer.step()
        loss_value += loss.item()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    print("loss: " + str(loss_value / (dataset_size / batch_size)))
```
%% Output
Epoch 1/50
loss: 3.251950553474426
Epoch 2/50
loss: 2.5886474338912966
Epoch 3/50
loss: 2.2590867259407044
Epoch 4/50
loss: 1.9914781001472472
Epoch 5/50
loss: 1.7620788085746766
Epoch 6/50
loss: 1.526157800474167
Epoch 7/50
loss: 1.3238139695024491
Epoch 8/50
loss: 1.1473209317588806
Epoch 9/50
loss: 0.9837613027739525
Epoch 10/50
loss: 0.8556473323225975
Epoch 11/50
loss: 0.7408420387899876
Epoch 12/50
loss: 0.6378299428713322
Epoch 13/50
loss: 0.5609761751067638
Epoch 14/50
loss: 0.5038776994109154
Epoch 15/50
loss: 0.4588668634200096
Epoch 16/50
loss: 0.41355706704318523
Epoch 17/50
loss: 0.3838378005027771
Epoch 18/50
loss: 0.3672341006922722
Epoch 19/50
loss: 0.3412713519346714
Epoch 20/50
loss: 0.3302113852745295
Epoch 21/50
loss: 0.32545686868429186
Epoch 22/50
loss: 0.31084906596302986
Epoch 23/50
loss: 0.30543883962869645
Epoch 24/50
loss: 0.3089831199032068
Epoch 25/50
loss: 0.29750473050653936
Epoch 26/50
loss: 0.2859128334259987
Epoch 27/50
loss: 0.2887848159578443
Epoch 28/50
loss: 0.2810410741758347
Epoch 29/50
loss: 0.28322928114712237
Epoch 30/50
%% Cell type:markdown id:9854eaee8392caa1 tags:
### Model Parameters
%% Cell type:code id:a0d73467f967ecd9 tags:
```python
from prettytable import PrettyTable


def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params


count_parameters(model)
```
%% Cell type:markdown id:ea6107f129162137 tags:
### Test the model
%% Cell type:code id:fa8a86342abe0a97 tags:
```python
# test the model
test_sentence_en = "Can you reach out to me?"
test_sentence_de = "Kannst du mit mir in Kontakt treten."

test_sentence_en_encoded = tokenizer_en.encode(test_sentence_en)
test_sentence_de_encoded = tokenizer_de.encode(test_sentence_de)

target_vector = torch.zeros(len(test_sentence_de_encoded.ids), 1)

model.eval()

x_test = torch.transpose(torch.tensor([test_sentence_en_encoded.ids], device=device), 0, 1)
y_test = torch.transpose(torch.tensor([test_sentence_de_encoded.ids], device=device), 0, 1)
print(y_test.shape)

prediction = model(x_test, y_test, teacher_forcing_ratio=0.0)
logits = torch.nn.functional.softmax(prediction, dim=2)
result_ids = logits.argmax(dim=2)

print(tokenizer_de.decode(list(result_ids)))
```
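Note that `result_ids` coming out of `argmax(dim=2)` still has shape `(target_len, 1)`, so `list(result_ids)` hands one-element tensors rather than plain token ids to the decoder. A hedged variant of the final decode step that flattens to Python ints first, assuming the variables from the cell above (`skip_special_tokens` is the tokenizers-library flag for dropping tokens such as `[SOS]`/`[EOS]`/`[PAD]`):

```python
# Flatten (target_len, 1) -> list[int] before decoding (sketch).
ids = result_ids.squeeze(1).tolist()
print(tokenizer_de.decode(ids, skip_special_tokens=True))
```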
...
...
data/preprocessing.py  (new file, mode 100644)  +93 −0
```python
from pathlib import Path

import torch
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing


def get_prepared_data(source_data_path: str, target_data_path: str) -> tuple[list[str], list[str]]:
    with open(source_data_path, "r", encoding="utf8") as f:
        source_data = [line.rstrip("\n") for line in f]
    with open(target_data_path, "r", encoding="utf8") as f:
        target_data = [line.rstrip("\n") for line in f]

    ltd = set()  # save lines to delete later
    for i in range(max(len(source_data), len(target_data))):
        # If a line is empty in one file, merge the other file's sentence into the next line
        if source_data[i] == "":
            target_data[i + 1] = target_data[i] + " " + target_data[i + 1]
            ltd.add(i)
        if target_data[i] == "":
            source_data[i + 1] = source_data[i] + " " + source_data[i + 1]
            ltd.add(i)
        # Remove lines where the difference in word count is > 40%
        if abs(count_words(source_data[i]) - count_words(target_data[i])) / (
                max(count_words(source_data[i]), count_words(target_data[i])) + 1) > 0.4:
            ltd.add(i)
        # Remove lines with < 3 words or > 10 words
        if (max(count_words(source_data[i]), count_words(target_data[i])) < 3
                or max(count_words(source_data[i]), count_words(target_data[i])) > 10):
            ltd.add(i)

    temp_source = [l for i, l in enumerate(source_data) if i not in ltd]
    source_data = temp_source
    temp_target = [l for i, l in enumerate(target_data) if i not in ltd]
    target_data = temp_target

    print(len(source_data), len(target_data))

    # Print 3 random sentence pairs
    ix = torch.randint(low=0, high=max(len(source_data), len(target_data)), size=(3,))
    for i in ix:
        print(f"Zeile: {i}\nDeutsch: {source_data[i]}\nEnglish: {target_data[i]}\n")
    print(f"\nNumber of lines: {len(source_data), len(target_data)}")

    return source_data, target_data


def create_tokenizers(source_data_path: str, target_data_path: str,
                      source_language: str, target_language: str):
    # setting the unknown token (e.g. for emojis)
    tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer_de = Tokenizer(BPE(unk_token="[UNK]"))

    # adding special tokens
    # [UNK] : unknown word/token
    # [SOS] : start-of-sentence token
    # [EOS] : end-of-sentence token
    # [PAD] : padding needed for encoder input
    trainer = BpeTrainer(vocab_size=10000, special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]"])

    tokenizer_en.pre_tokenizer = Whitespace()
    tokenizer_de.pre_tokenizer = Whitespace()

    tokenizer_en.train(["data/training-data/eup/europarl-v7.de-en.en"], trainer)
    tokenizer_de.train(["data/training-data/eup/europarl-v7.de-en.de"], trainer)

    # configure post processing
    tokenizer_en.post_processor = TemplateProcessing(
        single="[SOS] $A [EOS]",
        special_tokens=[
            ("[SOS]", tokenizer_en.token_to_id("[SOS]")),
            ("[EOS]", tokenizer_en.token_to_id("[EOS]")),
        ],
    )
    tokenizer_de.post_processor = TemplateProcessing(
        single="[SOS] $A [EOS]",
        special_tokens=[
            ("[SOS]", tokenizer_de.token_to_id("[SOS]")),
            ("[EOS]", tokenizer_de.token_to_id("[EOS]")),
        ],
    )

    workdir = Path(__file__).parent.absolute()
    tokenizer_de.save(str(workdir / "tokenizer_de.json"))


def count_words(string: str) -> int:
    return len(string.split())
```
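The new module bundles the notebook's data cleaning and tokenizer training into reusable functions. As committed here, `create_tokenizers` still trains on the hard-coded Europarl paths and only saves `tokenizer_de.json`, so its path and language arguments are not used yet. A hypothetical usage sketch (the `from data import preprocessing` import path and the source/target assignment are assumptions based on the function signatures; the file paths are the ones used in this repository):

```python
# Hypothetical usage of the new helpers in data/preprocessing.py.
from data import preprocessing

de_sentences, en_sentences = preprocessing.get_prepared_data(
    source_data_path="data/training-data/eup/europarl-v7.de-en.de",
    target_data_path="data/training-data/eup/europarl-v7.de-en.en",
)
preprocessing.create_tokenizers(
    source_data_path="data/training-data/eup/europarl-v7.de-en.en",
    target_data_path="data/training-data/eup/europarl-v7.de-en.de",
    source_language="en",
    target_language="de",
)
```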
models/RNN_no_attention_unidirectional.py  +10 −0
```python
import torch
import torch.nn as nn
import torch.optim as optim
import random

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
```
\ No newline at end of file