Commit 95af51fb authored by Konstantin Julius Lotzgeselle

Minor adjustments

parent 7aaa9046
@@ -20,6 +20,10 @@
{
"cell_type": "code",
"execution_count": null,
"id": "f7c39c06ce3a14db",
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# split the data into training/dev/test\n",
@@ -33,25 +37,25 @@
"for id in idx:\n",
" print(id.item())\n",
" print(\"ENG: \" + data_array_en[id.item()] + \"DEU: \" + data_array_de[id.item()])"
],
"metadata": {
"collapsed": false
},
"id": "f7c39c06ce3a14db"
]
},
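The cell above samples a few sentence pairs after splitting the parallel corpus into training/dev/test. A minimal sketch of such a split; the use of torch.randperm and the 80/10/10 ratio are assumptions, only data_array_en, data_array_de and idx appear in the visible lines:

import torch

data_array_en = ["Good morning.", "How are you?", "See you soon."]
data_array_de = ["Guten Morgen.", "Wie geht es dir?", "Bis bald."]

n = len(data_array_en)
idx = torch.randperm(n)                      # shuffled sentence indices
n_train = int(0.8 * n)
n_dev = int(0.1 * n)
train_idx = idx[:n_train]
dev_idx = idx[n_train:n_train + n_dev]
test_idx = idx[n_train + n_dev:]

for id in idx[:2]:                           # inspect a couple of pairs
    print("ENG: " + data_array_en[id.item()] + " DEU: " + data_array_de[id.item()])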
{
"cell_type": "markdown",
"source": [
"## 1. Text tokenization"
],
"id": "f2beddcc4122495a",
"metadata": {
"collapsed": false
},
"id": "f2beddcc4122495a"
"source": [
"## 1. Text tokenization"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d8ccbafa97fba573",
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# set up the tokenizer\n",
@@ -74,15 +78,15 @@
"# set up the pre-tokenizer -> this ensures, that the maximal token length is one word\n",
"from tokenizers.pre_tokenizers import Whitespace\n",
"tokenizer.pre_tokenizer = Whitespace()"
],
"metadata": {
"collapsed": false
},
"id": "d8ccbafa97fba573"
]
},
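The tokenizer cell attaches a Whitespace pre-tokenizer so that, as the comment notes, no learned token can span more than one word. A small sketch of what that pre-tokenizer does on its own, assuming the Hugging Face tokenizers library:

from tokenizers.pre_tokenizers import Whitespace

# Splits on whitespace and punctuation before the BPE model runs.
print(Whitespace().pre_tokenize_str("Guten Morgen, Welt!"))
# [('Guten', (0, 5)), ('Morgen', (6, 12)), (',', (12, 13)), ('Welt', (14, 18)), ('!', (18, 19))]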
{
"cell_type": "code",
"execution_count": null,
"id": "55cbac65a50a0199",
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"tokenizer.train(['./data/training-data/dev/newstest2013.en'], trainer)\n",
@@ -96,15 +100,15 @@
" (\"[SEP]\", tokenizer.token_to_id(\"[SEP]\")),\n",
" ],\n",
")"
],
"metadata": {
"collapsed": false
},
"id": "55cbac65a50a0199"
]
},
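This cell trains the tokenizer on the newstest2013 dev data and adds a post-processor built around the [SEP] special token. A minimal, self-contained sketch of that pipeline, assuming a BPE model and the TemplateProcessing API from the Hugging Face tokenizers library; the single-sequence template and the special-token list are assumptions consistent with the [SEP] pair visible above:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

# Word-bounded BPE tokenizer, as in the setup cell above.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]", "[SEP]", "[PAD]"])

# The English dev file appears in the diff; training on the German side as
# well is an assumption.
tokenizer.train(['./data/training-data/dev/newstest2013.en'], trainer)

# Append [SEP] to every encoded sequence.
tokenizer.post_processor = TemplateProcessing(
    single="$A [SEP]",
    special_tokens=[("[SEP]", tokenizer.token_to_id("[SEP]"))],
)

print(tokenizer.get_vocab_size())
print(tokenizer.encode("This is a test sentence.").tokens)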
{
"cell_type": "code",
"execution_count": null,
"id": "569b9a3425aa5800",
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(tokenizer.get_vocab_size())\n",
@@ -120,25 +124,25 @@
"\n",
"print(test_de.tokens)\n",
"#print(test_de.ids)"
],
"metadata": {
"collapsed": false
},
"id": "569b9a3425aa5800"
]
},
{
"cell_type": "markdown",
"source": [
"## 2. Prepare the training data"
],
"id": "9c0f853775a802ec",
"metadata": {
"collapsed": false
},
"id": "9c0f853775a802ec"
"source": [
"## 2. Prepare the training data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2e4dc87ce98b6cdd",
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Prepare training batch\n",
@@ -164,25 +168,29 @@
" return torch.tensor(x_training_data), torch.tensor(y_training_data)\n",
"\n",
"print(training_data())"
],
"metadata": {
"collapsed": false
},
"id": "2e4dc87ce98b6cdd"
]
},
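The training_data() helper above returns tensors built from the tokenized sentence pairs. A minimal sketch of such a batching step: encode each pair to token ids and right-pad to a rectangle so the lists can be stacked into tensors. The pad id 0, the helper name make_batch and the stand-in encoder are assumptions:

import torch

def make_batch(encode, sentences_en, sentences_de, pad_id=0):
    x_ids = [encode(s) for s in sentences_en]   # source-side id sequences
    y_ids = [encode(s) for s in sentences_de]   # target-side id sequences
    max_x = max(len(s) for s in x_ids)
    max_y = max(len(s) for s in y_ids)
    # Right-pad every sequence to the longest one in the batch.
    x = torch.tensor([s + [pad_id] * (max_x - len(s)) for s in x_ids])
    y = torch.tensor([s + [pad_id] * (max_y - len(s)) for s in y_ids])
    return x, y

# Stand-in encoder just to show the shapes; the notebook would pass
# something like `lambda s: tokenizer.encode(s).ids` instead.
toy_encode = lambda s: [hash(w) % 100 for w in s.split()]
x, y = make_batch(toy_encode,
                  ["Good morning .", "How are you ?"],
                  ["Guten Morgen .", "Wie geht es dir ?"])
print(x.shape, y.shape)   # torch.Size([2, 4]) torch.Size([2, 5])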
{
"cell_type": "markdown",
"source": [
"## 3. Build the sequence2sequence RNN"
],
"id": "689e2e565cce2845",
"metadata": {
"collapsed": false
},
"id": "689e2e565cce2845"
"source": [
"## 3. Build the sequence2sequence RNN"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "e8d99510479108f4",
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-06T14:20:39.964393Z",
"start_time": "2024-01-06T14:20:39.859015Z"
},
"collapsed": false
},
"outputs": [
{
"name": "stdout",
@@ -708,25 +716,17 @@
"\n",
"train = training_data()\n",
"print(model(train[0]))"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-06T14:20:39.964393Z",
"start_time": "2024-01-06T14:20:39.859015Z"
}
},
"id": "e8d99510479108f4"
]
},
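The model cell builds the sequence-to-sequence RNN and runs it on a training batch; in the visible lines it is called with only the source tensor. A minimal sketch of a teacher-forced encoder-decoder GRU in PyTorch, offered as an illustration rather than a reconstruction of that cell; all names and sizes (Seq2SeqRNN, VOCAB_SIZE, EMB_DIM, HIDDEN_DIM) are assumptions:

import torch
import torch.nn as nn

VOCAB_SIZE = 8000   # assumed tokenizer vocabulary size
EMB_DIM = 128
HIDDEN_DIM = 256

class Seq2SeqRNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(VOCAB_SIZE, EMB_DIM)
        self.encoder = nn.GRU(EMB_DIM, HIDDEN_DIM, batch_first=True)
        self.decoder = nn.GRU(EMB_DIM, HIDDEN_DIM, batch_first=True)
        self.out = nn.Linear(HIDDEN_DIM, VOCAB_SIZE)

    def forward(self, src_ids, tgt_ids):
        # Encode the source sentence into a final hidden state.
        _, hidden = self.encoder(self.embedding(src_ids))
        # Decode the target conditioned on that state (teacher forcing:
        # the gold target tokens are fed as decoder input).
        dec_out, _ = self.decoder(self.embedding(tgt_ids), hidden)
        return self.out(dec_out)   # per-position logits over the vocabulary

model = Seq2SeqRNN()
src = torch.randint(0, VOCAB_SIZE, (4, 12))   # batch of 4 source sequences
tgt = torch.randint(0, VOCAB_SIZE, (4, 10))   # batch of 4 target sequences
print(model(src, tgt).shape)                  # torch.Size([4, 10, 8000])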
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"id": "1f8d3152359f6658",
"metadata": {
"collapsed": false
},
"id": "1f8d3152359f6658"
"outputs": [],
"source": []
}
],
"metadata": {
@@ -745,7 +745,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
"version": "3.11.7"
}
},
"nbformat": 4,