Skip to content
Snippets Groups Projects
Commit 2ddf20a9 authored by Konstantin Julius Lotzgeselle's avatar Konstantin Julius Lotzgeselle :speech_balloon:
Browse files

Fixed loading of pretrained tokenizers.

parent d5779d6a
No related branches found
No related tags found
No related merge requests found
.vscode/settings.json
......@@ -72,10 +72,10 @@ def create_tokenizers(source_data_path: str, target_data_path: str, vocab_size:
# define tokenizer names
tokenizer_source_save_name = (f"tk-{source_data_path.split('/')[-1].replace('.', '-')}-"
f"sptk-{special_tokens if special_tokens is None else ''.join(special_tokens)}-"
f"vcsz-{str(vocab_size)}")
f"vcsz-{str(vocab_size)}.json")
tokenizer_target_save_name = (f"tk-{target_data_path.split('/')[-1].replace('.', '-')}-"
f"sptk-{special_tokens if special_tokens is None else ''.join(special_tokens)}-"
f"vcsz-{str(vocab_size)}")
f"vcsz-{str(vocab_size)}.json")
# setting the special tokens & checking for [UNK] token
if special_tokens is None:
......@@ -85,10 +85,8 @@ def create_tokenizers(source_data_path: str, target_data_path: str, vocab_size:
# check if tokenizer already exists (saved version)
try:
tokenizer_source = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer_target = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer_source.from_file(str(work_dir / f"tokenizer/{tokenizer_source_save_name}"))
tokenizer_target.from_file(str(work_dir / f"tokenizer/{tokenizer_target_save_name}"))
tokenizer_source = Tokenizer.from_file(str(work_dir / f"tokenizer/{tokenizer_source_save_name}"))
tokenizer_target = Tokenizer.from_file(str(work_dir / f"tokenizer/{tokenizer_target_save_name}"))
return tokenizer_source, tokenizer_target
except:
print("No matching tokenizer found on disk - A new tokenizer will be trained (and saved to disk)")
......
......@@ -11,3 +11,5 @@ dependencies:
- seaborn
- matplotlib
- tokenizers
- progressbar2
- prettytable
\ No newline at end of file
0% Loading or loading failed.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment