Commit 0dfe5c89 authored by Konstantin Julius Lotzgeselle

Added debug flag to get_prepared_data() function

parent 0f1ffd05
@@ -15,7 +15,8 @@ def get_prepared_data(source_data_path: str,
                       min_seq_len: int = 1,
                       max_seq_len: int = 20,
                       filter: float = 0.5,
-                      seperator_symbol: str = "\n") -> tuple[list[str], list[str]]:
+                      seperator_symbol: str = "\n",
+                      debug: bool = False) -> tuple[list[str], list[str]]:
     with open(source_data_path, "r", encoding=encoding, newline=seperator_symbol) as f:
         source_data = [line.rstrip("\n") for line in f]
@@ -52,6 +53,7 @@ def get_prepared_data(source_data_path: str,
     target_data = temp_target
     print(len(source_data),len(target_data))
+    if debug:
         # Print 3 random sentence pairs
         ix = torch.randint(low=0, high=min(len(source_data), len(target_data)), size=(3, ))
         for i in ix:
@@ -64,7 +66,7 @@ def get_prepared_data(source_data_path: str,
 def count_words(string: str) -> int:
     return len(string.split())
 # get_prepared_data("data/tokenizer-data/news-commentary-v11.de", "data/tokenizer-data/news-commentary-v11.en")
 def create_tokenizers(source_data_path: str, target_data_path: str, source_language: str, target_language: str):
     # setting the unknown token (e.g. for emojis)
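For context, here is a minimal, self-contained sketch of the behavior the new debug flag guards: sampling a few random aligned source/target sentence pairs and printing them as a sanity check. The helper name preview_random_pairs and its standalone form are assumptions for illustration; in the commit itself the logic lives inline in get_prepared_data() behind the added `if debug:` branch.

import torch

def preview_random_pairs(source_data: list[str], target_data: list[str], k: int = 3) -> None:
    # Sample k random indices that are valid for both lists (mirrors the torch.randint call in the diff)
    ix = torch.randint(low=0, high=min(len(source_data), len(target_data)), size=(k,))
    for i in ix:
        # Print the aligned source/target sentences so the pairing can be eyeballed
        print(source_data[i])
        print(target_data[i])
        print()

# Hypothetical usage after this commit: passing debug=True to get_prepared_data()
# would trigger the equivalent printout while preparing the data.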