From c1d362bab23beebfbdf380372d8e05e51dd7bf60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 8 Sep 2023 18:56:35 +0300 Subject: [PATCH 01/68] my first commit --- main.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 main.py diff --git a/main.py b/main.py new file mode 100644 index 000000000..8ea4c734f --- /dev/null +++ b/main.py @@ -0,0 +1 @@ +print ('Hello world!') \ No newline at end of file From ac7d44be03634acd1bae3cd58c6008c800e54add Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 8 Sep 2023 18:59:49 +0300 Subject: [PATCH 02/68] my second commit --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 8ea4c734f..7175d0851 100644 --- a/main.py +++ b/main.py @@ -1 +1 @@ -print ('Hello world!') \ No newline at end of file +print ('Hello world!1') \ No newline at end of file From d4d0735fc90cd2d170dc5485b59dfc699a2e3791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sat, 9 Sep 2023 15:12:14 +0300 Subject: [PATCH 03/68] my second commit --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 7175d0851..1f8916570 100644 --- a/main.py +++ b/main.py @@ -1 +1 @@ -print ('Hello world!1') \ No newline at end of file +print ('Hello world!11') \ No newline at end of file From 7b609f5808b3ff82017a41253cafba9457715505 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 22 Sep 2023 11:05:35 +0300 Subject: [PATCH 04/68] file deleted --- main.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 main.py diff --git a/main.py b/main.py deleted file mode 100644 index 1f8916570..000000000 --- a/main.py +++ /dev/null @@ -1 +0,0 @@ -print ('Hello world!11') \ No newline at end of file From 833c00a32296383e2f644094e99aa9efe2e66078 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 26 Sep 2023 16:14:13 +0300 Subject: [PATCH 05/68] file deleted --- lab_1_classify_by_unigrams/main.py | 11 ++++++++++- lab_1_classify_by_unigrams/start.py | 6 ++++++ lab_1_classify_by_unigrams/target_score.txt | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 486b3d65c..113970407 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -11,7 +11,16 @@ def tokenize(text: str) -> list[str] | None: :param text: a text :return: a list of lower-cased tokens without punctuation """ - + if text is str == False: + return None + else: + list_of_tokens = "" + for token in text: + if token.isalpha(): + new_token = token.lower() + list_of_tokens += new_token + tokens = list(list_of_tokens) + return(tokens) def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index db7a1a904..ffb91de3d 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -2,6 +2,8 @@ Language detection starter """ +import lab_1_classify_by_unigrams.main + def main() -> None: """ @@ -9,10 +11,14 @@ def 
main() -> None: """ with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: en_text = file_to_read_en.read() + en_tokens = lab_1_classify_by_unigrams.main.tokenize(en_text) + print(en_tokens) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() + de_tokens = lab_1_classify_by_unigrams.main.tokenize(de_text) with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() + unknown_tokens = lab_1_classify_by_unigrams.main.tokenize(unknown_text) result = None assert result, "Detection result is None" diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt index 573541ac9..bf0d87ab1 100644 --- a/lab_1_classify_by_unigrams/target_score.txt +++ b/lab_1_classify_by_unigrams/target_score.txt @@ -1 +1 @@ -0 +4 \ No newline at end of file From 5d2bdbe300c6c8e52c8b88bd87a78b6eeff7d74f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 26 Sep 2023 17:35:31 +0300 Subject: [PATCH 06/68] file deleted --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 9b296ea64..87ff333ee 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -11,7 +11,7 @@ def tokenize(text: str) -> list[str] | None: :param text: a text :return: a list of lower-cased tokens without punctuation """ - if text is str == False: + if isinstance(text, str) == False: return None else: list_of_tokens = "" From 42feb0c0a60cfbf97e562276cb0b8bf7fe4788b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 28 Sep 2023 22:09:44 +0300 Subject: [PATCH 07/68] file deleted --- lab_1_classify_by_unigrams/main.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 87ff333ee..48d771c3c 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -13,14 +13,14 @@ def tokenize(text: str) -> list[str] | None: """ if isinstance(text, str) == False: return None - else: - list_of_tokens = "" - for token in text: - if token.isalpha(): - new_token = token.lower() - list_of_tokens += new_token - tokens = list(list_of_tokens) - return(tokens) + + list_of_tokens = "" + for token in text: + if token.isalpha(): + new_token = token.lower() + list_of_tokens += new_token + tokens = list(list_of_tokens) + return tokens def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ From 658e324e003ff299695cdb1fc20dffe0831dadb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 28 Sep 2023 22:18:21 +0300 Subject: [PATCH 08/68] file deleted --- lab_1_classify_by_unigrams/start.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index ffb91de3d..8fffde339 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -19,8 +19,8 @@ def main() -> None: with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = 
file_to_read_unk.read() unknown_tokens = lab_1_classify_by_unigrams.main.tokenize(unknown_text) - result = None - assert result, "Detection result is None" + #result = None + #assert result, "Detection result is None" if __name__ == "__main__": From 6e3a82b96178041eb0e7bcac45f37201d90a441d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 1 Oct 2023 19:16:46 +0300 Subject: [PATCH 09/68] calculated frequencies --- lab_1_classify_by_unigrams/main.py | 10 ++++++++++ lab_1_classify_by_unigrams/start.py | 2 ++ lab_1_classify_by_unigrams/target_score.txt | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 48d771c3c..a36d78581 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -28,6 +28,16 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: :param tokens: a list of tokens :return: a dictionary with frequencies """ + if not isinstance(tokens, list) or not all(isinstance(letter, str) for letter in tokens): + return None + list_of_tokens = "" + dict_of_frequencies = {} + for token in tokens: + if token not in list_of_tokens: + list_of_tokens += token + frequency = list_of_tokens.count(token)/len(tokens) + dict_of_frequencies[token] = frequency + return dict_of_frequencies def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None: diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 8fffde339..2a6c1a3ba 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -13,6 +13,8 @@ def main() -> None: en_text = file_to_read_en.read() en_tokens = lab_1_classify_by_unigrams.main.tokenize(en_text) print(en_tokens) + create_language_profile = lab_1_classify_by_unigrams.main.calculate_frequencies(en_tokens) + print(create_language_profile) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() de_tokens = lab_1_classify_by_unigrams.main.tokenize(de_text) diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt index bf0d87ab1..62f945751 100644 --- a/lab_1_classify_by_unigrams/target_score.txt +++ b/lab_1_classify_by_unigrams/target_score.txt @@ -1 +1 @@ -4 \ No newline at end of file +6 \ No newline at end of file From 58f3cd55c3a2d1959479f771dae40f90bed9578a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 1 Oct 2023 20:03:57 +0300 Subject: [PATCH 10/68] profiles created --- lab_1_classify_by_unigrams/main.py | 13 ++++++++++++- lab_1_classify_by_unigrams/start.py | 8 ++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index a36d78581..b8c03ef43 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -11,7 +11,7 @@ def tokenize(text: str) -> list[str] | None: :param text: a text :return: a list of lower-cased tokens without punctuation """ - if isinstance(text, str) == False: + if not isinstance(text, str): return None list_of_tokens = "" @@ -20,6 +20,7 @@ def tokenize(text: str) -> list[str] | None: new_token = token.lower() list_of_tokens += new_token tokens = list(list_of_tokens) + return tokens def 
calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: @@ -30,6 +31,7 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ if not isinstance(tokens, list) or not all(isinstance(letter, str) for letter in tokens): return None + list_of_tokens = "" dict_of_frequencies = {} for token in tokens: @@ -37,6 +39,7 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: list_of_tokens += token frequency = list_of_tokens.count(token)/len(tokens) dict_of_frequencies[token] = frequency + return dict_of_frequencies @@ -47,6 +50,14 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st :param text: a text :return: a dictionary with two keys – name, freq """ + if not isinstance(language, str) or not isinstance(text, str): + return None + + tokens = tokenize(text) + frequency = calculate_frequencies(tokens) + lang_profile = {'name': language, 'freq': frequency} + + return lang_profile def calculate_mse(predicted: list, actual: list) -> float | None: diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 2a6c1a3ba..2ea1ae4f7 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -13,14 +13,14 @@ def main() -> None: en_text = file_to_read_en.read() en_tokens = lab_1_classify_by_unigrams.main.tokenize(en_text) print(en_tokens) - create_language_profile = lab_1_classify_by_unigrams.main.calculate_frequencies(en_tokens) - print(create_language_profile) + language_profile = lab_1_classify_by_unigrams.main.create_language_profile('en', en_text) + print(language_profile) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() - de_tokens = lab_1_classify_by_unigrams.main.tokenize(de_text) + language_profile = lab_1_classify_by_unigrams.main.create_language_profile('de', de_text) with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() - unknown_tokens = lab_1_classify_by_unigrams.main.tokenize(unknown_text) + language_profile = lab_1_classify_by_unigrams.main.create_language_profile('unk', unknown_text) #result = None #assert result, "Detection result is None" From fc862ffbaac60d6a866b18a0c2dfc9bd298d4d19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 1 Oct 2023 22:11:10 +0300 Subject: [PATCH 11/68] calculate_frequencies fixed --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index b8c03ef43..4030dd084 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -37,7 +37,7 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: for token in tokens: if token not in list_of_tokens: list_of_tokens += token - frequency = list_of_tokens.count(token)/len(tokens) + frequency = tokens.count(token)/len(tokens) dict_of_frequencies[token] = frequency return dict_of_frequencies From b3a235928ea1ca87aa7529c9ecd99ccc12ca985b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Mon, 2 Oct 2023 23:34:58 +0300 Subject: [PATCH 12/68] mse calculated --- lab_1_classify_by_unigrams/main.py | 19 +++++++++++++++++++ lab_1_classify_by_unigrams/target_score.txt | 2 
+- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 4030dd084..47b97e1d4 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -67,6 +67,17 @@ def calculate_mse(predicted: list, actual: list) -> float | None: :param actual: a list of actual values :return: the score """ + if (len(predicted) != len(actual) or + not isinstance(predicted, list) or + not isinstance(actual, list)): + return None + + sum_mse = 0 + for i in range(len(actual)): + sum_mse += (actual[i]-predicted[i]) ** 2 + mse = sum_mse/len(actual) + + return mse def compare_profiles( @@ -79,6 +90,14 @@ def compare_profiles( :param profile_to_compare: a dictionary of a profile to compare the unknown profile to :return: the distance between the profiles """ + if (not isinstance(unknown_profile, dict) or + not isinstance(profile_to_compare, dict)): + return None + if ('name' not in unknown_profile or + 'freq' not in unknown_profile or + 'name' not in profile_to_compare or + 'freq' not in profile_to_compare): + return None def detect_language( diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt index 62f945751..301160a93 100644 --- a/lab_1_classify_by_unigrams/target_score.txt +++ b/lab_1_classify_by_unigrams/target_score.txt @@ -1 +1 @@ -6 \ No newline at end of file +8 \ No newline at end of file From c7fdd72747146e40ca9004de65dcb338f5e63423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Mon, 2 Oct 2023 23:44:50 +0300 Subject: [PATCH 13/68] corrected conditions in compare_profiles --- lab_1_classify_by_unigrams/main.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 47b97e1d4..b8dc1cef5 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -93,10 +93,8 @@ def compare_profiles( if (not isinstance(unknown_profile, dict) or not isinstance(profile_to_compare, dict)): return None - if ('name' not in unknown_profile or - 'freq' not in unknown_profile or - 'name' not in profile_to_compare or - 'freq' not in profile_to_compare): + if (('name' or 'freq') not in unknown_profile or + ('name' or 'freq') not in profile_to_compare): return None From 4e69cd793edb4c6ee2f3847ac34fa7d986781f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 3 Oct 2023 22:23:57 +0300 Subject: [PATCH 14/68] mentor's corrections fixed and some things changed --- lab_1_classify_by_unigrams/main.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index b8dc1cef5..d8011a4db 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -17,8 +17,7 @@ def tokenize(text: str) -> list[str] | None: list_of_tokens = "" for token in text: if token.isalpha(): - new_token = token.lower() - list_of_tokens += new_token + list_of_tokens += token.lower() tokens = list(list_of_tokens) return tokens @@ -29,7 +28,7 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: :param tokens: a list of tokens :return: a dictionary with frequencies """ - if not isinstance(tokens, list) or not all(isinstance(letter, str) for 
letter in tokens): + if not isinstance(tokens, list) or not all(isinstance(token, str) for token in tokens): return None list_of_tokens = "" @@ -53,9 +52,7 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st if not isinstance(language, str) or not isinstance(text, str): return None - tokens = tokenize(text) - frequency = calculate_frequencies(tokens) - lang_profile = {'name': language, 'freq': frequency} + lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} return lang_profile @@ -73,11 +70,10 @@ def calculate_mse(predicted: list, actual: list) -> float | None: return None sum_mse = 0 - for i in range(len(actual)): - sum_mse += (actual[i]-predicted[i]) ** 2 - mse = sum_mse/len(actual) + for i, act_value in enumerate(actual): + sum_mse += (act_value-predicted[i]) ** 2 - return mse + return sum_mse/len(actual) def compare_profiles( From 573c0875695e2fa5087ed6c1013aa2be408e734b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 3 Oct 2023 22:35:50 +0300 Subject: [PATCH 15/68] trying to fix tests --- lab_1_classify_by_unigrams/main.py | 7 ++----- lab_1_classify_by_unigrams/target_score.txt | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index d8011a4db..c66f96cff 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -18,9 +18,8 @@ def tokenize(text: str) -> list[str] | None: for token in text: if token.isalpha(): list_of_tokens += token.lower() - tokens = list(list_of_tokens) - return tokens + return list(list_of_tokens) def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ @@ -52,9 +51,7 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st if not isinstance(language, str) or not isinstance(text, str): return None - lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} - - return lang_profile + return {'name': language, 'freq': calculate_frequencies(tokenize(text))} def calculate_mse(predicted: list, actual: list) -> float | None: diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt index 301160a93..62f945751 100644 --- a/lab_1_classify_by_unigrams/target_score.txt +++ b/lab_1_classify_by_unigrams/target_score.txt @@ -1 +1 @@ -8 \ No newline at end of file +6 \ No newline at end of file From 953323bee557a0f505ab1badd9aa88bc7b5dbdfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 5 Oct 2023 18:16:00 +0300 Subject: [PATCH 16/68] tokenize simplified --- lab_1_classify_by_unigrams/main.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index c66f96cff..369cb658b 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -14,12 +14,9 @@ def tokenize(text: str) -> list[str] | None: if not isinstance(text, str): return None - list_of_tokens = "" - for token in text: - if token.isalpha(): - list_of_tokens += token.lower() + list_of_tokens = [token.lower() for token in text if token.isalpha()] - return list(list_of_tokens) + return list_of_tokens def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ From 
01ba88ea816547d0ceb79e96fc99de5aef1a0195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 5 Oct 2023 18:32:48 +0300 Subject: [PATCH 17/68] added spaces --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 369cb658b..96a9ef107 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -67,7 +67,7 @@ def calculate_mse(predicted: list, actual: list) -> float | None: for i, act_value in enumerate(actual): sum_mse += (act_value-predicted[i]) ** 2 - return sum_mse/len(actual) + return sum_mse / len(actual) def compare_profiles( From 965b5e0779525cc8e49d9682bfb06fe917d472ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 5 Oct 2023 18:35:10 +0300 Subject: [PATCH 18/68] added spaces --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 96a9ef107..369cb658b 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -67,7 +67,7 @@ def calculate_mse(predicted: list, actual: list) -> float | None: for i, act_value in enumerate(actual): sum_mse += (act_value-predicted[i]) ** 2 - return sum_mse / len(actual) + return sum_mse/len(actual) def compare_profiles( From 74636cd10489ffe55c2d2b6fdffc5e39b31da509 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 5 Oct 2023 18:35:24 +0300 Subject: [PATCH 19/68] added spaces --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 369cb658b..96a9ef107 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -67,7 +67,7 @@ def calculate_mse(predicted: list, actual: list) -> float | None: for i, act_value in enumerate(actual): sum_mse += (act_value-predicted[i]) ** 2 - return sum_mse/len(actual) + return sum_mse / len(actual) def compare_profiles( From 5a089b29d6d24a5cc961af76f167ad29dd49901c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 6 Oct 2023 12:23:27 +0300 Subject: [PATCH 20/68] FIXED CALCULATE_FREQUENCIES --- lab_1_classify_by_unigrams/main.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 96a9ef107..c4184b69c 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -27,11 +27,8 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: if not isinstance(tokens, list) or not all(isinstance(token, str) for token in tokens): return None - list_of_tokens = "" dict_of_frequencies = {} for token in tokens: - if token not in list_of_tokens: - list_of_tokens += token frequency = tokens.count(token)/len(tokens) dict_of_frequencies[token] = frequency From c049b1f78785d440846bc51da0b5ef51da55a620 Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Wed, 11 Oct 2023 11:01:19 +0300 Subject: [PATCH 21/68] checkout labs from the origin repository --- 
lab_1_classify_by_unigrams/main.py | 3 --- lab_1_classify_by_unigrams/start.py | 5 ----- 2 files changed, 8 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 20553c73b..a2d5744f9 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -17,9 +17,6 @@ def tokenize(text: str) -> list[str] | None: return [token.lower() for token in text if token.isalpha()] - list_of_tokens = [token.lower() for token in text if token.isalpha()] - - return list_of_tokens def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 70a5d5613..4a17442d0 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -12,13 +12,8 @@ def main() -> None: """ with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: en_text = file_to_read_en.read() - en_tokens = lab_1_classify_by_unigrams.main.tokenize(en_text) - print(en_tokens) - language_profile = lab_1_classify_by_unigrams.main.create_language_profile('en', en_text) - print(language_profile) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() - language_profile = lab_1_classify_by_unigrams.main.create_language_profile('de', de_text) with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() From 16ed42b83f26e6f146b5cc6442918b76a52829ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 18 Oct 2023 15:18:55 +0300 Subject: [PATCH 22/68] code for 4 --- lab_2_tokenize_by_bpe/main.py | 28 +++++++++++++++++++++++++- lab_2_tokenize_by_bpe/start.py | 5 +++-- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 620a4d645..8068fce6e 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -14,7 +14,18 @@ def prepare_word( :param end_of_word: a token that signifies the end of word :return: preprocessed word """ - + if (not isinstance(raw_word, str) or + not (isinstance(start_of_word, str) or start_of_word is None) or + not (isinstance(end_of_word, str) or end_of_word is None)): + return None + + if not start_of_word and not end_of_word: + return tuple(list(raw_word)) + if not end_of_word: + return tuple([start_of_word] + list(raw_word)) + if not start_of_word: + return tuple(list(raw_word) + [end_of_word]) + return tuple([start_of_word] + list(raw_word) + [end_of_word]) def collect_frequencies( text: str, start_of_word: str | None, end_of_word: str @@ -26,6 +37,21 @@ def collect_frequencies( :param end_of_word: a token that signifies the end of word :return: dictionary in the form of """ + if (not isinstance(text, str) or + not (isinstance(start_of_word, str) or start_of_word is None) or + not isinstance(end_of_word, str)): + return None + + dict_of_freq = {} + words = text.split() + for word in words: + prepr_word = prepare_word(word, None, '') + if prepr_word is None: + return None + if prepr_word not in dict_of_freq: + dict_of_freq[prepr_word] = words.count(word) + + return dict_of_freq def count_tokens_pairs( diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 798e957e0..04df67fb7 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ 
b/lab_2_tokenize_by_bpe/start.py @@ -2,6 +2,7 @@ BPE Tokenizer starter """ from pathlib import Path +import lab_2_tokenize_by_bpe.main def main() -> None: @@ -12,8 +13,8 @@ def main() -> None: with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - result = None - assert result, "Encoding is not working" + result = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') + #assert result, "Encoding is not working" if __name__ == "__main__": diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index 573541ac9..b8626c4cf 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -0 +4 From 62fc719bb4113730683d92bf5fd334d9238e465b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 25 Oct 2023 14:26:22 +0300 Subject: [PATCH 23/68] fixed mentor's comments --- lab_2_tokenize_by_bpe/main.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 8068fce6e..166c43495 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -14,9 +14,11 @@ def prepare_word( :param end_of_word: a token that signifies the end of word :return: preprocessed word """ - if (not isinstance(raw_word, str) or - not (isinstance(start_of_word, str) or start_of_word is None) or - not (isinstance(end_of_word, str) or end_of_word is None)): + if not isinstance(raw_word, str): + return None + if not isinstance(start_of_word, str) and start_of_word is not None: + return None + if not isinstance(end_of_word, str) or end_of_word is not None: return None if not start_of_word and not end_of_word: @@ -44,12 +46,11 @@ def collect_frequencies( dict_of_freq = {} words = text.split() - for word in words: - prepr_word = prepare_word(word, None, '') + for word in set(words): + prepr_word = prepare_word(word, start_of_word, end_of_word) if prepr_word is None: return None - if prepr_word not in dict_of_freq: - dict_of_freq[prepr_word] = words.count(word) + dict_of_freq[prepr_word] = words.count(word) return dict_of_freq @@ -62,6 +63,10 @@ def count_tokens_pairs( :param word_frequencies: dictionary in the form of :return: dictionary in the form of """ + if not isinstance(word_frequencies, dict): + return None + + def merge_tokens( From aaa23f39804e85b1f3c4947c1304d8885834e09d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 26 Oct 2023 01:20:04 +0300 Subject: [PATCH 24/68] fixed unittests --- lab_2_tokenize_by_bpe/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 166c43495..e3ffe07c8 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -18,7 +18,7 @@ def prepare_word( return None if not isinstance(start_of_word, str) and start_of_word is not None: return None - if not isinstance(end_of_word, str) or end_of_word is not None: + if not isinstance(end_of_word, str) and end_of_word is not None: return None if not start_of_word and not end_of_word: From 2083fa54322b7dd96a8eb5d84b4d729e2d4b6c97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Mon, 30 Oct 2023 21:07:53 +0300 Subject: 
[PATCH 25/68] code for 6 --- lab_2_tokenize_by_bpe/main.py | 55 ++++++++++++++++++++++++++++++++++ lab_2_tokenize_by_bpe/start.py | 7 +++-- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index e3ffe07c8..ba4d17b63 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -66,6 +66,18 @@ def count_tokens_pairs( if not isinstance(word_frequencies, dict): return None + result_dic = {} + for pair in word_frequencies.items(): + word = pair[0] + count = pair[1] + for i in range(len(word) - 1): + token1 = word[i] + token2 = word[i + 1] + if not result_dic.get((token1, token2)): + result_dic[(token1, token2)] = 0 + result_dic[(token1, token2)] += count + + return result_dic @@ -78,6 +90,23 @@ def merge_tokens( :param pair: a pair of tokens to be merged :return: dictionary in the form of """ + if (not isinstance(word_frequencies, dict) or + not isinstance(pair, tuple)): + return None + + new_word_freq = {} + for pairs in word_frequencies.items(): + word = pairs[0] + count = pairs[1] + new_word = [] + for i in range(len(word) - 1): + if word[i] == pair[0] and word[i + 1] == pair[1]: + new_word.append((pair[0] + pair[1])) + else: + new_word.append(word[i]) + new_word_freq[tuple(new_word)] = count + + return new_word_freq def train( @@ -89,6 +118,32 @@ def train( :param num_merges: required number of new tokens :return: dictionary in the form of """ + if (not isinstance(word_frequencies, dict) or + not isinstance(num_merges, int)): + return None + + dict_pairs = count_tokens_pairs(word_frequencies) + if dict_pairs is None: + return None + num_merges = min(num_merges, len(dict_pairs)) + + for iteration in range(num_merges): + max_value = max(dict_pairs.values()) + value_list = [key for key, value in dict_pairs.items() if value == max_value] + + max_len = max(len(''.join(pair)) for pair in value_list) + len_list = [pair for pair in value_list if len(''.join(pair)) == max_len] + + word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) + if word_frequencies is None: + return None + + #dict_pairs.pop(sorted(len_list)[0]) + dict_pairs = count_tokens_pairs(word_frequencies) + if dict_pairs is None: + return None + + return word_frequencies def get_vocabulary( diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 04df67fb7..b8fe5f04a 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -13,8 +13,11 @@ def main() -> None: with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - result = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') - #assert result, "Encoding is not working" + word_freq = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') + print(lab_2_tokenize_by_bpe.main.train(word_freq, 100)) + + # result = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') + # assert result, "Encoding is not working" if __name__ == "__main__": From fab91f409c24d075e7dfdd495c9e999f38ec1104 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Mon, 30 Oct 2023 21:17:38 +0300 Subject: [PATCH 26/68] forgot to change the score meow --- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index b8626c4cf..1e8b31496 100644 --- 
a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -4 +6 From 4ff90c203165f41bdf02880184ebc412b256bd87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Mon, 30 Oct 2023 21:28:34 +0300 Subject: [PATCH 27/68] fixed style i hope --- lab_2_tokenize_by_bpe/start.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index b8fe5f04a..a7921e08d 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -2,7 +2,7 @@ BPE Tokenizer starter """ from pathlib import Path -import lab_2_tokenize_by_bpe.main +import lab_2_tokenize_by_bpe.main as main_file def main() -> None: @@ -13,8 +13,8 @@ def main() -> None: with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - word_freq = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') - print(lab_2_tokenize_by_bpe.main.train(word_freq, 100)) + word_freq = main_file.collect_frequencies(text, None, '') + print(main_file.train(word_freq, 100)) # result = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') # assert result, "Encoding is not working" From 9ecb29a766cc40c1120fa0468aa08c07db7931b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Mon, 30 Oct 2023 21:36:02 +0300 Subject: [PATCH 28/68] fixed style i hope [2] --- lab_2_tokenize_by_bpe/start.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index a7921e08d..fc080fa50 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -2,6 +2,7 @@ BPE Tokenizer starter """ from pathlib import Path + import lab_2_tokenize_by_bpe.main as main_file From 870ab1b8b35431039d0fe87b31d5547a25f6a9e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 31 Oct 2023 19:42:59 +0300 Subject: [PATCH 29/68] fixed part of comments --- lab_2_tokenize_by_bpe/main.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index ba4d17b63..cc3a8b067 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -21,7 +21,7 @@ def prepare_word( if not isinstance(end_of_word, str) and end_of_word is not None: return None - if not start_of_word and not end_of_word: + if not (start_of_word and end_of_word): return tuple(list(raw_word)) if not end_of_word: return tuple([start_of_word] + list(raw_word)) @@ -67,15 +67,13 @@ def count_tokens_pairs( return None result_dic = {} - for pair in word_frequencies.items(): - word = pair[0] - count = pair[1] + for word, freq in word_frequencies.items(): for i in range(len(word) - 1): token1 = word[i] token2 = word[i + 1] if not result_dic.get((token1, token2)): result_dic[(token1, token2)] = 0 - result_dic[(token1, token2)] += count + result_dic[(token1, token2)] += freq return result_dic @@ -95,16 +93,14 @@ def merge_tokens( return None new_word_freq = {} - for pairs in word_frequencies.items(): - word = pairs[0] - count = pairs[1] + for word, freq in word_frequencies.items(): new_word = [] for i in range(len(word) - 1): if word[i] == pair[0] and word[i + 1] == pair[1]: new_word.append((pair[0] + pair[1])) 
else: new_word.append(word[i]) - new_word_freq[tuple(new_word)] = count + new_word_freq[tuple(new_word)] = freq return new_word_freq From 272d46779c16ed2b4a32fb60ddc4de92113bca73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 1 Nov 2023 23:51:16 +0300 Subject: [PATCH 30/68] trying to fix all............. --- lab_2_tokenize_by_bpe/main.py | 69 ++++++++++++++++++---------------- lab_2_tokenize_by_bpe/start.py | 2 +- 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index cc3a8b067..09b8d0857 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -21,13 +21,13 @@ def prepare_word( if not isinstance(end_of_word, str) and end_of_word is not None: return None - if not (start_of_word and end_of_word): - return tuple(list(raw_word)) - if not end_of_word: - return tuple([start_of_word] + list(raw_word)) - if not start_of_word: - return tuple(list(raw_word) + [end_of_word]) - return tuple([start_of_word] + list(raw_word) + [end_of_word]) + tokens = [] + if start_of_word is not None: + tokens.append(start_of_word) + tokens.extend(list(raw_word)) + if end_of_word is not None: + tokens.append(end_of_word) + return tuple(tokens) def collect_frequencies( text: str, start_of_word: str | None, end_of_word: str @@ -46,11 +46,11 @@ def collect_frequencies( dict_of_freq = {} words = text.split() - for word in set(words): - prepr_word = prepare_word(word, start_of_word, end_of_word) - if prepr_word is None: + prepr_words = [prepare_word(word, start_of_word, end_of_word) for word in words] + for word in set(prepr_words): + if word is None: return None - dict_of_freq[prepr_word] = words.count(word) + dict_of_freq[word] = prepr_words.count(word) return dict_of_freq @@ -67,13 +67,13 @@ def count_tokens_pairs( return None result_dic = {} - for word, freq in word_frequencies.items(): + for word in word_frequencies: for i in range(len(word) - 1): token1 = word[i] token2 = word[i + 1] if not result_dic.get((token1, token2)): result_dic[(token1, token2)] = 0 - result_dic[(token1, token2)] += freq + result_dic[(token1, token2)] += word_frequencies[word] return result_dic @@ -92,15 +92,20 @@ def merge_tokens( not isinstance(pair, tuple)): return None - new_word_freq = {} + new_word_freq = word_frequencies.copy() for word, freq in word_frequencies.items(): - new_word = [] - for i in range(len(word) - 1): - if word[i] == pair[0] and word[i + 1] == pair[1]: - new_word.append((pair[0] + pair[1])) - else: - new_word.append(word[i]) - new_word_freq[tuple(new_word)] = freq + if pair[0] and pair[1] in word: + new_word = [] + for i in range(len(word) - 1): + if word[i] == pair[1] and word[i - 1] == pair[0]: + pass + elif word[i] == pair[0] and word[i + 1] == pair[1]: + new_word.append((pair[0] + pair[1])) + else: + new_word.append(word[i]) + + value = new_word_freq.pop(word) + new_word_freq[tuple(new_word)] = value return new_word_freq @@ -124,20 +129,20 @@ def train( num_merges = min(num_merges, len(dict_pairs)) for iteration in range(num_merges): - max_value = max(dict_pairs.values()) - value_list = [key for key, value in dict_pairs.items() if value == max_value] + if dict_pairs != {}: + max_value = max(dict_pairs.values()) + value_list = [key for key, value in dict_pairs.items() if value == max_value] - max_len = max(len(''.join(pair)) for pair in value_list) - len_list = [pair for pair in value_list if len(''.join(pair)) == 
max_len] + max_len = max(len(''.join(pair)) for pair in value_list) + len_list = [pair for pair in value_list if len(''.join(pair)) == max_len] - word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) - if word_frequencies is None: - return None + word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) + if word_frequencies is None: + return None - #dict_pairs.pop(sorted(len_list)[0]) - dict_pairs = count_tokens_pairs(word_frequencies) - if dict_pairs is None: - return None + dict_pairs = count_tokens_pairs(word_frequencies) + if dict_pairs is None: + return None return word_frequencies diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index fc080fa50..33299b934 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -17,7 +17,7 @@ def main() -> None: word_freq = main_file.collect_frequencies(text, None, '') print(main_file.train(word_freq, 100)) - # result = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') + # result = main_file.train(word_freq, 100) # assert result, "Encoding is not working" From 82363ad60dc6a4f238fabb2d4bddf42121983d5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 2 Nov 2023 12:31:02 +0300 Subject: [PATCH 31/68] trying to fix all............. --- lab_2_tokenize_by_bpe/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 09b8d0857..896520443 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -93,9 +93,9 @@ def merge_tokens( return None new_word_freq = word_frequencies.copy() - for word, freq in word_frequencies.items(): - if pair[0] and pair[1] in word: - new_word = [] + for word in word_frequencies: + new_word = [] + if pair[0] in word and pair[1] in word: for i in range(len(word) - 1): if word[i] == pair[1] and word[i - 1] == pair[0]: pass From 454a136f63eeedd6527979193e7baff7281c0536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 3 Nov 2023 00:43:45 +0300 Subject: [PATCH 32/68] fixed merge tokens --- lab_2_tokenize_by_bpe/main.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 896520443..bf22fffe9 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -69,11 +69,10 @@ def count_tokens_pairs( result_dic = {} for word in word_frequencies: for i in range(len(word) - 1): - token1 = word[i] - token2 = word[i + 1] - if not result_dic.get((token1, token2)): - result_dic[(token1, token2)] = 0 - result_dic[(token1, token2)] += word_frequencies[word] + pair = word[i:i + 2] + if not result_dic.get(pair): + result_dic[pair] = 0 + result_dic[pair] += word_frequencies[word] return result_dic @@ -92,20 +91,17 @@ def merge_tokens( not isinstance(pair, tuple)): return None - new_word_freq = word_frequencies.copy() + new_word_freq = {} for word in word_frequencies: - new_word = [] + new_word = list(word) if pair[0] in word and pair[1] in word: for i in range(len(word) - 1): - if word[i] == pair[1] and word[i - 1] == pair[0]: - pass - elif word[i] == pair[0] and word[i + 1] == pair[1]: - new_word.append((pair[0] + pair[1])) - else: - new_word.append(word[i]) - - value = new_word_freq.pop(word) - 
new_word_freq[tuple(new_word)] = value + current_pair = tuple([word[i], word[i+1]]) + if current_pair == pair: + new_word.pop(i+1) + new_word[i] = pair[0] + pair[1] + + new_word_freq[tuple(new_word)] = word_frequencies[word] return new_word_freq From 4dd99dad76d229796a767fb4d450f59cccf7fe65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 3 Nov 2023 00:49:28 +0300 Subject: [PATCH 33/68] Artem Mikhailovich do not worry please --- lab_2_tokenize_by_bpe/start.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 33299b934..77333bbf1 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -15,10 +15,9 @@ def main() -> None: text = text_file.read() word_freq = main_file.collect_frequencies(text, None, '') - print(main_file.train(word_freq, 100)) - # result = main_file.train(word_freq, 100) - # assert result, "Encoding is not working" + result = main_file.train(word_freq, 100) + assert result, "Encoding is not working" if __name__ == "__main__": From f6e49a98c5034dd6ac767a6f01ad18395621e099 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 3 Nov 2023 00:55:17 +0300 Subject: [PATCH 34/68] no more bad check --- lab_2_tokenize_by_bpe/main.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index bf22fffe9..47d23bc8a 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -125,20 +125,19 @@ def train( num_merges = min(num_merges, len(dict_pairs)) for iteration in range(num_merges): - if dict_pairs != {}: - max_value = max(dict_pairs.values()) - value_list = [key for key, value in dict_pairs.items() if value == max_value] + max_value = max(dict_pairs.values()) + value_list = [key for key, value in dict_pairs.items() if value == max_value] - max_len = max(len(''.join(pair)) for pair in value_list) - len_list = [pair for pair in value_list if len(''.join(pair)) == max_len] + max_len = max(len(''.join(pair)) for pair in value_list) + len_list = [pair for pair in value_list if len(''.join(pair)) == max_len] - word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) - if word_frequencies is None: - return None + word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) + if word_frequencies is None: + return None - dict_pairs = count_tokens_pairs(word_frequencies) - if dict_pairs is None: - return None + dict_pairs = count_tokens_pairs(word_frequencies) + if dict_pairs is None: + return None return word_frequencies From 1cfb086ec0ade7a64d552968c7955c5c1963fa04 Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Fri, 3 Nov 2023 17:12:59 +0300 Subject: [PATCH 35/68] checkout labs from the origin repository --- lab_2_tokenize_by_bpe/main.py | 253 +++++-------------------- lab_2_tokenize_by_bpe/start.py | 36 +--- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 3 files changed, 55 insertions(+), 236 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 213e455f6..47d23bc8a 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -2,8 +2,6 @@ Lab 2 BPE and machine translation evaluation """ -import json -import math def prepare_word( @@ -16,16 +14,12 @@ def prepare_word( :param 
end_of_word: a token that signifies the end of word :return: preprocessed word """ - if not isinstance(raw_word, str) or not (isinstance( - start_of_word, str) or start_of_word is None) or not ( - isinstance(end_of_word, str) or end_of_word is None): + if not isinstance(raw_word, str): + return None + if not isinstance(start_of_word, str) and start_of_word is not None: + return None + if not isinstance(end_of_word, str) and end_of_word is not None: return None - list_of_tokens = list(raw_word) - if end_of_word: - list_of_tokens.append(end_of_word) - if start_of_word: - list_of_tokens.insert(0, start_of_word) - return tuple(list_of_tokens) tokens = [] if start_of_word is not None: @@ -45,20 +39,20 @@ def collect_frequencies( :param end_of_word: a token that signifies the end of word :return: dictionary in the form of """ - if not isinstance(text, str) or not isinstance(end_of_word, str) or not ( - isinstance(start_of_word, str) or start_of_word is None): + if (not isinstance(text, str) or + not (isinstance(start_of_word, str) or start_of_word is None) or + not isinstance(end_of_word, str)): return None - dict_frequencies = {} - - splitted_text = text.split() - for i in set(splitted_text): - word = prepare_word(i, start_of_word, end_of_word) - if not word: + dict_of_freq = {} + words = text.split() + prepr_words = [prepare_word(word, start_of_word, end_of_word) for word in words] + for word in set(prepr_words): + if word is None: return None - dict_frequencies[word] = splitted_text.count(i) + dict_of_freq[word] = prepr_words.count(word) - return dict_frequencies + return dict_of_freq def count_tokens_pairs( @@ -72,16 +66,16 @@ def count_tokens_pairs( if not isinstance(word_frequencies, dict): return None - dict_with_pairs = {} - + result_dic = {} for word in word_frequencies: - for index in range(len(word) - 1): - pair = (word[index], word[index + 1]) - if pair not in dict_with_pairs: - dict_with_pairs[pair] = 0 - dict_with_pairs[pair] += word_frequencies[word] + for i in range(len(word) - 1): + pair = word[i:i + 2] + if not result_dic.get(pair): + result_dic[pair] = 0 + result_dic[pair] += word_frequencies[word] + + return result_dic - return dict_with_pairs def merge_tokens( @@ -93,24 +87,23 @@ def merge_tokens( :param pair: a pair of tokens to be merged :return: dictionary in the form of """ - if not isinstance(word_frequencies, dict) or not isinstance(pair, tuple): + if (not isinstance(word_frequencies, dict) or + not isinstance(pair, tuple)): return None - dict_merged_tokens = {} - for i in word_frequencies: - list_word = list(i) - for index in range(len(list_word) - 1): - if (i[index], i[index + 1]) == pair: - list_word[index + 1] = pair[0] + pair[1] - list_word[index] = '' + new_word_freq = {} + for word in word_frequencies: + new_word = list(word) + if pair[0] in word and pair[1] in word: + for i in range(len(word) - 1): + current_pair = tuple([word[i], word[i+1]]) + if current_pair == pair: + new_word.pop(i+1) + new_word[i] = pair[0] + pair[1] - if '' in list_word: - list_word.remove('') - dict_merged_tokens.update({tuple(list_word): word_frequencies[i]}) - else: - dict_merged_tokens.update({i: word_frequencies[i]}) + new_word_freq[tuple(new_word)] = word_frequencies[word] - return dict_merged_tokens + return new_word_freq def train( @@ -122,31 +115,28 @@ def train( :param num_merges: required number of new tokens :return: dictionary in the form of """ - if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int): + if (not isinstance(word_frequencies, dict) or + 
not isinstance(num_merges, int)): return None - dict_with_pairs = count_tokens_pairs(word_frequencies) - if not dict_with_pairs: + dict_pairs = count_tokens_pairs(word_frequencies) + if dict_pairs is None: return None - merges = min(num_merges, len(dict_with_pairs)) - - for i in range(merges): + num_merges = min(num_merges, len(dict_pairs)) - max_values = max(dict_with_pairs.values()) - pairs_max_values = [i for i in dict_with_pairs if dict_with_pairs[i] == max_values] + for iteration in range(num_merges): + max_value = max(dict_pairs.values()) + value_list = [key for key, value in dict_pairs.items() if value == max_value] - max_len = max(len(str(pair)) for pair in pairs_max_values) - pairs_max_len = [i for i in pairs_max_values if len(str(i)) == max_len] + max_len = max(len(''.join(pair)) for pair in value_list) + len_list = [pair for pair in value_list if len(''.join(pair)) == max_len] - sorted_pairs = sorted(pairs_max_len) - word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0]) - - if not word_frequencies: + word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) + if word_frequencies is None: return None - dict_with_pairs = count_tokens_pairs(word_frequencies) - - if not dict_with_pairs: + dict_pairs = count_tokens_pairs(word_frequencies) + if dict_pairs is None: return None return word_frequencies @@ -161,26 +151,6 @@ def get_vocabulary( :param unknown_token: a token to signify an unknown token :return: dictionary in the form of """ - if not isinstance(word_frequencies, dict) or not isinstance(unknown_token, str): - return None - - dict_ident = {} - unique_tokens = set() - - for tuple_tokens in word_frequencies.keys(): - for word in tuple_tokens: - unique_tokens.update(tuple_tokens, word) - - unique_tokens.add(unknown_token) - lex_sorted = sorted(unique_tokens) - len_sorted = sorted(lex_sorted, key=len, reverse=True) - index = 0 - - for token in len_sorted: - dict_ident[token] = index - index += 1 - - return dict_ident def decode( @@ -193,20 +163,6 @@ def decode( :param end_of_word_token: an end-of-word token :return: decoded sequence """ - if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict) or not (isinstance( - end_of_word_token, str) or end_of_word_token is None): - return None - decoded = '' - for identifier in encoded_text: - token_list = [key for key in vocabulary if vocabulary[key] == identifier] - - for token in token_list: - decoded += token - - if end_of_word_token: - decoded = decoded.replace(end_of_word_token, ' ') - - return decoded def tokenize_word( @@ -220,27 +176,6 @@ def tokenize_word( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ - if not isinstance(word, tuple) or not isinstance(vocabulary, dict) or not (isinstance( - end_of_word, str) or end_of_word is None) or not isinstance(unknown_token, str): - return None - - word_copy = ''.join(word) - sorted_vocabulary = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x)) - result = [] - - for key in sorted_vocabulary: - while key in word_copy: - index = word_copy.count(' ', 0, word_copy.find(key)) - result.insert(index, vocabulary[key]) - word_copy = word_copy.replace(key, ' ', 1) - - for unk in word_copy: - if unk != ' ': - index = word_copy.find(unk) - word_copy = word_copy.replace(unk, ' ') - result.insert(index, vocabulary[unknown_token]) - - return result def load_vocabulary(vocab_path: str) -> dict[str, int] | None: @@ -249,16 +184,6 @@ def load_vocabulary(vocab_path: str) -> dict[str, int] | None: :param 
vocab_path: path to the saved vocabulary :return: dictionary in the form of """ - if not isinstance(vocab_path, str): - return None - - with open(vocab_path, 'r', encoding='utf-8') as f: - vocab = json.load(f) - - if not isinstance(vocab, dict): - return None - - return vocab def encode( @@ -277,26 +202,6 @@ def encode( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ - if not isinstance(original_text, str) or not isinstance( - vocabulary, dict) or not (isinstance( - start_of_word_token, str) or start_of_word_token is None) or not (isinstance( - end_of_word_token, str) or end_of_word_token is None) or not isinstance( - unknown_token, str): - return None - - encoded = [] - split_text = original_text.split() - - for word in split_text: - prepared = prepare_word(word, start_of_word_token, end_of_word_token) - if not prepared: - return None - result = tokenize_word(prepared, vocabulary, end_of_word_token, unknown_token) - if not result: - return None - encoded.extend(result) - - return encoded def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: @@ -306,14 +211,6 @@ def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: :param order: required number of elements in a single n-gram :return: sequence of n-grams """ - if not isinstance(text, str) or not isinstance(order, int): - return None - - n_grams = [] - for index in range(len(text) + 1 - order): - n_grams.append(tuple(text[index: index + order])) - - return n_grams def calculate_precision( @@ -325,17 +222,6 @@ def calculate_precision( :param reference: expected sequence of n-grams :return: value of Precision metric """ - if not isinstance(actual, list) or not isinstance(reference, list): - return None - - unique_ngrams = set(reference) - matches = 0 - - for n_gram in unique_ngrams: - if n_gram in actual: - matches += 1 - - return matches / len(unique_ngrams) def geo_mean(precisions: list[float], max_order: int) -> float | None: @@ -345,17 +231,6 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None: :param max_order: maximum length of n-gram considered :return: value of geometric mean of Precision metric """ - if not isinstance(precisions, list) or not isinstance(max_order, int): - return None - - summation = float(0) - - for order in range(max_order): - if precisions[order] < 0: - return 0 - summation += math.log(precisions[order]) - - return math.exp(1 / max_order * summation) def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None: @@ -366,31 +241,3 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl :param max_order: max length of n-gram to consider for comparison :return: value of BLEU metric """ - if not isinstance(actual, str) or not isinstance( - reference, str) or max_order != 3: - return None - - actual_ngrams = [] - reference_ngrams = [] - - for order in range(max_order): - actual_ngram = collect_ngrams(actual, order + 1) - reference_ngram = collect_ngrams(reference, order + 1) - if actual_ngram is None or reference_ngram is None: - return None - actual_ngrams.append(actual_ngram) - reference_ngrams.append(reference_ngram) - - precisions = [] - - for i, j in zip(actual_ngrams, reference_ngrams): - precision = calculate_precision(i, j) - if precision is None: - return None - precisions.append(precision) - - average = geo_mean(precisions, max_order) - if average is None: - return None - - return average * 100 diff --git a/lab_2_tokenize_by_bpe/start.py 
b/lab_2_tokenize_by_bpe/start.py index d71b1c9c4..77333bbf1 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -1,11 +1,9 @@ """ BPE Tokenizer starter """ -import json from pathlib import Path -from lab_2_tokenize_by_bpe.main import (calculate_bleu, collect_frequencies, decode, encode, - get_vocabulary, train) +import lab_2_tokenize_by_bpe.main as main_file def main() -> None: @@ -15,37 +13,11 @@ def main() -> None: assets_path = Path(__file__).parent / 'assets' with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - with open(assets_path / 'secrets/secret_2.txt', 'r', encoding='utf-8') as text_file: - encoded_secret = text_file.read() - dict_frequencies = collect_frequencies(text, None, '') - merged_tokens = train(dict_frequencies, 100) - if merged_tokens: - vocabulary = get_vocabulary(merged_tokens, '') - secret = [int(num) for num in encoded_secret.split()] - result = decode(secret, vocabulary, '') - print(result) - assert result, "Encoding is not working" - with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file: - predicted = file.read() - with open(assets_path / 'vocab.json', 'r', encoding='utf-8') as file: - vocabulary = json.load(file) - with open(assets_path / 'for_translation_ru_encoded.txt', 'r', encoding='utf-8') as file: - actual = file.read() + word_freq = main_file.collect_frequencies(text, None, '') - if [int(token) for token in actual.split()] == encode( - predicted, vocabulary, '\u2581', None, ''): - print("Encoding is successful!") - - with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file: - encoded_en = file.read() - with open(assets_path / 'for_translation_en_raw.txt', 'r', encoding='utf-8') as file: - decoded_en = file.read() - - decoded = decode([int(num) for num in encoded_en.split()], vocabulary, None) - decoded = decoded.replace('\u2581', ' ') - - print(calculate_bleu(decoded, decoded_en)) + result = main_file.train(word_freq, 100) + assert result, "Encoding is not working" if __name__ == "__main__": diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index f599e28b8..1e8b31496 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -10 +6 From accf69ee5a029fa274ebb3412e2c982f585b26ca Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Fri, 3 Nov 2023 17:23:12 +0300 Subject: [PATCH 36/68] checkout labs from the origin repository --- lab_2_tokenize_by_bpe/main.py | 260 +++++++++++++++++++------ lab_2_tokenize_by_bpe/start.py | 36 +++- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 3 files changed, 236 insertions(+), 62 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 47d23bc8a..19a72913f 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -2,6 +2,8 @@ Lab 2 BPE and machine translation evaluation """ +import json +import math def prepare_word( @@ -14,20 +16,17 @@ def prepare_word( :param end_of_word: a token that signifies the end of word :return: preprocessed word """ - if not isinstance(raw_word, str): - return None - if not isinstance(start_of_word, str) and start_of_word is not None: - return None - if not isinstance(end_of_word, str) and end_of_word is not None: + if not isinstance(raw_word, str) or not (isinstance( + start_of_word, str) or start_of_word is None) or not ( + isinstance(end_of_word, str) or end_of_word is None): return None + list_of_tokens = 
list(raw_word) + if end_of_word: + list_of_tokens.append(end_of_word) + if start_of_word: + list_of_tokens.insert(0, start_of_word) + return tuple(list_of_tokens) - tokens = [] - if start_of_word is not None: - tokens.append(start_of_word) - tokens.extend(list(raw_word)) - if end_of_word is not None: - tokens.append(end_of_word) - return tuple(tokens) def collect_frequencies( text: str, start_of_word: str | None, end_of_word: str @@ -39,20 +38,20 @@ def collect_frequencies( :param end_of_word: a token that signifies the end of word :return: dictionary in the form of """ - if (not isinstance(text, str) or - not (isinstance(start_of_word, str) or start_of_word is None) or - not isinstance(end_of_word, str)): + if not isinstance(text, str) or not isinstance(end_of_word, str) or not ( + isinstance(start_of_word, str) or start_of_word is None): return None - dict_of_freq = {} - words = text.split() - prepr_words = [prepare_word(word, start_of_word, end_of_word) for word in words] - for word in set(prepr_words): - if word is None: + dict_frequencies = {} + + splitted_text = text.split() + for i in set(splitted_text): + word = prepare_word(i, start_of_word, end_of_word) + if not word: return None - dict_of_freq[word] = prepr_words.count(word) + dict_frequencies[word] = splitted_text.count(i) - return dict_of_freq + return dict_frequencies def count_tokens_pairs( @@ -66,16 +65,16 @@ def count_tokens_pairs( if not isinstance(word_frequencies, dict): return None - result_dic = {} - for word in word_frequencies: - for i in range(len(word) - 1): - pair = word[i:i + 2] - if not result_dic.get(pair): - result_dic[pair] = 0 - result_dic[pair] += word_frequencies[word] + dict_with_pairs = {} - return result_dic + for word in word_frequencies: + for index in range(len(word) - 1): + pair = (word[index], word[index + 1]) + if pair not in dict_with_pairs: + dict_with_pairs[pair] = 0 + dict_with_pairs[pair] += word_frequencies[word] + return dict_with_pairs def merge_tokens( @@ -87,23 +86,24 @@ def merge_tokens( :param pair: a pair of tokens to be merged :return: dictionary in the form of """ - if (not isinstance(word_frequencies, dict) or - not isinstance(pair, tuple)): + if not isinstance(word_frequencies, dict) or not isinstance(pair, tuple): return None + dict_merged_tokens = {} + for i in word_frequencies: + list_word = list(i) - new_word_freq = {} - for word in word_frequencies: - new_word = list(word) - if pair[0] in word and pair[1] in word: - for i in range(len(word) - 1): - current_pair = tuple([word[i], word[i+1]]) - if current_pair == pair: - new_word.pop(i+1) - new_word[i] = pair[0] + pair[1] + for index in range(len(list_word) - 1): + if (i[index], i[index + 1]) == pair: + list_word[index + 1] = pair[0] + pair[1] + list_word[index] = '' - new_word_freq[tuple(new_word)] = word_frequencies[word] + if '' in list_word: + list_word.remove('') + dict_merged_tokens.update({tuple(list_word): word_frequencies[i]}) + else: + dict_merged_tokens.update({i: word_frequencies[i]}) - return new_word_freq + return dict_merged_tokens def train( @@ -115,28 +115,31 @@ def train( :param num_merges: required number of new tokens :return: dictionary in the form of """ - if (not isinstance(word_frequencies, dict) or - not isinstance(num_merges, int)): + if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int): return None + dict_with_pairs = count_tokens_pairs(word_frequencies) - dict_pairs = count_tokens_pairs(word_frequencies) - if dict_pairs is None: + if not dict_with_pairs: return None - 
num_merges = min(num_merges, len(dict_pairs)) + merges = min(num_merges, len(dict_with_pairs)) + + for i in range(merges): - for iteration in range(num_merges): - max_value = max(dict_pairs.values()) - value_list = [key for key, value in dict_pairs.items() if value == max_value] + max_values = max(dict_with_pairs.values()) + pairs_max_values = [i for i in dict_with_pairs if dict_with_pairs[i] == max_values] - max_len = max(len(''.join(pair)) for pair in value_list) - len_list = [pair for pair in value_list if len(''.join(pair)) == max_len] + max_len = max(len(str(pair)) for pair in pairs_max_values) + pairs_max_len = [i for i in pairs_max_values if len(str(i)) == max_len] - word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) - if word_frequencies is None: + sorted_pairs = sorted(pairs_max_len) + word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0]) + + if not word_frequencies: return None - dict_pairs = count_tokens_pairs(word_frequencies) - if dict_pairs is None: + dict_with_pairs = count_tokens_pairs(word_frequencies) + + if not dict_with_pairs: return None return word_frequencies @@ -151,6 +154,26 @@ def get_vocabulary( :param unknown_token: a token to signify an unknown token :return: dictionary in the form of """ + if not isinstance(word_frequencies, dict) or not isinstance(unknown_token, str): + return None + + dict_ident = {} + unique_tokens = set() + + for tuple_tokens in word_frequencies.keys(): + for word in tuple_tokens: + unique_tokens.update(tuple_tokens, word) + + unique_tokens.add(unknown_token) + lex_sorted = sorted(unique_tokens) + len_sorted = sorted(lex_sorted, key=len, reverse=True) + index = 0 + + for token in len_sorted: + dict_ident[token] = index + index += 1 + + return dict_ident def decode( @@ -163,6 +186,20 @@ def decode( :param end_of_word_token: an end-of-word token :return: decoded sequence """ + if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict) or not (isinstance( + end_of_word_token, str) or end_of_word_token is None): + return None + decoded = '' + for identifier in encoded_text: + token_list = [key for key in vocabulary if vocabulary[key] == identifier] + + for token in token_list: + decoded += token + + if end_of_word_token: + decoded = decoded.replace(end_of_word_token, ' ') + + return decoded def tokenize_word( @@ -176,6 +213,27 @@ def tokenize_word( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ + if not isinstance(word, tuple) or not isinstance(vocabulary, dict) or not (isinstance( + end_of_word, str) or end_of_word is None) or not isinstance(unknown_token, str): + return None + + word_copy = ''.join(word) + sorted_vocabulary = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x)) + result = [] + + for key in sorted_vocabulary: + while key in word_copy: + index = word_copy.count(' ', 0, word_copy.find(key)) + result.insert(index, vocabulary[key]) + word_copy = word_copy.replace(key, ' ', 1) + + for unk in word_copy: + if unk != ' ': + index = word_copy.find(unk) + word_copy = word_copy.replace(unk, ' ') + result.insert(index, vocabulary[unknown_token]) + + return result def load_vocabulary(vocab_path: str) -> dict[str, int] | None: @@ -184,6 +242,16 @@ def load_vocabulary(vocab_path: str) -> dict[str, int] | None: :param vocab_path: path to the saved vocabulary :return: dictionary in the form of """ + if not isinstance(vocab_path, str): + return None + + with open(vocab_path, 'r', encoding='utf-8') as f: + vocab = json.load(f) + + if not 
isinstance(vocab, dict): + return None + + return vocab def encode( @@ -202,6 +270,26 @@ def encode( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ + if not isinstance(original_text, str) or not isinstance( + vocabulary, dict) or not (isinstance( + start_of_word_token, str) or start_of_word_token is None) or not (isinstance( + end_of_word_token, str) or end_of_word_token is None) or not isinstance( + unknown_token, str): + return None + + encoded = [] + split_text = original_text.split() + + for word in split_text: + prepared = prepare_word(word, start_of_word_token, end_of_word_token) + if not prepared: + return None + result = tokenize_word(prepared, vocabulary, end_of_word_token, unknown_token) + if not result: + return None + encoded.extend(result) + + return encoded def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: @@ -211,6 +299,14 @@ def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: :param order: required number of elements in a single n-gram :return: sequence of n-grams """ + if not isinstance(text, str) or not isinstance(order, int): + return None + + n_grams = [] + for index in range(len(text) + 1 - order): + n_grams.append(tuple(text[index: index + order])) + + return n_grams def calculate_precision( @@ -222,6 +318,17 @@ def calculate_precision( :param reference: expected sequence of n-grams :return: value of Precision metric """ + if not isinstance(actual, list) or not isinstance(reference, list): + return None + + unique_ngrams = set(reference) + matches = 0 + + for n_gram in unique_ngrams: + if n_gram in actual: + matches += 1 + + return matches / len(unique_ngrams) def geo_mean(precisions: list[float], max_order: int) -> float | None: @@ -231,6 +338,17 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None: :param max_order: maximum length of n-gram considered :return: value of geometric mean of Precision metric """ + if not isinstance(precisions, list) or not isinstance(max_order, int): + return None + + summation = float(0) + + for order in range(max_order): + if precisions[order] < 0: + return 0 + summation += math.log(precisions[order]) + + return math.exp(1 / max_order * summation) def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None: @@ -241,3 +359,31 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl :param max_order: max length of n-gram to consider for comparison :return: value of BLEU metric """ + if not isinstance(actual, str) or not isinstance( + reference, str) or max_order != 3: + return None + + actual_ngrams = [] + reference_ngrams = [] + + for order in range(max_order): + actual_ngram = collect_ngrams(actual, order + 1) + reference_ngram = collect_ngrams(reference, order + 1) + if actual_ngram is None or reference_ngram is None: + return None + actual_ngrams.append(actual_ngram) + reference_ngrams.append(reference_ngram) + + precisions = [] + + for i, j in zip(actual_ngrams, reference_ngrams): + precision = calculate_precision(i, j) + if precision is None: + return None + precisions.append(precision) + + average = geo_mean(precisions, max_order) + if average is None: + return None + + return average * 100 diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 77333bbf1..d71b1c9c4 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -1,9 +1,11 @@ """ BPE Tokenizer starter """ +import json from pathlib import Path 
-import lab_2_tokenize_by_bpe.main as main_file +from lab_2_tokenize_by_bpe.main import (calculate_bleu, collect_frequencies, decode, encode, + get_vocabulary, train) def main() -> None: @@ -13,11 +15,37 @@ def main() -> None: assets_path = Path(__file__).parent / 'assets' with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() + with open(assets_path / 'secrets/secret_2.txt', 'r', encoding='utf-8') as text_file: + encoded_secret = text_file.read() + dict_frequencies = collect_frequencies(text, None, '') + merged_tokens = train(dict_frequencies, 100) + if merged_tokens: + vocabulary = get_vocabulary(merged_tokens, '') + secret = [int(num) for num in encoded_secret.split()] + result = decode(secret, vocabulary, '') + print(result) + assert result, "Encoding is not working" - word_freq = main_file.collect_frequencies(text, None, '') + with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file: + predicted = file.read() + with open(assets_path / 'vocab.json', 'r', encoding='utf-8') as file: + vocabulary = json.load(file) + with open(assets_path / 'for_translation_ru_encoded.txt', 'r', encoding='utf-8') as file: + actual = file.read() - result = main_file.train(word_freq, 100) - assert result, "Encoding is not working" + if [int(token) for token in actual.split()] == encode( + predicted, vocabulary, '\u2581', None, ''): + print("Encoding is successful!") + + with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file: + encoded_en = file.read() + with open(assets_path / 'for_translation_en_raw.txt', 'r', encoding='utf-8') as file: + decoded_en = file.read() + + decoded = decode([int(num) for num in encoded_en.split()], vocabulary, None) + decoded = decoded.replace('\u2581', ' ') + + print(calculate_bleu(decoded, decoded_en)) if __name__ == "__main__": diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index 1e8b31496..f599e28b8 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -6 +10 From 5cd71b6e404f0e679c4f5214bc83b57e0c228488 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 15 Nov 2023 21:16:07 +0300 Subject: [PATCH 37/68] code for 4 --- lab_3_generate_by_ngrams/main.py | 96 +++++++++++++++++++++++ lab_3_generate_by_ngrams/start.py | 7 +- lab_3_generate_by_ngrams/target_score.txt | 2 +- 3 files changed, 103 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index dcf4e8af9..e909e7039 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -23,6 +23,8 @@ def __init__(self, end_of_word_token: str) -> None: Args: end_of_word_token (str): A token denoting word boundary """ + self._end_of_word_token = end_of_word_token + self._storage = {self._end_of_word_token: 0} def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: """ @@ -41,6 +43,22 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. 
""" + if not isinstance(text, str) or len(text) == 0: + return None + + tokenized_text = [] + for element in text.lower(): + if element.isalpha(): + tokenized_text.append(element) + elif element.isspace() and tokenized_text[-1] != self._end_of_word_token: + tokenized_text.append(self._end_of_word_token) + if not tokenized_text[-1].isalnum(): + tokenized_text.append(self._end_of_word_token) + + if len(tokenized_text) == 0: + return None + + return tuple(tokenized_text) def get_id(self, element: str) -> Optional[int]: """ @@ -55,6 +73,10 @@ def get_id(self, element: str) -> Optional[int]: In case of corrupt input arguments or arguments not included in storage, None is returned """ + if not isinstance(element, str) or element not in self._storage: + return None + + return self._storage[element] def get_end_of_word_token(self) -> str: """ @@ -63,6 +85,7 @@ def get_end_of_word_token(self) -> str: Returns: str: EoW token """ + return self._end_of_word_token def get_token(self, element_id: int) -> Optional[str]: """ @@ -76,6 +99,12 @@ def get_token(self, element_id: int) -> Optional[str]: In case of corrupt input arguments or arguments not included in storage, None is returned """ + if not isinstance(element_id, str) or element_id not in self._storage.values(): + return None + + for token, ident in self._storage.items(): + if element_id == ident: + return token def encode(self, text: str) -> Optional[tuple[int, ...]]: """ @@ -93,6 +122,26 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ + if not isinstance(text, str) or len(text) == 0: + return None + + tokenized_text = self._tokenize(text) + if tokenized_text is None: + return None + + for token in tokenized_text: + self._put(token) + if self._put(token) is None: + return None + + encoded_corpus = [] + for token in tokenized_text: + if self.get_id(token) is None: + return None + else: + encoded_corpus.append(self.get_id(token)) + + return tuple(encoded_corpus) def _put(self, element: str) -> None: """ @@ -104,6 +153,14 @@ def _put(self, element: str) -> None: In case of corrupt input arguments or invalid argument length, an element is not added to storage """ + if not isinstance(element, str) or len(element) != 1: + return None + + if element not in self._storage: + self._storage[element] = len(self._storage) + + return None + def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: """ @@ -121,6 +178,18 @@ def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ + if not isinstance(encoded_corpus, tuple) or len(encoded_corpus) == 0: + return None + + decoded_corpus = self._decode(encoded_corpus) + if decoded_corpus is None: + return None + + resulting_text = self._postprocess_decoded_text(decoded_corpus) + if resulting_text is None: + return None + + return resulting_text def fill_from_ngrams(self, content: dict) -> None: """ @@ -143,6 +212,18 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. 
""" + if not isinstance(corpus, tuple) or len(corpus) == 0: + return None + + decoded_corpus = [] + for ident in corpus: + if not isinstance(ident, int): + return None + if self.get_token(ident) is None: + return None + decoded_corpus.append(self.get_token(ident)) + + return tuple(decoded_corpus) def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional[str]: """ @@ -159,6 +240,21 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional In case of corrupt input arguments, None is returned """ + if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0: + return None + + resulting_text = "" + for token in decoded_corpus: + if decoded_corpus[0]: + resulting_text += token.upper() + elif token == self._end_of_word_token: + resulting_text += " " + else: + resulting_text += token + resulting_text.replace(resulting_text[-1], ".") + + return resulting_text + class NGramLanguageModel: diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index b9bcbd999..d51d8fb59 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -2,6 +2,8 @@ Generation by NGrams starter """ +import lab_3_generate_by_ngrams.main as main_py + def main() -> None: """ @@ -11,7 +13,10 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - result = None + text_processor = main_py.TextProcessor('_') + encoded_text = text_processor.encode(text) + decoded_text = text_processor.decode(encoded_text) + result = decoded_text assert result diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt index 573541ac9..b8626c4cf 100644 --- a/lab_3_generate_by_ngrams/target_score.txt +++ b/lab_3_generate_by_ngrams/target_score.txt @@ -1 +1 @@ -0 +4 From 79ff708844074417429ec0f747b7a7e04e01857c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sat, 18 Nov 2023 18:35:05 +0300 Subject: [PATCH 38/68] fixed tests --- lab_3_generate_by_ngrams/main.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index e909e7039..6e9cdadbe 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -23,7 +23,7 @@ def __init__(self, end_of_word_token: str) -> None: Args: end_of_word_token (str): A token denoting word boundary """ - self._end_of_word_token = end_of_word_token + self._end_of_word_token = end_of_word_token self._storage = {self._end_of_word_token: 0} def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: @@ -52,7 +52,7 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: tokenized_text.append(element) elif element.isspace() and tokenized_text[-1] != self._end_of_word_token: tokenized_text.append(self._end_of_word_token) - if not tokenized_text[-1].isalnum(): + if not text[-1].isalnum(): tokenized_text.append(self._end_of_word_token) if len(tokenized_text) == 0: @@ -99,7 +99,7 @@ def get_token(self, element_id: int) -> Optional[str]: In case of corrupt input arguments or arguments not included in storage, None is returned """ - if not isinstance(element_id, str) or element_id not in self._storage.values(): + if not isinstance(element_id, int) or element_id not in self._storage.values(): return None for token, ident in self._storage.items(): @@ -131,8 +131,6 @@ def encode(self, 
text: str) -> Optional[tuple[int, ...]]: for token in tokenized_text: self._put(token) - if self._put(token) is None: - return None encoded_corpus = [] for token in tokenized_text: @@ -244,14 +242,17 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional return None resulting_text = "" - for token in decoded_corpus: - if decoded_corpus[0]: + for index, token in enumerate(decoded_corpus): + if index == 0: resulting_text += token.upper() elif token == self._end_of_word_token: - resulting_text += " " + if index == len(decoded_corpus) - 1: + resulting_text += "." + else: + resulting_text += " " else: resulting_text += token - resulting_text.replace(resulting_text[-1], ".") + # resulting_text.replace(resulting_text[-1], ".") return resulting_text From 7908ca505032eb79d3c8ca7ad74aaf712f0cbbc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sat, 18 Nov 2023 21:24:21 +0300 Subject: [PATCH 39/68] code dor 6 --- lab_3_generate_by_ngrams/main.py | 71 +++++++++++++++++++++++ lab_3_generate_by_ngrams/start.py | 5 +- lab_3_generate_by_ngrams/target_score.txt | 2 +- 3 files changed, 76 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 6e9cdadbe..b7b5e99e6 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -276,6 +276,9 @@ def __init__(self, encoded_corpus: tuple | None, n_gram_size: int) -> None: encoded_corpus (tuple): Encoded text n_gram_size (int): A size of n-grams to use for language modelling """ + self._encoded_corpus = encoded_corpus + self._n_gram_size = n_gram_size + self._n_gram_frequencies = {} def get_n_gram_size(self) -> int: """ @@ -284,6 +287,7 @@ def get_n_gram_size(self) -> int: Returns: int: Size of stored n_grams """ + return self._n_gram_size def set_n_grams(self, frequencies: dict) -> None: """ @@ -305,6 +309,21 @@ def build(self) -> int: In case of corrupt input arguments or methods used return None, 1 is returned """ + if not isinstance(self._encoded_corpus, tuple) or len(self._encoded_corpus) == 0: + return 1 + + n_grams = self._extract_n_grams(self._encoded_corpus) + if not isinstance(n_grams, tuple) or n_grams is None: + return 1 + + for ngram in set(n_grams): + if not isinstance(ngram, tuple): + return 1 + p_w_1_2 = n_grams.count(ngram) + p_w_1 = len([context for context in n_grams if context[:-1] == ngram[:-1]]) + self._n_gram_frequencies[ngram] = p_w_1_2/p_w_1 + + return 0 def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: """ @@ -318,6 +337,17 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: In case of corrupt input arguments, None is returned """ + if not isinstance(sequence, tuple) or len(sequence) == 0 or len(sequence) < self._n_gram_size - 1: + return None + + possible_tokens = {} + + context = sequence[-(self._n_gram_size - 1)::] + for ngram in self._n_gram_frequencies: + if ngram[:self._n_gram_size - 1] == context: + possible_tokens[ngram[-1]] = self._n_gram_frequencies[ngram] + + return possible_tokens def _extract_n_grams( self, encoded_corpus: tuple[int, ...] 
@@ -333,6 +363,15 @@ def _extract_n_grams( In case of corrupt input arguments, None is returned """ + if not isinstance(encoded_corpus, tuple) or len(encoded_corpus) == 0: + return None + + n_grams = [] + for i in range(len(encoded_corpus) - self._n_gram_size + 1): + ngram = tuple(encoded_corpus[i: i + self._n_gram_size]) + n_grams.append(ngram) + + return tuple(n_grams) class GreedyTextGenerator: @@ -352,6 +391,8 @@ def __init__(self, language_model: NGramLanguageModel, text_processor: TextProce language_model (NGramLanguageModel): A language model to use for text generation text_processor (TextProcessor): A TextProcessor instance to handle text processing """ + self._model = language_model + self._text_processor = text_processor def run(self, seq_len: int, prompt: str) -> Optional[str]: """ @@ -367,6 +408,36 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ + if not isinstance(seq_len, int) or not isinstance(prompt, str) or len(prompt) == 0: + return None + + encoded_prompt = self._text_processor.encode(prompt) + if encoded_prompt is None: + return None + ngram_size = self._model.get_n_gram_size() + + text = prompt + + for i in range(seq_len): + tokens = self._model.generate_next_token(encoded_prompt[-ngram_size+1:]) + if tokens is None: + break + max_freq = max(tokens.values()) + max_candidates = [] + for candidate, freq in tokens.items(): + if freq == max_freq: + max_candidates.append(candidate) + encoded_prompt = encoded_prompt + (sorted(max_candidates)[0]) + best_candidate =self._text_processor.get_token(encoded_prompt[-1]) + if best_candidate is None: + return None + text += best_candidate + + decoded_prompt = self._text_processor.decode(encoded_prompt) + if decoded_prompt is None: + return None + + return decoded_prompt class BeamSearcher: diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index d51d8fb59..7c0f5dedd 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -16,7 +16,10 @@ def main() -> None: text_processor = main_py.TextProcessor('_') encoded_text = text_processor.encode(text) decoded_text = text_processor.decode(encoded_text) - result = decoded_text + language_model = main_py.NGramLanguageModel(encoded_text, 7) + greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) + generated_text = greedy_generator.run(51, 'Vernon') + result = generated_text assert result diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt index b8626c4cf..1e8b31496 100644 --- a/lab_3_generate_by_ngrams/target_score.txt +++ b/lab_3_generate_by_ngrams/target_score.txt @@ -1 +1 @@ -4 +6 From f47f868cbea42392c510e0920c915917b75ad5b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 19 Nov 2023 18:37:25 +0300 Subject: [PATCH 40/68] trying to fix and fix and fix....... --- lab_3_generate_by_ngrams/main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index b7b5e99e6..400b81136 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -421,6 +421,8 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: for i in range(seq_len): tokens = self._model.generate_next_token(encoded_prompt[-ngram_size+1:]) if tokens is None: + return prompt + "." 
+ if len(tokens) == 0: break max_freq = max(tokens.values()) max_candidates = [] @@ -428,12 +430,12 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: if freq == max_freq: max_candidates.append(candidate) encoded_prompt = encoded_prompt + (sorted(max_candidates)[0]) - best_candidate =self._text_processor.get_token(encoded_prompt[-1]) + best_candidate = self._text_processor.get_token(encoded_prompt[-1]) if best_candidate is None: return None text += best_candidate - decoded_prompt = self._text_processor.decode(encoded_prompt) + decoded_prompt = self._text_processor.decode(encoded_prompt) + "." if decoded_prompt is None: return None From aded7e1926f93da605d0e89e3b1ed786b108a383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 21 Nov 2023 22:53:25 +0300 Subject: [PATCH 41/68] fixed comments and tests (except filter im trying to deal with it.....) --- lab_3_generate_by_ngrams/main.py | 26 ++++++++++++-------------- lab_3_generate_by_ngrams/start.py | 14 ++++++++------ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 400b81136..8d6a22f2b 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -136,8 +136,7 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: for token in tokenized_text: if self.get_id(token) is None: return None - else: - encoded_corpus.append(self.get_id(token)) + encoded_corpus.append(self.get_id(token)) return tuple(encoded_corpus) @@ -151,11 +150,10 @@ def _put(self, element: str) -> None: In case of corrupt input arguments or invalid argument length, an element is not added to storage """ - if not isinstance(element, str) or len(element) != 1: + if not isinstance(element, str) or len(element) != 1 or element in self._storage: return None - if element not in self._storage: - self._storage[element] = len(self._storage) + self._storage[element] = len(self._storage) return None @@ -217,9 +215,10 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: for ident in corpus: if not isinstance(ident, int): return None - if self.get_token(ident) is None: + new_token = self.get_token(ident) + if new_token is None: return None - decoded_corpus.append(self.get_token(ident)) + decoded_corpus.append(new_token) return tuple(decoded_corpus) @@ -247,12 +246,11 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional resulting_text += token.upper() elif token == self._end_of_word_token: if index == len(decoded_corpus) - 1: - resulting_text += "." + resulting_text = f"{resulting_text}." else: - resulting_text += " " + resulting_text = f"{resulting_text} " else: resulting_text += token - # resulting_text.replace(resulting_text[-1], ".") return resulting_text @@ -320,7 +318,7 @@ def build(self) -> int: if not isinstance(ngram, tuple): return 1 p_w_1_2 = n_grams.count(ngram) - p_w_1 = len([context for context in n_grams if context[:-1] == ngram[:-1]]) + p_w_1 = [context[:-1] for context in n_grams].count(ngram[:-1]) self._n_gram_frequencies[ngram] = p_w_1_2/p_w_1 return 0 @@ -421,7 +419,7 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: for i in range(seq_len): tokens = self._model.generate_next_token(encoded_prompt[-ngram_size+1:]) if tokens is None: - return prompt + "." + return f"{prompt}." 
if len(tokens) == 0: break max_freq = max(tokens.values()) @@ -429,13 +427,13 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: for candidate, freq in tokens.items(): if freq == max_freq: max_candidates.append(candidate) - encoded_prompt = encoded_prompt + (sorted(max_candidates)[0]) + encoded_prompt = encoded_prompt + (sorted(max_candidates)[0],) best_candidate = self._text_processor.get_token(encoded_prompt[-1]) if best_candidate is None: return None text += best_candidate - decoded_prompt = self._text_processor.decode(encoded_prompt) + "." + decoded_prompt = f"{self._text_processor.decode(encoded_prompt)}." if decoded_prompt is None: return None diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index 7c0f5dedd..f7498dc7f 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -13,12 +13,14 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - text_processor = main_py.TextProcessor('_') - encoded_text = text_processor.encode(text) - decoded_text = text_processor.decode(encoded_text) - language_model = main_py.NGramLanguageModel(encoded_text, 7) - greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) - generated_text = greedy_generator.run(51, 'Vernon') + text_processor = main_py.TextProcessor('_') + encoded_text = text_processor.encode(text) + decoded_text = text_processor.decode(encoded_text) + + language_model = main_py.NGramLanguageModel(encoded_text, 7) + greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) + generated_text = greedy_generator.run(51, 'Vernon') + result = generated_text assert result From 455e3fdba2adcd047d7a8ff6a4d5c6b03fd172ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 21 Nov 2023 23:53:20 +0300 Subject: [PATCH 42/68] filter (finally!!) (im not sure but..) --- lab_3_generate_by_ngrams/main.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 8d6a22f2b..138e3886d 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -102,9 +102,11 @@ def get_token(self, element_id: int) -> Optional[str]: if not isinstance(element_id, int) or element_id not in self._storage.values(): return None - for token, ident in self._storage.items(): - if element_id == ident: - return token + items = list(filter(lambda x: x[1] == element_id, self._storage.items())) + return items[0][0] + # for token, ident in self._storage.items(): + # if element_id == ident: + # return token def encode(self, text: str) -> Optional[tuple[int, ...]]: """ @@ -457,6 +459,8 @@ def __init__(self, beam_width: int, language_model: NGramLanguageModel) -> None: beam_width (int): Number of candidates to consider at each step language_model (NGramLanguageModel): A language model to use for next token prediction """ + self._beam_width = beam_width + self._model = language_model def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, float]]]: """ @@ -477,6 +481,16 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, In case of corrupt input arguments or methods used return None. 
""" + if not isinstance(sequence, tuple) or len(sequence) == 0: + return None + + generated_dict = self._model.generate_next_token(sequence) + if generated_dict is None: + return None + if not generated_dict: + return [] + + def continue_sequence( self, From ca0b2789cd0018193a175ea3f2715a8e42e42c53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 21 Nov 2023 23:53:31 +0300 Subject: [PATCH 43/68] filter (finally!!) (im not sure but..) --- lab_3_generate_by_ngrams/main.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 138e3886d..164f9d776 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -104,9 +104,6 @@ def get_token(self, element_id: int) -> Optional[str]: items = list(filter(lambda x: x[1] == element_id, self._storage.items())) return items[0][0] - # for token, ident in self._storage.items(): - # if element_id == ident: - # return token def encode(self, text: str) -> Optional[tuple[int, ...]]: """ From 36ea7e86ed39bd12f5ac804b72f8a768958dadf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 18:47:00 +0300 Subject: [PATCH 44/68] code for 8 --- lab_3_generate_by_ngrams/main.py | 68 +++++++++++++++++++++++ lab_3_generate_by_ngrams/start.py | 5 +- lab_3_generate_by_ngrams/target_score.txt | 2 +- 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 164f9d776..b2b918038 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -5,6 +5,7 @@ """ # pylint:disable=too-few-public-methods from typing import Optional +import math class TextProcessor: @@ -487,6 +488,8 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, if not generated_dict: return [] + return sorted([(token, freq) for token, freq in generated_dict.items()], + key=lambda pair: pair[1], reverse=True)[:self._beam_width] def continue_sequence( @@ -510,6 +513,19 @@ def continue_sequence( In case of corrupt input arguments or unexpected behaviour of methods used return None. """ + if (not isinstance(sequence,tuple) or len(sequence) == 0 or + not isinstance(next_tokens, list) or len(next_tokens) == 0 or + not isinstance(sequence_candidates, dict) or not sequence_candidates or + len(next_tokens) >= self._beam_width or + sequence not in sequence_candidates): + return None + + result_dict_cand = sequence_candidates.copy() + for token, freq in next_tokens: + result_dict_cand[sequence + (token,)] = freq - math.log(freq) + + return result_dict_cand + def prune_sequence_candidates( self, sequence_candidates: dict[tuple[int, ...], float] @@ -525,6 +541,13 @@ def prune_sequence_candidates( In case of corrupt input arguments return None. 
""" + if not isinstance(sequence_candidates, dict) or not sequence_candidates: + return None + + sorted_sequences = sorted(sequence_candidates.items(), key=lambda item: item[1], reverse=True) + + return dict(sorted_sequences[:self._beam_width]) + class BeamSearchTextGenerator: @@ -552,6 +575,10 @@ def __init__( text_processor (TextProcessor): A TextProcessor instance to handle text processing beam_width (int): Beam width parameter for generation """ + self._language_model = language_model + self._text_processor = text_processor + self._beam_width = beam_width + self._beam_searchers = BeamSearcher(self._beam_width, self._language_model) def run(self, prompt: str, seq_len: int) -> Optional[str]: """ @@ -567,6 +594,39 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ + if (not isinstance(prompt, str) or len(prompt) == 0 or + not isinstance(seq_len, int) or seq_len < 0): + return None + + encoded_prompt = self._text_processor.encode(prompt) + if encoded_prompt is None: + return None + candidates = {encoded_prompt: 0.0} + + for i in range(seq_len): + new_candidates = candidates.copy() + for sequence in candidates: + next_tokens = self._get_next_token(sequence) + if next_tokens is None: + return None + + continued_sentence = ( + self._beam_searchers.continue_sequence(sequence, next_tokens, new_candidates) + ) + if continued_sentence is None: + break + + best_sequence = self._beam_searchers.prune_sequence_candidates(new_candidates) + if best_sequence is None: + return None + candidates = best_sequence + + decoded_result = self._text_processor.decode(sorted(tuple(candidates), key=lambda item: item[1])[0]) + + return decoded_result + + + def _get_next_token( self, sequence_to_continue: tuple[int, ...] @@ -583,6 +643,14 @@ def _get_next_token( In case of corrupt input arguments return None. 
""" + if not isinstance(sequence_to_continue, tuple) or len(sequence_to_continue) == 0: + return None + + next_token = self._beam_searchers.get_next_token(sequence_to_continue) + if next_token is None: + return None + + return next_token class NGramLanguageModelReader: diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index f7498dc7f..f9973a58f 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -21,7 +21,10 @@ def main() -> None: greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) generated_text = greedy_generator.run(51, 'Vernon') - result = generated_text + beam_search_generator = main_py.BeamSearchTextGenerator(language_model, text_processor, 7) + resulted_text = beam_search_generator.run('Vernon', 56) + + result = resulted_text assert result diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt index 1e8b31496..45a4fb75d 100644 --- a/lab_3_generate_by_ngrams/target_score.txt +++ b/lab_3_generate_by_ngrams/target_score.txt @@ -1 +1 @@ -6 +8 From c8f555386eaf65b9d5473f26be6e3d0ab191004b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 18:58:15 +0300 Subject: [PATCH 45/68] code style fixing --- lab_3_generate_by_ngrams/main.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index b2b918038..15103e705 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -319,7 +319,7 @@ def build(self) -> int: return 1 p_w_1_2 = n_grams.count(ngram) p_w_1 = [context[:-1] for context in n_grams].count(ngram[:-1]) - self._n_gram_frequencies[ngram] = p_w_1_2/p_w_1 + self._n_gram_frequencies[ngram] = p_w_1_2 / p_w_1 return 0 @@ -335,15 +335,16 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: In case of corrupt input arguments, None is returned """ - if not isinstance(sequence, tuple) or len(sequence) == 0 or len(sequence) < self._n_gram_size - 1: + if (not isinstance(sequence, tuple) or len(sequence) == 0 + or len(sequence) < self._n_gram_size - 1): return None possible_tokens = {} context = sequence[-(self._n_gram_size - 1)::] - for ngram in self._n_gram_frequencies: + for ngram, freq in self._n_gram_frequencies.items(): if ngram[:self._n_gram_size - 1] == context: - possible_tokens[ngram[-1]] = self._n_gram_frequencies[ngram] + possible_tokens[ngram[-1]] = freq return possible_tokens @@ -417,7 +418,7 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: text = prompt for i in range(seq_len): - tokens = self._model.generate_next_token(encoded_prompt[-ngram_size+1:]) + tokens = self._model.generate_next_token(encoded_prompt[-ngram_size + 1:]) if tokens is None: return f"{prompt}." if len(tokens) == 0: @@ -488,7 +489,7 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, if not generated_dict: return [] - return sorted([(token, freq) for token, freq in generated_dict.items()], + return sorted(list((token, freq) for token, freq in generated_dict.items()), key=lambda pair: pair[1], reverse=True)[:self._beam_width] @@ -513,11 +514,12 @@ def continue_sequence( In case of corrupt input arguments or unexpected behaviour of methods used return None. 
""" - if (not isinstance(sequence,tuple) or len(sequence) == 0 or + if (not isinstance(sequence, tuple) or len(sequence) == 0 or not isinstance(next_tokens, list) or len(next_tokens) == 0 or - not isinstance(sequence_candidates, dict) or not sequence_candidates or - len(next_tokens) >= self._beam_width or - sequence not in sequence_candidates): + not isinstance(sequence_candidates, dict) or not sequence_candidates): + return None + if (len(next_tokens) >= self._beam_width or + sequence not in sequence_candidates): return None result_dict_cand = sequence_candidates.copy() @@ -544,7 +546,8 @@ def prune_sequence_candidates( if not isinstance(sequence_candidates, dict) or not sequence_candidates: return None - sorted_sequences = sorted(sequence_candidates.items(), key=lambda item: item[1], reverse=True) + sorted_sequences = sorted(sequence_candidates.items(), + key=lambda item: item[1], reverse=True) return dict(sorted_sequences[:self._beam_width]) @@ -621,7 +624,8 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: return None candidates = best_sequence - decoded_result = self._text_processor.decode(sorted(tuple(candidates), key=lambda item: item[1])[0]) + decoded_result = self._text_processor.decode(sorted(tuple(candidates), + key=lambda item: item[1])[0]) return decoded_result From 8e61901253a3ce562f4954b2e28f66c9b38a22ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 19:03:38 +0300 Subject: [PATCH 46/68] code style and import fixing --- lab_3_generate_by_ngrams/main.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 15103e705..452471597 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -5,6 +5,7 @@ """ # pylint:disable=too-few-public-methods from typing import Optional + import math @@ -515,8 +516,10 @@ def continue_sequence( In case of corrupt input arguments or unexpected behaviour of methods used return None. 
""" if (not isinstance(sequence, tuple) or len(sequence) == 0 or - not isinstance(next_tokens, list) or len(next_tokens) == 0 or - not isinstance(sequence_candidates, dict) or not sequence_candidates): + not isinstance(next_tokens, list)): + return None + if (len(next_tokens) == 0 or not isinstance(sequence_candidates, dict) or + not sequence_candidates): return None if (len(next_tokens) >= self._beam_width or sequence not in sequence_candidates): From a4fd670ef1832dc684ae48f3f56453e838cc7ddc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 19:06:31 +0300 Subject: [PATCH 47/68] import style fixing --- lab_3_generate_by_ngrams/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 452471597..43786dd4a 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -4,9 +4,8 @@ Beam-search and natural language generation evaluation """ # pylint:disable=too-few-public-methods -from typing import Optional - import math +from typing import Optional class TextProcessor: From 79520f1ce28580125fa89ca3d18d60f91e3ca7cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 22:48:40 +0300 Subject: [PATCH 48/68] mypy fixing --- lab_3_generate_by_ngrams/main.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 43786dd4a..da0db7d5c 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -520,15 +520,16 @@ def continue_sequence( if (len(next_tokens) == 0 or not isinstance(sequence_candidates, dict) or not sequence_candidates): return None - if (len(next_tokens) >= self._beam_width or + if (len(next_tokens) > self._beam_width or sequence not in sequence_candidates): return None - result_dict_cand = sequence_candidates.copy() - for token, freq in next_tokens: - result_dict_cand[sequence + (token,)] = freq - math.log(freq) + for (token, freq) in next_tokens: + sequence_candidates[sequence + (token,)] = \ + sequence_candidates[sequence] - math.log(freq) + sequence_candidates.pop(sequence) - return result_dict_cand + return sequence_candidates def prune_sequence_candidates( @@ -585,6 +586,8 @@ def __init__( self._beam_width = beam_width self._beam_searchers = BeamSearcher(self._beam_width, self._language_model) + return None + def run(self, prompt: str, seq_len: int) -> Optional[str]: """ Generate sequence based on NGram language model and prompt provided. 
@@ -600,7 +603,7 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: None is returned """ if (not isinstance(prompt, str) or len(prompt) == 0 or - not isinstance(seq_len, int) or seq_len < 0): + not isinstance(seq_len, int) or seq_len < 0): return None encoded_prompt = self._text_processor.encode(prompt) @@ -609,7 +612,7 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: candidates = {encoded_prompt: 0.0} for i in range(seq_len): - new_candidates = candidates.copy() + new_candidates = dict(candidates) for sequence in candidates: next_tokens = self._get_next_token(sequence) if next_tokens is None: From 6fe7fdcd1474369e410037c769aa8132335ccb71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 22:52:20 +0300 Subject: [PATCH 49/68] i dont understand.... --- lab_3_generate_by_ngrams/main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index da0db7d5c..41159d086 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -586,8 +586,6 @@ def __init__( self._beam_width = beam_width self._beam_searchers = BeamSearcher(self._beam_width, self._language_model) - return None - def run(self, prompt: str, seq_len: int) -> Optional[str]: """ Generate sequence based on NGram language model and prompt provided. From 950d83b000a9d38220a72f5c929e611aedf12040 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 23:07:43 +0300 Subject: [PATCH 50/68] mypy fixing --- lab_3_generate_by_ngrams/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 41159d086..53619e38b 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -134,9 +134,10 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: encoded_corpus = [] for token in tokenized_text: - if self.get_id(token) is None: + ident = self.get_id(token) + if ident is None: return None - encoded_corpus.append(self.get_id(token)) + encoded_corpus.append(ident) return tuple(encoded_corpus) From 51d425fa62712d9c71199471a792dac2ad4bef70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 23:13:58 +0300 Subject: [PATCH 51/68] mypy fixing --- lab_3_generate_by_ngrams/main.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 53619e38b..27ea6ddd8 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -490,8 +490,13 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, if not generated_dict: return [] - return sorted(list((token, freq) for token, freq in generated_dict.items()), - key=lambda pair: pair[1], reverse=True)[:self._beam_width] + list_of_token_pairs = [] + for token, freq in generated_dict.items(): + token_pair = (token, float(freq)) + list_of_token_pairs.append(token_pair) + best = sorted(list_of_token_pairs, key=lambda x: x[1], reverse=True)[:self._beam_width] + + return best def continue_sequence( From a114cdc50680ba8fd9e937dbe82266ab639f2d5f Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 23:20:09 +0300 Subject: [PATCH 52/68] mypy fixing --- lab_3_generate_by_ngrams/start.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index f9973a58f..d4c1b7945 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -15,17 +15,18 @@ def main() -> None: text = text_file.read() text_processor = main_py.TextProcessor('_') encoded_text = text_processor.encode(text) - decoded_text = text_processor.decode(encoded_text) + if isinstance(encoded_text, tuple) and encoded_text is not None: + decoded_text = text_processor.decode(encoded_text) - language_model = main_py.NGramLanguageModel(encoded_text, 7) - greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) - generated_text = greedy_generator.run(51, 'Vernon') + language_model = main_py.NGramLanguageModel(encoded_text, 7) + greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) + generated_text = greedy_generator.run(51, 'Vernon') - beam_search_generator = main_py.BeamSearchTextGenerator(language_model, text_processor, 7) - resulted_text = beam_search_generator.run('Vernon', 56) + beam_search_generator = main_py.BeamSearchTextGenerator(language_model, text_processor, 7) + resulted_text = beam_search_generator.run('Vernon', 56) - result = resulted_text - assert result + result = resulted_text + assert result if __name__ == "__main__": From 6948e044d254c4c6fe6f49face858ed1ce7c05cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 23 Nov 2023 17:07:46 +0300 Subject: [PATCH 53/68] mypy fixing --- lab_3_generate_by_ngrams/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 27ea6ddd8..f7542eb78 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -626,7 +626,8 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: self._beam_searchers.continue_sequence(sequence, next_tokens, new_candidates) ) if continued_sentence is None: - break + return self._text_processor.decode(sorted(tuple(candidates), + key=lambda pair: pair[1])[0]) best_sequence = self._beam_searchers.prune_sequence_candidates(new_candidates) if best_sequence is None: From 908cacbe8e922776588d8b2ba3fdcef617286b98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 23 Nov 2023 23:09:21 +0300 Subject: [PATCH 54/68] fixing all --- lab_3_generate_by_ngrams/main.py | 76 ++++++++++++------------------- lab_3_generate_by_ngrams/start.py | 3 ++ 2 files changed, 33 insertions(+), 46 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index f7542eb78..d9fcf6dbb 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -56,7 +56,7 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: if not text[-1].isalnum(): tokenized_text.append(self._end_of_word_token) - if len(tokenized_text) == 0: + if not tokenized_text: return None return tuple(tokenized_text) @@ -158,7 +158,6 @@ def _put(self, element: str) -> None: return None - def decode(self, encoded_corpus: tuple[int, ...]) -> 
Optional[str]: """ Decode and postprocess encoded corpus by converting integer identifiers to string. @@ -241,22 +240,13 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0: return None - resulting_text = "" - for index, token in enumerate(decoded_corpus): - if index == 0: - resulting_text += token.upper() - elif token == self._end_of_word_token: - if index == len(decoded_corpus) - 1: - resulting_text = f"{resulting_text}." - else: - resulting_text = f"{resulting_text} " - else: - resulting_text += token + text_string = ''.join(list(decoded_corpus)) + resulting_text = text_string.replace(self._end_of_word_token, ' ') + resulting_text = f"{resulting_text.capitalize().strip()}." return resulting_text - class NGramLanguageModel: """ Store language model by n_grams, predict the next token. @@ -336,15 +326,15 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: In case of corrupt input arguments, None is returned """ - if (not isinstance(sequence, tuple) or len(sequence) == 0 - or len(sequence) < self._n_gram_size - 1): + if not (isinstance(sequence, tuple) and sequence + and len(sequence) >= self._n_gram_size - 1): return None possible_tokens = {} - context = sequence[-(self._n_gram_size - 1)::] + context = sequence[-(self._n_gram_size - 1):] for ngram, freq in self._n_gram_frequencies.items(): - if ngram[:self._n_gram_size - 1] == context: + if ngram[:- 1] == context: possible_tokens[ngram[-1]] = freq return possible_tokens @@ -435,7 +425,7 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: return None text += best_candidate - decoded_prompt = f"{self._text_processor.decode(encoded_prompt)}." + decoded_prompt = f"{self._text_processor.decode(encoded_prompt)}" if decoded_prompt is None: return None @@ -498,7 +488,6 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, return best - def continue_sequence( self, sequence: tuple[int, ...], @@ -537,7 +526,6 @@ def continue_sequence( return sequence_candidates - def prune_sequence_candidates( self, sequence_candidates: dict[tuple[int, ...], float] ) -> Optional[dict[tuple[int, ...], float]]: @@ -555,13 +543,12 @@ def prune_sequence_candidates( if not isinstance(sequence_candidates, dict) or not sequence_candidates: return None - sorted_sequences = sorted(sequence_candidates.items(), - key=lambda item: item[1], reverse=True) + sorted_sequences = sorted(list(sequence_candidates.items()), + key=lambda item: item[1]) return dict(sorted_sequences[:self._beam_width]) - class BeamSearchTextGenerator: """ Class for text generation with BeamSearch. 
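The ascending sort in prune_sequence_candidates above is the pruning step of beam search: after every expansion, the candidate pool is trimmed back to the beam_width best sequences, which keeps the search tractable while still tracking several continuations in parallel. A minimal sketch of the idea, separate from the lab classes and assuming smaller-is-better scores (accumulated negative log-probabilities are the usual choice, and dropping reverse=True in the hunk above is consistent with that convention); the name prune is illustrative, not the lab's API:

    def prune(candidates: dict[tuple[int, ...], float],
              beam_width: int) -> dict[tuple[int, ...], float]:
        # Keep the beam_width lowest-cost sequences;
        # cost is assumed to be accumulated -log(probability).
        best = sorted(candidates.items(), key=lambda item: item[1])
        return dict(best[:beam_width])

    pool = {(1, 2): 0.5, (1, 3): 2.0, (1, 4): 1.1}
    assert prune(pool, 2) == {(1, 2): 0.5, (1, 4): 1.1}
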
@@ -590,7 +577,7 @@ def __init__( self._language_model = language_model self._text_processor = text_processor self._beam_width = beam_width - self._beam_searchers = BeamSearcher(self._beam_width, self._language_model) + self.beam_searcher = BeamSearcher(self._beam_width, self._language_model) def run(self, prompt: str, seq_len: int) -> Optional[str]: """ @@ -606,12 +593,12 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ - if (not isinstance(prompt, str) or len(prompt) == 0 or - not isinstance(seq_len, int) or seq_len < 0): + if not (isinstance(prompt, str) and prompt + and isinstance(seq_len, int) and seq_len): return None encoded_prompt = self._text_processor.encode(prompt) - if encoded_prompt is None: + if not encoded_prompt: return None candidates = {encoded_prompt: 0.0} @@ -619,28 +606,24 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: new_candidates = dict(candidates) for sequence in candidates: next_tokens = self._get_next_token(sequence) - if next_tokens is None: + if not next_tokens: return None - continued_sentence = ( - self._beam_searchers.continue_sequence(sequence, next_tokens, new_candidates) + continued_sequence = ( + self.beam_searcher.continue_sequence(sequence, next_tokens, new_candidates) ) - if continued_sentence is None: - return self._text_processor.decode(sorted(tuple(candidates), - key=lambda pair: pair[1])[0]) - - best_sequence = self._beam_searchers.prune_sequence_candidates(new_candidates) - if best_sequence is None: - return None - candidates = best_sequence - - decoded_result = self._text_processor.decode(sorted(tuple(candidates), - key=lambda item: item[1])[0]) - - return decoded_result + if not continued_sequence: + break + best_sequence = self.beam_searcher.prune_sequence_candidates(new_candidates) + if best_sequence is None: + return None + candidates = best_sequence + decoded_result = self._text_processor.decode(sorted(tuple(candidates), + key=lambda item: item[1])[0]) + return decoded_result def _get_next_token( self, sequence_to_continue: tuple[int, ...] @@ -657,10 +640,11 @@ def _get_next_token( In case of corrupt input arguments return None. 
""" - if not isinstance(sequence_to_continue, tuple) or len(sequence_to_continue) == 0: + if not(isinstance(sequence_to_continue, tuple) + and sequence_to_continue): return None - next_token = self._beam_searchers.get_next_token(sequence_to_continue) + next_token = self.beam_searcher.get_next_token(sequence_to_continue) if next_token is None: return None diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index d4c1b7945..54f944a3a 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -17,13 +17,16 @@ def main() -> None: encoded_text = text_processor.encode(text) if isinstance(encoded_text, tuple) and encoded_text is not None: decoded_text = text_processor.decode(encoded_text) + print(decoded_text) language_model = main_py.NGramLanguageModel(encoded_text, 7) greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) generated_text = greedy_generator.run(51, 'Vernon') + print(generated_text) beam_search_generator = main_py.BeamSearchTextGenerator(language_model, text_processor, 7) resulted_text = beam_search_generator.run('Vernon', 56) + print(resulted_text) result = resulted_text assert result From 4be43b7423fa9cbf361b6c887332e0cb75b77b25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 23 Nov 2023 23:38:39 +0300 Subject: [PATCH 55/68] fixing start --- lab_3_generate_by_ngrams/main.py | 21 +++++++++------------ lab_3_generate_by_ngrams/start.py | 14 +++++++++----- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index d9fcf6dbb..4e58402b2 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -306,10 +306,9 @@ def build(self) -> int: return 1 for ngram in set(n_grams): - if not isinstance(ngram, tuple): - return 1 p_w_1_2 = n_grams.count(ngram) - p_w_1 = [context[:-1] for context in n_grams].count(ngram[:-1]) + p_w_1 = len([context for context in n_grams + if context[:-1] == ngram[:-1]]) self._n_gram_frequencies[ngram] = p_w_1_2 / p_w_1 return 0 @@ -471,7 +470,7 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, In case of corrupt input arguments or methods used return None. """ - if not isinstance(sequence, tuple) or len(sequence) == 0: + if not (isinstance(sequence, tuple) and sequence): return None generated_dict = self._model.generate_next_token(sequence) @@ -509,14 +508,12 @@ def continue_sequence( In case of corrupt input arguments or unexpected behaviour of methods used return None. 
""" - if (not isinstance(sequence, tuple) or len(sequence) == 0 or - not isinstance(next_tokens, list)): - return None - if (len(next_tokens) == 0 or not isinstance(sequence_candidates, dict) or - not sequence_candidates): - return None - if (len(next_tokens) > self._beam_width or - sequence not in sequence_candidates): + if not (isinstance(sequence, tuple) and sequence + and isinstance(next_tokens, list) and next_tokens + and isinstance(sequence_candidates, dict) + and sequence_candidates + and len(next_tokens) <= self._beam_width + and sequence in sequence_candidates): return None for (token, freq) in next_tokens: diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index 54f944a3a..4bb5684c9 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -15,20 +15,24 @@ def main() -> None: text = text_file.read() text_processor = main_py.TextProcessor('_') encoded_text = text_processor.encode(text) - if isinstance(encoded_text, tuple) and encoded_text is not None: + if isinstance(encoded_text, tuple) and encoded_text: decoded_text = text_processor.decode(encoded_text) print(decoded_text) - language_model = main_py.NGramLanguageModel(encoded_text, 7) - greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) + language_model = main_py.NGramLanguageModel(encoded_text[:100], 3) + n_grams = language_model.build() + print(n_grams) + + lang_model2 = main_py.NGramLanguageModel(encoded_text, 7) + greedy_generator = main_py.GreedyTextGenerator(lang_model2, text_processor) generated_text = greedy_generator.run(51, 'Vernon') print(generated_text) - beam_search_generator = main_py.BeamSearchTextGenerator(language_model, text_processor, 7) + beam_search_generator = main_py.BeamSearchTextGenerator(lang_model2, text_processor, 7) resulted_text = beam_search_generator.run('Vernon', 56) print(resulted_text) - result = resulted_text + result = decoded_text assert result From d1bc24d6696a2afea91d079e57fe5e6b8b8d1f47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 24 Nov 2023 11:44:49 +0300 Subject: [PATCH 56/68] fixing return --- lab_3_generate_by_ngrams/main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 4e58402b2..71f2fc76b 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -156,8 +156,6 @@ def _put(self, element: str) -> None: self._storage[element] = len(self._storage) - return None - def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: """ Decode and postprocess encoded corpus by converting integer identifiers to string. 
From 43e0cea47ffb5c28e2ed263cd1b3d798f6d47d58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 24 Nov 2023 11:49:00 +0300 Subject: [PATCH 57/68] fixing return --- lab_3_generate_by_ngrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 71f2fc76b..501cd6d3e 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -152,7 +152,7 @@ def _put(self, element: str) -> None: an element is not added to storage """ if not isinstance(element, str) or len(element) != 1 or element in self._storage: - return None + return self._storage[element] = len(self._storage) From 26c5222e1308fe87d7fc918e05d3a04058ce178b Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Fri, 24 Nov 2023 18:37:37 +0300 Subject: [PATCH 58/68] checkout labs from the origin repository --- lab_3_generate_by_ngrams/start.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index b7612718d..a4ec25e0f 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -4,8 +4,6 @@ from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) -import lab_3_generate_by_ngrams.main as main_py - def main() -> None: """ From 2c8daca46a97ffd3cfb37b473878f5b534912bdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 21:42:04 +0300 Subject: [PATCH 59/68] code for 6 --- lab_4_fill_words_by_ngrams/main.py | 65 +++++++++++++++++++++ lab_4_fill_words_by_ngrams/start.py | 13 ++++- lab_4_fill_words_by_ngrams/target_score.txt | 2 +- 3 files changed, 78 insertions(+), 2 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index b739ae182..2873b8731 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -6,6 +6,7 @@ # pylint:disable=too-few-public-methods, too-many-arguments from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) +import random class WordProcessor(TextProcessor): @@ -28,6 +29,17 @@ def _tokenize(self, text: str) -> tuple[str, ...]: # type: ignore Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ + if not isinstance(text, str) or not text: + raise ValueError('Type input is inappropriate or input argument is empty.') + + preprocessed_text = "" + for element in text.lower(): + if element in "?!.": + preprocessed_text += f" {self.get_end_of_word_token()}" + elif element.isalpha() or element.isspace(): + preprocessed_text += element + + return tuple(preprocessed_text.split(" ")) def _put(self, element: str) -> None: """ @@ -39,6 +51,11 @@ def _put(self, element: str) -> None: Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. 
""" + if not isinstance(element, str) or not element: + raise ValueError('Type input is inappropriate or input argument is empty.') + + if element not in self._storage: + self._storage[element] = len(self._storage) def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # type: ignore """ @@ -56,6 +73,16 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ + if not isinstance(decoded_corpus, tuple) or not decoded_corpus: + raise ValueError('Type input is inappropriate or input argument is empty.') + + words = "".join(decoded_corpus) + sentences = words.split(self._end_of_word_token) + resulted_text = ". ".join([sentence.strip().capitalize() for sentence in sentences]) + + if resulted_text[-1] == ' ': + return resulted_text[:-1] + return f"{resulted_text}." class TopPGenerator: @@ -80,6 +107,9 @@ def __init__( word_processor (WordProcessor): WordProcessor instance to handle text processing p_value (float): Collective probability mass threshold """ + self._model = language_model + self._word_processor = word_processor + self._p_value = p_value def run(self, seq_len: int, prompt: str) -> str: # type: ignore """ @@ -98,6 +128,41 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore or if sequence has inappropriate length, or if methods used return None. """ + if not (isinstance(seq_len, int) and isinstance(prompt, str) and + seq_len > 0 and prompt): + raise ValueError('Type input is inappropriate or input argument is empty.') + + encoded_prompt = self._word_processor.encode(prompt) + if encoded_prompt is None: + raise ValueError('None is returned') + + encoded_list = list(encoded_prompt) + for i in range(seq_len): + candidates = self._model.generate_next_token(encoded_prompt) + if candidates is None: + raise ValueError('None is returned.') + if not candidates: + break + sorted_candidates = sorted(list(candidates.items()), + key=lambda pair: pair[1], reverse=True) + sum_freq = 0 + num_candidates = 0 + for candidate in sorted_candidates: + if sum_freq >= self._p_value: + break + sum_freq += candidate[1] + num_candidates += 1 + + random_token = random.choice(sorted_candidates[:num_candidates])[0] + encoded_list.append(random_token) + encoded_prompt = tuple(encoded_list) + + decoded = self._word_processor.decode(encoded_prompt) + if decoded is None: + raise ValueError('None is returned') + + return decoded + class GeneratorTypes: diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index c41386377..811ebcf84 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,6 +2,9 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import +from lab_4_fill_words_by_ngrams.main import (GeneratorTypes, BeamSearchTextGenerator, + NGramLanguageModel, TopPGenerator, + QualityChecker, WordProcessor) def main() -> None: @@ -10,7 +13,15 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - result = None + word_processor = WordProcessor("") + encoded_text = word_processor.encode(text) + lang_model = NGramLanguageModel(encoded_text, 2) + lang_model.build() + + top_p_generator = TopPGenerator(lang_model, word_processor, 0.5) + result = top_p_generator.run(51, "Vernon") + print(result) + assert result diff --git a/lab_4_fill_words_by_ngrams/target_score.txt 
b/lab_4_fill_words_by_ngrams/target_score.txt index 573541ac9..1e8b31496 100644 --- a/lab_4_fill_words_by_ngrams/target_score.txt +++ b/lab_4_fill_words_by_ngrams/target_score.txt @@ -1 +1 @@ -0 +6 From 7b1f90d40811eb82fb749ef498d7129fc7749a07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 21:45:35 +0300 Subject: [PATCH 60/68] import style fixing --- lab_4_fill_words_by_ngrams/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 2873b8731..a2d010b2a 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -6,7 +6,7 @@ # pylint:disable=too-few-public-methods, too-many-arguments from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) -import random +from random import choice class WordProcessor(TextProcessor): @@ -153,7 +153,7 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore sum_freq += candidate[1] num_candidates += 1 - random_token = random.choice(sorted_candidates[:num_candidates])[0] + random_token = choice(sorted_candidates[:num_candidates])[0] encoded_list.append(random_token) encoded_prompt = tuple(encoded_list) From 342556df79ad39d4c92a02bcd99eaedca3c9e3b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 21:48:03 +0300 Subject: [PATCH 61/68] import style fixing --- lab_4_fill_words_by_ngrams/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index a2d010b2a..a0537d1f6 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -4,9 +4,10 @@ Top-p sampling generation and filling gaps with ngrams """ # pylint:disable=too-few-public-methods, too-many-arguments +from random import choice + from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) -from random import choice class WordProcessor(TextProcessor): From a421f39ca52371981ec37115e5c3d3947a869806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 21:51:19 +0300 Subject: [PATCH 62/68] import style fixing --- lab_4_fill_words_by_ngrams/start.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index 811ebcf84..1be68cd31 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,6 +2,7 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import + from lab_4_fill_words_by_ngrams.main import (GeneratorTypes, BeamSearchTextGenerator, NGramLanguageModel, TopPGenerator, QualityChecker, WordProcessor) From 621dcf54f606cc1313af6f521e05009b3881318f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 21:54:19 +0300 Subject: [PATCH 63/68] import style fixing --- lab_4_fill_words_by_ngrams/start.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index 
1be68cd31..ee9a97ddf 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,10 +2,7 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import - -from lab_4_fill_words_by_ngrams.main import (GeneratorTypes, BeamSearchTextGenerator, - NGramLanguageModel, TopPGenerator, - QualityChecker, WordProcessor) +from lab_4_fill_words_by_ngrams.main import (NGramLanguageModel, TopPGenerator, WordProcessor) def main() -> None: From a9a085decb10b73fd6dff827b2cf878b5ed53955 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 21:58:50 +0300 Subject: [PATCH 64/68] import style fixing --- lab_4_fill_words_by_ngrams/start.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index ee9a97ddf..f3f0d7721 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,6 +2,7 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import + from lab_4_fill_words_by_ngrams.main import (NGramLanguageModel, TopPGenerator, WordProcessor) From 3000112b2540336f8e6ee5448232bb41a66cfa27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 22:07:29 +0300 Subject: [PATCH 65/68] import style and tests fixing --- lab_4_fill_words_by_ngrams/main.py | 4 ++-- lab_4_fill_words_by_ngrams/start.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index a0537d1f6..ee8de63e6 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -77,7 +77,7 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # if not isinstance(decoded_corpus, tuple) or not decoded_corpus: raise ValueError('Type input is inappropriate or input argument is empty.') - words = "".join(decoded_corpus) + words = " ".join(decoded_corpus) sentences = words.split(self._end_of_word_token) resulted_text = ". 
".join([sentence.strip().capitalize() for sentence in sentences]) @@ -145,7 +145,7 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore if not candidates: break sorted_candidates = sorted(list(candidates.items()), - key=lambda pair: pair[1], reverse=True) + key=lambda pair: (float(pair[1]), pair[0]), reverse=True) sum_freq = 0 num_candidates = 0 for candidate in sorted_candidates: diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index f3f0d7721..ee9a97ddf 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,7 +2,6 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import - from lab_4_fill_words_by_ngrams.main import (NGramLanguageModel, TopPGenerator, WordProcessor) From 1aecdd0ef2bb86da5de70d2a7754913545f84aa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sat, 16 Dec 2023 14:22:37 +0300 Subject: [PATCH 66/68] import style fixing --- lab_4_fill_words_by_ngrams/start.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index ee9a97ddf..a9d6d93ad 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,7 +2,7 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import -from lab_4_fill_words_by_ngrams.main import (NGramLanguageModel, TopPGenerator, WordProcessor) +from lab_4_fill_words_by_ngrams.main import NGramLanguageModel, TopPGenerator, WordProcessor def main() -> None: From 7daa0b5d1690211100873dc84721cbf55ecec460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sat, 16 Dec 2023 15:39:58 +0300 Subject: [PATCH 67/68] tests fixing --- lab_4_fill_words_by_ngrams/main.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index ee8de63e6..d0397e8c1 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -33,14 +33,16 @@ def _tokenize(self, text: str) -> tuple[str, ...]: # type: ignore if not isinstance(text, str) or not text: raise ValueError('Type input is inappropriate or input argument is empty.') - preprocessed_text = "" - for element in text.lower(): - if element in "?!.": - preprocessed_text += f" {self.get_end_of_word_token()}" - elif element.isalpha() or element.isspace(): - preprocessed_text += element - - return tuple(preprocessed_text.split(" ")) + tokens = [] + punctuation_signs = '?!.' 
+        for word in text.lower().split():
+            cleaned_word = [letter for letter in word if letter.isalpha()]
+            if not cleaned_word:
+                continue
+            tokens.append(''.join(cleaned_word))
+            if word[-1] in punctuation_signs:
+                tokens.append(self._end_of_word_token)
+        return tuple(tokens)
 
     def _put(self, element: str) -> None:
         """
@@ -145,7 +147,7 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
             if not candidates:
                 break
             sorted_candidates = sorted(list(candidates.items()),
-                                       key=lambda pair: (float(pair[1]), pair[0]), reverse=True)
+                                       key=lambda pair: (pair[1], pair[0]), reverse=True)
             sum_freq = 0
             num_candidates = 0
             for candidate in sorted_candidates:

From ba55b5afb02a0e37592c586fc2a9d7264a1e4ed5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?=
 =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?=
Date: Mon, 18 Dec 2023 13:45:10 +0300
Subject: [PATCH 68/68] unpack candidate pairs in the top-p cutoff loop

---
 lab_4_fill_words_by_ngrams/main.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py
index d0397e8c1..1fe19ee9c 100644
--- a/lab_4_fill_words_by_ngrams/main.py
+++ b/lab_4_fill_words_by_ngrams/main.py
@@ -150,10 +150,10 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
                                        key=lambda pair: (pair[1], pair[0]), reverse=True)
             sum_freq = 0
             num_candidates = 0
-            for candidate in sorted_candidates:
+            for _, freq in sorted_candidates:
                 if sum_freq >= self._p_value:
                     break
-                sum_freq += candidate[1]
+                sum_freq += freq
                 num_candidates += 1
 
             random_token = choice(sorted_candidates[:num_candidates])[0]
@@ -167,7 +167,6 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
 
         return decoded
 
-
 class GeneratorTypes:
     """
     A class that represents types of generators.
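Read together, PATCHES 59, 67 and 68 implement nucleus (top-p) sampling: candidates are ranked by probability, the smallest prefix of that ranking whose cumulative mass reaches the p threshold is kept, and the next token is drawn uniformly from that set. A worked sketch of the cutoff loop outside the lab classes; top_p_choice is an illustrative name and the probabilities are invented for the example:

    from random import choice

    def top_p_choice(candidates: dict[str, float], p_value: float) -> str:
        # Rank by probability, breaking ties by token for a reproducible order,
        # mirroring the (pair[1], pair[0]) sort key used in the lab.
        ranked = sorted(candidates.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
        cumulative = 0.0
        kept = 0
        for _, prob in ranked:
            if cumulative >= p_value:
                break
            cumulative += prob
            kept += 1
        # Draw uniformly from the surviving nucleus.
        return choice(ranked[:kept])[0]

    # With probabilities 0.5 / 0.3 / 0.2 and p_value = 0.5, the threshold check
    # runs before each addition: 'a' is admitted (cumulative mass 0.5) and the
    # loop stops, so only the single most probable token survives.
    assert top_p_choice({'a': 0.5, 'b': 0.3, 'c': 0.2}, 0.5) == 'a'

Because the check precedes each addition, at least one token is always kept; a smaller p_value makes generation greedier, while a larger one admits more of the probability tail.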