From 9a9950f8c61cb20d457b2304ad9088ee4ba4370f Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Fri, 8 Sep 2023 12:15:38 +0300 Subject: [PATCH 001/107] created a new file --- main.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 main.py diff --git a/main.py b/main.py new file mode 100644 index 000000000..ec7780cab --- /dev/null +++ b/main.py @@ -0,0 +1 @@ +print('Hello, world!') From e69286a2669289b7310c9830dbd3013f45436714 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 20 Sep 2023 18:36:05 +0300 Subject: [PATCH 002/107] created a new file --- lab_1_classify_by_unigrams/main.py | 19 ++++++++++++++++++- lab_1_classify_by_unigrams/start.py | 10 +++++++--- lab_1_classify_by_unigrams/target_score.txt | 2 +- seminars/practice_2_string.py | 10 +++++----- 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 486b3d65c..2fefbd5d6 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -2,6 +2,7 @@ Lab 1 Language detection """ +from typing import List def tokenize(text: str) -> list[str] | None: @@ -11,7 +12,8 @@ def tokenize(text: str) -> list[str] | None: :param text: a text :return: a list of lower-cased tokens without punctuation """ - + tokens: list[str] = [t for t in text.lower() if t.isalpha()] + return tokens def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ @@ -19,6 +21,17 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: :param tokens: a list of tokens :return: a dictionary with frequencies """ + dict_tokens = {} + all_tokens = 0 + for token in tokens: + all_tokens += 1 + num_token = tokens.count(token) + dict_tokens[token] = num_token + dict_freq = {} + for key, value in dict_tokens.items(): + freq = value/all_tokens + dict_freq[key] = freq + return dict_freq def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None: @@ -28,6 +41,10 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st :param text: a text :return: a dictionary with two keys – name, freq """ + lang_profile = {} + lang_profile['name'] = language + lang_profile['freq'] = calculate_frequencies(tokenize(text)) + return lang_profile def calculate_mse(predicted: list, actual: list) -> float | None: diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index db7a1a904..b7b0053fa 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -1,7 +1,9 @@ """ Language detection starter """ - +from lab_1_classify_by_unigrams.main import tokenize +from lab_1_classify_by_unigrams.main import calculate_frequencies +from lab_1_classify_by_unigrams.main import create_language_profile def main() -> None: """ @@ -9,6 +11,9 @@ def main() -> None: """ with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: en_text = file_to_read_en.read() + en_tokens = (tokenize(en_text)) + print(en_tokens) + print(create_language_profile('en',en_text)) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: @@ -16,6 +21,5 @@ def main() -> None: result = None assert result, "Detection result is None" - if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt index 573541ac9..1e8b31496 100644 --- a/lab_1_classify_by_unigrams/target_score.txt +++ b/lab_1_classify_by_unigrams/target_score.txt @@ -1 +1 @@ -0 +6 diff --git a/seminars/practice_2_string.py b/seminars/practice_2_string.py index 62f254a84..b640c811f 100644 --- a/seminars/practice_2_string.py +++ b/seminars/practice_2_string.py @@ -53,13 +53,13 @@ def multiply_string(input_string: str, how_many: int) -> str: display the given string the number of times given in the `how_many`. """ # student realisation goes here - + print(input_string * how_many) # Function calls with expected result: -# multiply_string('Hi', 2) → 'HiHi' -# multiply_string('Hi', 3) → 'HiHiHi' -# multiply_string('Hi', 1) → 'Hi' -# multiply_string('Hi', 0) → '' +multiply_string('Hi', 2) # → 'HiHi' +multiply_string('Hi', 3) # → 'HiHiHi' +multiply_string('Hi', 1) # → 'Hi' +multiply_string('Hi', 0) # → '' # Task 2: From d308b37dd18ae73fbdc7e042fd12122fc037c5d1 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Fri, 22 Sep 2023 11:03:00 +0300 Subject: [PATCH 003/107] created a new file --- main.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 main.py diff --git a/main.py b/main.py deleted file mode 100644 index ec7780cab..000000000 --- a/main.py +++ /dev/null @@ -1 +0,0 @@ -print('Hello, world!') From d1d283bdc6bf3fcd24ed720c6af4ae40407642ca Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 28 Sep 2023 14:33:54 +0300 Subject: [PATCH 004/107] steps 4 and 5 done --- lab_1_classify_by_unigrams/main.py | 76 +++++++++++++++++++++++------ lab_1_classify_by_unigrams/start.py | 3 +- 2 files changed, 64 insertions(+), 15 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 2fefbd5d6..cf09dde14 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -2,7 +2,6 @@ Lab 1 Language detection """ -from typing import List def tokenize(text: str) -> list[str] | None: @@ -12,8 +11,12 @@ def tokenize(text: str) -> list[str] | None: :param text: a text :return: a list of lower-cased tokens without punctuation """ - tokens: list[str] = [t for t in text.lower() if t.isalpha()] - return tokens + if isinstance(text, str) == True: + tokens = [t for t in text.lower() if (t.isalpha() and t != 'º')] + tokens.sort() + return tokens + else: + return None def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ @@ -21,17 +24,23 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: :param tokens: a list of tokens :return: a dictionary with frequencies """ - dict_tokens = {} - all_tokens = 0 - for token in tokens: - all_tokens += 1 - num_token = tokens.count(token) - dict_tokens[token] = num_token - dict_freq = {} - for key, value in dict_tokens.items(): - freq = value/all_tokens - dict_freq[key] = freq - return dict_freq + if isinstance(tokens, list) == True: + for token in tokens: + if isinstance(token, str) == False: + return None + dict_tokens = {} + all_tokens = 0 + for token in tokens: + all_tokens += 1 + num_token = tokens.count(token) + dict_tokens[token] = num_token + dict_freq = {} + for key, value in dict_tokens.items(): + freq = value/all_tokens + dict_freq[key] = freq + return dict_freq + else: + return None def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None: @@ -54,6 +63,14 @@ def calculate_mse(predicted: list, actual: list) -> float | None: :param actual: a list of actual values :return: the score """ + if isinstance(predicted, list) and isinstance(actual, list) == True and len(predicted) == len(actual): + sum_diff = 0 + for i in range(0, len(predicted)): + sum_diff += (actual[i] - predicted[i]) ** 2 + mse = sum_diff / len (predicted) + return mse + else: + return None def compare_profiles( @@ -66,6 +83,37 @@ def compare_profiles( :param profile_to_compare: a dictionary of a profile to compare the unknown profile to :return: the distance between the profiles """ + if isinstance(unknown_profile, dict) and isinstance(profile_to_compare, dict) == True: + if ('name' and 'freq' in unknown_profile) and ('name' and 'freq' in profile_to_compare) == True: + + predicted_tokens = profile_to_compare.get('freq') + actual_tokens = unknown_profile.get('freq') + + for key in predicted_tokens.keys(): + if key in actual_tokens == False: + actual_tokens[key] = 0 + + for key in actual_tokens.keys(): + if key in predicted_tokens == False: + predicted_tokens[key] = 0 + + predicted = [] + for value in predicted_tokens.values(): + predicted.append(value) + predicted.sort() + + actual = [] + for value in actual_tokens.values(): + actual.append(value) + actual.sort() + + mse = calculate_mse(predicted, actual) + return mse + else: + return None + else: + return None + def detect_language( diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index b7b0053fa..48e6d28ef 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -4,6 +4,7 @@ from lab_1_classify_by_unigrams.main import tokenize from lab_1_classify_by_unigrams.main import calculate_frequencies from lab_1_classify_by_unigrams.main import create_language_profile +from lab_1_classify_by_unigrams.main import calculate_mse def main() -> None: """ @@ -12,7 +13,7 @@ def main() -> None: with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: en_text = file_to_read_en.read() en_tokens = (tokenize(en_text)) - print(en_tokens) + #print(en_tokens) print(create_language_profile('en',en_text)) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() From b00f1ee8032ee647a9d923baac0f3e327cfee929 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Fri, 29 Sep 2023 11:27:33 +0300 Subject: [PATCH 005/107] tokenize() early return done --- lab_1_classify_by_unigrams/main.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index aec97408a..b7c546345 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -11,12 +11,11 @@ def tokenize(text: str) -> list[str] | None: :param text: a text :return: a list of lower-cased tokens without punctuation """ - if isinstance(text, str) == True: - tokens = [t for t in text.lower() if (t.isalpha() and t != 'º')] - tokens.sort() - return tokens - else: + if isinstance(text, str) == False: return None + tokens = [t for t in text.lower() if (t.isalpha() and t != 'º')] + tokens.sort() + return tokens def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ From ff6464d88ef68fb4dc2c9c2247cfab391b922bcf Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Fri, 29 Sep 2023 14:36:47 +0300 Subject: [PATCH 006/107] early return done in other functions --- lab_1_classify_by_unigrams/main.py | 98 +++++++++++++++--------------- 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index b7c546345..b0c936085 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -14,7 +14,6 @@ def tokenize(text: str) -> list[str] | None: if isinstance(text, str) == False: return None tokens = [t for t in text.lower() if (t.isalpha() and t != 'º')] - tokens.sort() return tokens def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: @@ -23,23 +22,22 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: :param tokens: a list of tokens :return: a dictionary with frequencies """ - if isinstance(tokens, list) == True: - for token in tokens: - if isinstance(token, str) == False: - return None - dict_tokens = {} - all_tokens = 0 - for token in tokens: - all_tokens += 1 - num_token = tokens.count(token) - dict_tokens[token] = num_token - dict_freq = {} - for key, value in dict_tokens.items(): - freq = value/all_tokens - dict_freq[key] = freq - return dict_freq - else: + if isinstance(tokens, list) == False: return None + for token in tokens: + if isinstance(token, str) == False: + return None + dict_tokens = {} + all_tokens = 0 + for token in tokens: + all_tokens += 1 + num_token = tokens.count(token) + dict_tokens[token] = num_token + dict_freq = {} + for key, value in dict_tokens.items(): + freq = value/all_tokens + dict_freq[key] = freq + return dict_freq def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None: @@ -49,6 +47,8 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st :param text: a text :return: a dictionary with two keys – name, freq """ + if isinstance(language, str) == False or isinstance(text, str) == False: + return None lang_profile = {} lang_profile['name'] = language lang_profile['freq'] = calculate_frequencies(tokenize(text)) @@ -62,14 +62,13 @@ def calculate_mse(predicted: list, actual: list) -> float | None: :param actual: a list of actual values :return: the score """ - if isinstance(predicted, list) and isinstance(actual, list) == True and len(predicted) == len(actual): - sum_diff = 0 - for i in range(0, len(predicted)): - sum_diff += (actual[i] - predicted[i]) ** 2 - mse = sum_diff / len (predicted) - return mse - else: + if isinstance(predicted, list) == False or isinstance(actual, list) == False or len(predicted) != len(actual): return None + sum_diff = 0 + for i in range(0, len(predicted)): + sum_diff += (actual[i] - predicted[i]) ** 2 + mse = sum_diff / len (predicted) + return mse def compare_profiles( @@ -82,36 +81,39 @@ def compare_profiles( :param profile_to_compare: a dictionary of a profile to compare the unknown profile to :return: the distance between the profiles """ - if isinstance(unknown_profile, dict) and isinstance(profile_to_compare, dict) == True: - if ('name' and 'freq' in unknown_profile) and ('name' and 'freq' in profile_to_compare) == True: + if isinstance(unknown_profile, dict) == False or isinstance(profile_to_compare, dict) == False: + return None + if ('name' or 'freq' not in unknown_profile) or ('name' or 'freq' not in profile_to_compare): + return None + #if len(unknown_profile) != len(profile_to_compare): + #return None - predicted_tokens = profile_to_compare.get('freq') - actual_tokens = unknown_profile.get('freq') + sorted_profile_to_compare = dict(sorted(profile_to_compare.items())) + sorted_unknown_profile = dict(sorted(unknown_profile.items())) - for key in predicted_tokens.keys(): - if key in actual_tokens == False: - actual_tokens[key] = 0 + predicted_tokens = sorted_profile_to_compare.get('freq') + actual_tokens = sorted_unknown_profile.get('freq') - for key in actual_tokens.keys(): - if key in predicted_tokens == False: - predicted_tokens[key] = 0 + for key in predicted_tokens.keys(): + if key not in actual_tokens: + actual_tokens[key] = 0 - predicted = [] - for value in predicted_tokens.values(): - predicted.append(value) - predicted.sort() + for key in actual_tokens.keys(): + if key not in predicted_tokens: + predicted_tokens[key] = 0 - actual = [] - for value in actual_tokens.values(): - actual.append(value) - actual.sort() + predicted = [] + for value in predicted_tokens.values(): + predicted.append(value) + + actual = [] + for value in actual_tokens.values(): + actual.append(value) + + mse = calculate_mse(predicted, actual) + mse = round(mse, 3) + return mse - mse = calculate_mse(predicted, actual) - return mse - else: - return None - else: - return None From 1b6aa1dcabac6bc3718ef69a01abf2c8b1ee96be Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Mon, 2 Oct 2023 17:05:58 +0300 Subject: [PATCH 007/107] steps 5-6 done --- lab_1_classify_by_unigrams/main.py | 20 +++++++++++++++++++- lab_1_classify_by_unigrams/start.py | 12 +++++++----- lab_1_classify_by_unigrams/target_score.txt | 2 +- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index b0c936085..6dd40f306 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -83,7 +83,9 @@ def compare_profiles( """ if isinstance(unknown_profile, dict) == False or isinstance(profile_to_compare, dict) == False: return None - if ('name' or 'freq' not in unknown_profile) or ('name' or 'freq' not in profile_to_compare): + if 'name' not in unknown_profile or 'freq' not in unknown_profile: + return None + if 'name' not in profile_to_compare or 'freq' not in profile_to_compare: return None #if len(unknown_profile) != len(profile_to_compare): #return None @@ -129,6 +131,22 @@ def detect_language( :param profile_2: a dictionary of a known profile :return: a language """ + if isinstance(unknown_profile, dict) == False: + return None + if isinstance(profile_1, dict) == False or isinstance(profile_2, dict) == False: + return None + mse_1 = compare_profiles(unknown_profile, profile_1) + mse_2 = compare_profiles(unknown_profile, profile_2) + if mse_1 == mse_2: + names = [] + names.append(profile_1['name']) + names.append(profile_2['name']) + names.sort() + return names[0] + elif mse_1 < mse_2: + return profile_1['name'] + else: + return profile_2['name'] def load_profile(path_to_file: str) -> dict | None: diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 48e6d28ef..8135c3ac4 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -1,10 +1,8 @@ """ Language detection starter """ -from lab_1_classify_by_unigrams.main import tokenize -from lab_1_classify_by_unigrams.main import calculate_frequencies from lab_1_classify_by_unigrams.main import create_language_profile -from lab_1_classify_by_unigrams.main import calculate_mse +from lab_1_classify_by_unigrams.main import detect_language def main() -> None: """ @@ -12,15 +10,19 @@ def main() -> None: """ with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: en_text = file_to_read_en.read() - en_tokens = (tokenize(en_text)) + #en_tokens = (tokenize(en_text)) #print(en_tokens) - print(create_language_profile('en',en_text)) + en_profile = create_language_profile('en', en_text) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() + de_profile = create_language_profile('de', de_text) with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() + unknown_profile = create_language_profile('unknown', unknown_text) result = None assert result, "Detection result is None" + detection_result = detect_language(unknown_profile, en_profile, de_profile) + return detection_result if __name__ == "__main__": main() \ No newline at end of file diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt index 1e8b31496..45a4fb75d 100644 --- a/lab_1_classify_by_unigrams/target_score.txt +++ b/lab_1_classify_by_unigrams/target_score.txt @@ -1 +1 @@ -6 +8 From b05ca58e3526d3605e285a860be157a6cded3f95 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Mon, 2 Oct 2023 17:14:28 +0300 Subject: [PATCH 008/107] some small changes done --- lab_1_classify_by_unigrams/main.py | 7 ++++--- lab_1_classify_by_unigrams/start.py | 7 +++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 6dd40f306..f01d5187e 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -142,11 +142,12 @@ def detect_language( names.append(profile_1['name']) names.append(profile_2['name']) names.sort() - return names[0] + language = names[0] elif mse_1 < mse_2: - return profile_1['name'] + language = profile_1['name'] else: - return profile_2['name'] + language = profile_2['name'] + return language def load_profile(path_to_file: str) -> dict | None: diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 8135c3ac4..f06561bf3 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -1,8 +1,9 @@ +from lab_1_classify_by_unigrams.main import create_language_profile +from lab_1_classify_by_unigrams.main import detect_language + """ Language detection starter """ -from lab_1_classify_by_unigrams.main import create_language_profile -from lab_1_classify_by_unigrams.main import detect_language def main() -> None: """ @@ -10,8 +11,6 @@ def main() -> None: """ with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: en_text = file_to_read_en.read() - #en_tokens = (tokenize(en_text)) - #print(en_tokens) en_profile = create_language_profile('en', en_text) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() From 97692845864e894d54554ff80f75dcd668e2be1d Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Mon, 2 Oct 2023 17:21:41 +0300 Subject: [PATCH 009/107] some small changes done --- lab_1_classify_by_unigrams/main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index f01d5187e..ea5c352a9 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -87,8 +87,6 @@ def compare_profiles( return None if 'name' not in profile_to_compare or 'freq' not in profile_to_compare: return None - #if len(unknown_profile) != len(profile_to_compare): - #return None sorted_profile_to_compare = dict(sorted(profile_to_compare.items())) sorted_unknown_profile = dict(sorted(unknown_profile.items())) From a2e3529d7f3cb395892b8494e5a8c632a690af37 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 4 Oct 2023 10:34:30 +0300 Subject: [PATCH 010/107] some review points fixed and changes in practice_2_string.py reverted --- lab_1_classify_by_unigrams/main.py | 36 ++++++++++++++--------------- lab_1_classify_by_unigrams/start.py | 10 ++++---- seminars/practice_2_string.py | 10 ++++---- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index ea5c352a9..0c19e6c11 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -11,7 +11,7 @@ def tokenize(text: str) -> list[str] | None: :param text: a text :return: a list of lower-cased tokens without punctuation """ - if isinstance(text, str) == False: + if not isinstance(text, str): return None tokens = [t for t in text.lower() if (t.isalpha() and t != 'º')] return tokens @@ -22,10 +22,10 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: :param tokens: a list of tokens :return: a dictionary with frequencies """ - if isinstance(tokens, list) == False: + if not isinstance(tokens, list): return None for token in tokens: - if isinstance(token, str) == False: + if not isinstance(token, str): return None dict_tokens = {} all_tokens = 0 @@ -47,7 +47,7 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st :param text: a text :return: a dictionary with two keys – name, freq """ - if isinstance(language, str) == False or isinstance(text, str) == False: + if not isinstance(language, str) or not isinstance(text, str): return None lang_profile = {} lang_profile['name'] = language @@ -62,10 +62,10 @@ def calculate_mse(predicted: list, actual: list) -> float | None: :param actual: a list of actual values :return: the score """ - if isinstance(predicted, list) == False or isinstance(actual, list) == False or len(predicted) != len(actual): + if not isinstance(predicted, list) or not isinstance(actual, list) or len(predicted) != len(actual): return None sum_diff = 0 - for i in range(0, len(predicted)): + for i in range(len(predicted)): sum_diff += (actual[i] - predicted[i]) ** 2 mse = sum_diff / len (predicted) return mse @@ -81,9 +81,9 @@ def compare_profiles( :param profile_to_compare: a dictionary of a profile to compare the unknown profile to :return: the distance between the profiles """ - if isinstance(unknown_profile, dict) == False or isinstance(profile_to_compare, dict) == False: + if not isinstance(unknown_profile, dict) or not isinstance(profile_to_compare, dict): return None - if 'name' not in unknown_profile or 'freq' not in unknown_profile: + if 'name' not in unknown_profile or 'freq' not in unknown_profile: return None if 'name' not in profile_to_compare or 'freq' not in profile_to_compare: return None @@ -102,13 +102,13 @@ def compare_profiles( if key not in predicted_tokens: predicted_tokens[key] = 0 - predicted = [] - for value in predicted_tokens.values(): - predicted.append(value) + predicted = list(predicted_tokens.values()) + #for value in predicted_tokens.values(): + #predicted.append(value) - actual = [] - for value in actual_tokens.values(): - actual.append(value) + actual = list(actual_tokens.values()) + #for value in actual_tokens.values(): + #actual.append(value) mse = calculate_mse(predicted, actual) mse = round(mse, 3) @@ -129,16 +129,14 @@ def detect_language( :param profile_2: a dictionary of a known profile :return: a language """ - if isinstance(unknown_profile, dict) == False: + if not isinstance(unknown_profile, dict): return None - if isinstance(profile_1, dict) == False or isinstance(profile_2, dict) == False: + if not isinstance(profile_1, dict) or not isinstance(profile_2, dict): return None mse_1 = compare_profiles(unknown_profile, profile_1) mse_2 = compare_profiles(unknown_profile, profile_2) if mse_1 == mse_2: - names = [] - names.append(profile_1['name']) - names.append(profile_2['name']) + names = [profile_1['name'], profile_2['name']] names.sort() language = names[0] elif mse_1 < mse_2: diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index f06561bf3..28e45a8be 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -1,9 +1,9 @@ -from lab_1_classify_by_unigrams.main import create_language_profile -from lab_1_classify_by_unigrams.main import detect_language - """ Language detection starter """ +from lab_1_classify_by_unigrams.main import create_language_profile +from lab_1_classify_by_unigrams.main import detect_language + def main() -> None: """ @@ -20,8 +20,8 @@ def main() -> None: unknown_profile = create_language_profile('unknown', unknown_text) result = None assert result, "Detection result is None" - detection_result = detect_language(unknown_profile, en_profile, de_profile) - return detection_result + result = detect_language(unknown_profile, en_profile, de_profile) + return result if __name__ == "__main__": main() \ No newline at end of file diff --git a/seminars/practice_2_string.py b/seminars/practice_2_string.py index b640c811f..62f254a84 100644 --- a/seminars/practice_2_string.py +++ b/seminars/practice_2_string.py @@ -53,13 +53,13 @@ def multiply_string(input_string: str, how_many: int) -> str: display the given string the number of times given in the `how_many`. """ # student realisation goes here - print(input_string * how_many) + # Function calls with expected result: -multiply_string('Hi', 2) # → 'HiHi' -multiply_string('Hi', 3) # → 'HiHiHi' -multiply_string('Hi', 1) # → 'Hi' -multiply_string('Hi', 0) # → '' +# multiply_string('Hi', 2) → 'HiHi' +# multiply_string('Hi', 3) → 'HiHiHi' +# multiply_string('Hi', 1) → 'Hi' +# multiply_string('Hi', 0) → '' # Task 2: From a1d1944d841c23dedb084038e431a4724dfa5320 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 4 Oct 2023 10:44:09 +0300 Subject: [PATCH 011/107] code style check fixed --- lab_1_classify_by_unigrams/main.py | 10 ++++++---- lab_1_classify_by_unigrams/start.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 0c19e6c11..a78f9e32e 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -62,12 +62,14 @@ def calculate_mse(predicted: list, actual: list) -> float | None: :param actual: a list of actual values :return: the score """ - if not isinstance(predicted, list) or not isinstance(actual, list) or len(predicted) != len(actual): + if not isinstance(predicted, list) or not isinstance(actual, list): + return None + if len(predicted) != len(actual): return None sum_diff = 0 - for i in range(len(predicted)): - sum_diff += (actual[i] - predicted[i]) ** 2 - mse = sum_diff / len (predicted) + for index, value in enumerate(predicted): + sum_diff += (actual[index] - value) ** 2 + mse = sum_diff / len(predicted) return mse diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 28e45a8be..931f7fb68 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -24,4 +24,4 @@ def main() -> None: return result if __name__ == "__main__": - main() \ No newline at end of file + main() From bc922b4ba99c70e221ba8fe4c10725b424adea0e Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 4 Oct 2023 10:55:06 +0300 Subject: [PATCH 012/107] import style checks fixed --- lab_1_classify_by_unigrams/start.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 931f7fb68..81fa58c32 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -1,10 +1,10 @@ -""" -Language detection starter -""" from lab_1_classify_by_unigrams.main import create_language_profile from lab_1_classify_by_unigrams.main import detect_language +""" +Language detection starter +""" def main() -> None: """ Launches an implementation From 400355b7506954d869d0ba1099dfdfd7cce87cf9 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 4 Oct 2023 10:57:41 +0300 Subject: [PATCH 013/107] import style checks fixed --- lab_1_classify_by_unigrams/start.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 81fa58c32..931f7fb68 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -1,10 +1,10 @@ +""" +Language detection starter +""" from lab_1_classify_by_unigrams.main import create_language_profile from lab_1_classify_by_unigrams.main import detect_language -""" -Language detection starter -""" def main() -> None: """ Launches an implementation From f89e2f41159debb4e0dc17f5656255915019f514 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 4 Oct 2023 22:33:37 +0300 Subject: [PATCH 014/107] a few changes done in calculate_mse and compare_profiles --- lab_1_classify_by_unigrams/main.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index a78f9e32e..c01a75655 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -70,6 +70,7 @@ def calculate_mse(predicted: list, actual: list) -> float | None: for index, value in enumerate(predicted): sum_diff += (actual[index] - value) ** 2 mse = sum_diff / len(predicted) + mse = round(mse, 4) return mse @@ -90,11 +91,15 @@ def compare_profiles( if 'name' not in profile_to_compare or 'freq' not in profile_to_compare: return None - sorted_profile_to_compare = dict(sorted(profile_to_compare.items())) - sorted_unknown_profile = dict(sorted(unknown_profile.items())) + #sorted_profile_to_compare + #predicted_tokens = dict(sorted(profile_to_compare['freq'].items())) + #sorted_unknown_profile + #actual_tokens = dict(sorted(unknown_profile['freq'].items())) + #sorted_profile_to_compare = tuple(sorted(profile_to_compare.items())) + #sorted_unknown_profile = tuple(sorted(unknown_profile.items())) - predicted_tokens = sorted_profile_to_compare.get('freq') - actual_tokens = sorted_unknown_profile.get('freq') + predicted_tokens = profile_to_compare.get('freq') + actual_tokens = unknown_profile.get('freq') for key in predicted_tokens.keys(): if key not in actual_tokens: @@ -104,16 +109,14 @@ def compare_profiles( if key not in predicted_tokens: predicted_tokens[key] = 0 - predicted = list(predicted_tokens.values()) - #for value in predicted_tokens.values(): - #predicted.append(value) + sorted_predicted_tokens = dict(sorted(predicted_tokens.items())) + sorted_actual_tokens = dict(sorted(actual_tokens.items())) - actual = list(actual_tokens.values()) - #for value in actual_tokens.values(): - #actual.append(value) + predicted = list(sorted_predicted_tokens.values()) + + actual = list(sorted_actual_tokens.values()) mse = calculate_mse(predicted, actual) - mse = round(mse, 3) return mse @@ -140,7 +143,7 @@ def detect_language( if mse_1 == mse_2: names = [profile_1['name'], profile_2['name']] names.sort() - language = names[0] + language = names[0] elif mse_1 < mse_2: language = profile_1['name'] else: From 4082ce597e75fccc77dfbb62e4459b08d8677ef6 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 4 Oct 2023 22:45:13 +0300 Subject: [PATCH 015/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index c01a75655..a349f1aa5 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -49,9 +49,9 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st """ if not isinstance(language, str) or not isinstance(text, str): return None - lang_profile = {} - lang_profile['name'] = language - lang_profile['freq'] = calculate_frequencies(tokenize(text)) + lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} + #freq = calculate_frequencies(tokenize(text)) + #lang_profile['freq'] = freq return lang_profile From fbce250b420847ca3043d0c342c48f6b2566ea6d Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 4 Oct 2023 22:47:58 +0300 Subject: [PATCH 016/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index a349f1aa5..ac5becf16 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -49,7 +49,8 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st """ if not isinstance(language, str) or not isinstance(text, str): return None - lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} + freq = calculate_frequencies(tokenize(text)) + lang_profile = {'name': language, 'freq': freq} #freq = calculate_frequencies(tokenize(text)) #lang_profile['freq'] = freq return lang_profile From ce18afe3fdabe34bfd8946e55e26caae35b5cafc Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 4 Oct 2023 23:00:11 +0300 Subject: [PATCH 017/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index ac5becf16..6a62ae4f5 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -49,11 +49,8 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st """ if not isinstance(language, str) or not isinstance(text, str): return None - freq = calculate_frequencies(tokenize(text)) - lang_profile = {'name': language, 'freq': freq} - #freq = calculate_frequencies(tokenize(text)) - #lang_profile['freq'] = freq - return lang_profile + lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} + return lang_profile[:2] def calculate_mse(predicted: list, actual: list) -> float | None: From 8cbc6af6bcc7ab29a253924477fa1d154ec1a49d Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 4 Oct 2023 23:02:14 +0300 Subject: [PATCH 018/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 6a62ae4f5..07368fed1 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -50,7 +50,7 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st if not isinstance(language, str) or not isinstance(text, str): return None lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} - return lang_profile[:2] + return lang_profile def calculate_mse(predicted: list, actual: list) -> float | None: From a566adcd4a10286bb935a35e7582c82d38860c1e Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 11:19:39 +0300 Subject: [PATCH 019/107] fixing import style checks --- lab_1_classify_by_unigrams/start.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 931f7fb68..01a8b728b 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -1,8 +1,7 @@ """ Language detection starter """ -from lab_1_classify_by_unigrams.main import create_language_profile -from lab_1_classify_by_unigrams.main import detect_language +import lab_1_classify_by_unigrams.main def main() -> None: @@ -11,16 +10,16 @@ def main() -> None: """ with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: en_text = file_to_read_en.read() - en_profile = create_language_profile('en', en_text) + en_profile = lab_1_classify_by_unigrams.main.create_language_profile('en', en_text) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() - de_profile = create_language_profile('de', de_text) + de_profile = lab_1_classify_by_unigrams.main.create_language_profile('de', de_text) with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() - unknown_profile = create_language_profile('unknown', unknown_text) + unknown_profile = lab_1_classify_by_unigrams.main.create_language_profile('unknown', unknown_text) result = None assert result, "Detection result is None" - result = detect_language(unknown_profile, en_profile, de_profile) + result = lab_1_classify_by_unigrams.main.detect_language(unknown_profile, en_profile, de_profile) return result if __name__ == "__main__": From 5753e05151d0fc71ef94a7f8ad06cbcb0893feae Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 11:48:23 +0300 Subject: [PATCH 020/107] directly returning in detect_language, no rounding MSE, unused code removed --- lab_1_classify_by_unigrams/main.py | 30 ++++++++++++----------------- lab_1_classify_by_unigrams/start.py | 12 ++++++------ 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 07368fed1..bda1d0c35 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -16,6 +16,7 @@ def tokenize(text: str) -> list[str] | None: tokens = [t for t in text.lower() if (t.isalpha() and t != 'º')] return tokens + def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ Calculates frequencies of given tokens @@ -68,7 +69,6 @@ def calculate_mse(predicted: list, actual: list) -> float | None: for index, value in enumerate(predicted): sum_diff += (actual[index] - value) ** 2 mse = sum_diff / len(predicted) - mse = round(mse, 4) return mse @@ -89,13 +89,6 @@ def compare_profiles( if 'name' not in profile_to_compare or 'freq' not in profile_to_compare: return None - #sorted_profile_to_compare - #predicted_tokens = dict(sorted(profile_to_compare['freq'].items())) - #sorted_unknown_profile - #actual_tokens = dict(sorted(unknown_profile['freq'].items())) - #sorted_profile_to_compare = tuple(sorted(profile_to_compare.items())) - #sorted_unknown_profile = tuple(sorted(unknown_profile.items())) - predicted_tokens = profile_to_compare.get('freq') actual_tokens = unknown_profile.get('freq') @@ -107,19 +100,21 @@ def compare_profiles( if key not in predicted_tokens: predicted_tokens[key] = 0 - sorted_predicted_tokens = dict(sorted(predicted_tokens.items())) - sorted_actual_tokens = dict(sorted(actual_tokens.items())) + sorted_predicted_tokens = (sorted(predicted_tokens.items())) + sorted_actual_tokens = (sorted(actual_tokens.items())) - predicted = list(sorted_predicted_tokens.values()) + predicted = [] + for item in sorted_predicted_tokens: + predicted.append(item[1]) - actual = list(sorted_actual_tokens.values()) + actual = [] + for item in sorted_actual_tokens: + actual.append(item[1]) mse = calculate_mse(predicted, actual) return mse - - def detect_language( unknown_profile: dict[str, str | dict[str, float]], profile_1: dict[str, str | dict[str, float]], @@ -141,12 +136,11 @@ def detect_language( if mse_1 == mse_2: names = [profile_1['name'], profile_2['name']] names.sort() - language = names[0] + return names[0] elif mse_1 < mse_2: - language = profile_1['name'] + return profile_1['name'] else: - language = profile_2['name'] - return language + return profile_2['name'] def load_profile(path_to_file: str) -> dict | None: diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 01a8b728b..1f8353124 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -1,7 +1,7 @@ """ Language detection starter """ -import lab_1_classify_by_unigrams.main +import lab_1_classify_by_unigrams.main as main_py def main() -> None: @@ -10,16 +10,16 @@ def main() -> None: """ with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: en_text = file_to_read_en.read() - en_profile = lab_1_classify_by_unigrams.main.create_language_profile('en', en_text) + en_profile = main_py.create_language_profile('en', en_text) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() - de_profile = lab_1_classify_by_unigrams.main.create_language_profile('de', de_text) + de_profile = main_py.create_language_profile('de', de_text) with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() - unknown_profile = lab_1_classify_by_unigrams.main.create_language_profile('unknown', unknown_text) - result = None + unknown_profile = main_py.create_language_profile('unknown', unknown_text) + #result = None + result = main_py.detect_language(unknown_profile, en_profile, de_profile) assert result, "Detection result is None" - result = lab_1_classify_by_unigrams.main.detect_language(unknown_profile, en_profile, de_profile) return result if __name__ == "__main__": From 32b11bdfc46597b70305eb92af940f6ea7cb77e2 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 11:51:48 +0300 Subject: [PATCH 021/107] code style fixed --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index bda1d0c35..3b9d814f2 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -137,7 +137,7 @@ def detect_language( names = [profile_1['name'], profile_2['name']] names.sort() return names[0] - elif mse_1 < mse_2: + if mse_1 < mse_2: return profile_1['name'] else: return profile_2['name'] From 4e3dd4c18837c3639b05f47221cbf134526c3cce Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 11:54:02 +0300 Subject: [PATCH 022/107] code style fixed --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 3b9d814f2..65ee5b8ee 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -139,7 +139,7 @@ def detect_language( return names[0] if mse_1 < mse_2: return profile_1['name'] - else: + if mse_1 > mse_2: return profile_2['name'] From 15ab62aed66bd2273c719793dca87c015afc8ea5 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 12:03:57 +0300 Subject: [PATCH 023/107] code style and mypy checks fixed --- lab_1_classify_by_unigrams/main.py | 2 +- lab_1_classify_by_unigrams/start.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 65ee5b8ee..dc6bef293 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -141,7 +141,7 @@ def detect_language( return profile_1['name'] if mse_1 > mse_2: return profile_2['name'] - + return None def load_profile(path_to_file: str) -> dict | None: """ diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 1f8353124..d6450a4e8 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -17,10 +17,8 @@ def main() -> None: with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() unknown_profile = main_py.create_language_profile('unknown', unknown_text) - #result = None result = main_py.detect_language(unknown_profile, en_profile, de_profile) assert result, "Detection result is None" - return result if __name__ == "__main__": main() From 473251505df1010d97401b02dfe1675af4e89e44 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 12:29:18 +0300 Subject: [PATCH 024/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index dc6bef293..1b70b0bae 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -50,8 +50,11 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st """ if not isinstance(language, str) or not isinstance(text, str): return None - lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} - return lang_profile + freq = calculate_frequencies(tokenize(text)) + if isinstance(freq, dict): + lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} + return lang_profile + return None def calculate_mse(predicted: list, actual: list) -> float | None: From a30590fdd25b523ebd5257c763db41fd2f176e9d Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 12:32:29 +0300 Subject: [PATCH 025/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 1b70b0bae..b8e9557b4 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -51,10 +51,10 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st if not isinstance(language, str) or not isinstance(text, str): return None freq = calculate_frequencies(tokenize(text)) - if isinstance(freq, dict): - lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} - return lang_profile - return None + if not isinstance(freq, dict): + return None + lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} + return lang_profile def calculate_mse(predicted: list, actual: list) -> float | None: From c777c8a647284736198c5ce6c10abc3051953e3d Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 12:38:54 +0300 Subject: [PATCH 026/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index b8e9557b4..17abcdacf 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -51,10 +51,10 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st if not isinstance(language, str) or not isinstance(text, str): return None freq = calculate_frequencies(tokenize(text)) - if not isinstance(freq, dict): - return None - lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} - return lang_profile + if freq is not None: + lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} + return lang_profile + return None def calculate_mse(predicted: list, actual: list) -> float | None: From 1cd496f7d7366abeb0e7fd075fdbc60ae94365cf Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 12:41:14 +0300 Subject: [PATCH 027/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 17abcdacf..1d7b37eec 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -52,7 +52,7 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st return None freq = calculate_frequencies(tokenize(text)) if freq is not None: - lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} + lang_profile = {'name': language, 'freq': freq} return lang_profile return None From 4f3943a8497c7a289b116ec35cba61eee054457e Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 12:44:32 +0300 Subject: [PATCH 028/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 1d7b37eec..73ec89438 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -52,8 +52,7 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st return None freq = calculate_frequencies(tokenize(text)) if freq is not None: - lang_profile = {'name': language, 'freq': freq} - return lang_profile + return {'name': language, 'freq': freq} return None From 950adb65e5950568b7c2ef001144dd6eb250ad4b Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 12:49:07 +0300 Subject: [PATCH 029/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 73ec89438..4d14c56df 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -95,11 +95,11 @@ def compare_profiles( actual_tokens = unknown_profile.get('freq') for key in predicted_tokens.keys(): - if key not in actual_tokens: + if actual_tokens is not None and key not in actual_tokens: actual_tokens[key] = 0 for key in actual_tokens.keys(): - if key not in predicted_tokens: + if predicted_tokens is not None and key not in predicted_tokens: predicted_tokens[key] = 0 sorted_predicted_tokens = (sorted(predicted_tokens.items())) From 4280fbef42c75ab59d9c462e075c6d95d051d60f Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 12:54:47 +0300 Subject: [PATCH 030/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 4d14c56df..b33666858 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -139,9 +139,9 @@ def detect_language( names = [profile_1['name'], profile_2['name']] names.sort() return names[0] - if mse_1 < mse_2: + if mse_1 is not None and mse_2 is not None and mse_1 < mse_2: return profile_1['name'] - if mse_1 > mse_2: + if mse_1 is not None and mse_2 is not None and mse_1 > mse_2: return profile_2['name'] return None From 0c74f93fe66502304c08d11d2468205b3e1bc43f Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 13:11:25 +0300 Subject: [PATCH 031/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index b33666858..5b0f6f50a 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -138,13 +138,15 @@ def detect_language( if mse_1 == mse_2: names = [profile_1['name'], profile_2['name']] names.sort() - return names[0] - if mse_1 is not None and mse_2 is not None and mse_1 < mse_2: + if isinstance(names[0], str): + return names[0] + if mse_1 is not None and mse_2 is not None and mse_1 < mse_2 and isinstance(profile_1['name'], str): return profile_1['name'] - if mse_1 is not None and mse_2 is not None and mse_1 > mse_2: + if mse_1 is not None and mse_2 is not None and mse_1 > mse_2 and isinstance(profile_2['name'], str): return profile_2['name'] return None + def load_profile(path_to_file: str) -> dict | None: """ Loads a language profile From e3ad151c22e9bce3b0c643a2c083c073afd6fc64 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 13:16:45 +0300 Subject: [PATCH 032/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 5b0f6f50a..0689b7bd4 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -140,9 +140,11 @@ def detect_language( names.sort() if isinstance(names[0], str): return names[0] - if mse_1 is not None and mse_2 is not None and mse_1 < mse_2 and isinstance(profile_1['name'], str): + if (mse_1 is not None and mse_2 is not None and mse_1 < mse_2 and + isinstance(profile_1['name'], str)): return profile_1['name'] - if mse_1 is not None and mse_2 is not None and mse_1 > mse_2 and isinstance(profile_2['name'], str): + if (mse_1 is not None and mse_2 is not None and mse_1 > mse_2 and + isinstance(profile_2['name'], str)): return profile_2['name'] return None From 52a37684c034f4b13ee48d02be6c0f333200575f Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 13:22:36 +0300 Subject: [PATCH 033/107] fixing mypy checks --- lab_1_classify_by_unigrams/start.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index d6450a4e8..d52b3aa1e 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -17,8 +17,9 @@ def main() -> None: with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() unknown_profile = main_py.create_language_profile('unknown', unknown_text) - result = main_py.detect_language(unknown_profile, en_profile, de_profile) - assert result, "Detection result is None" + if isinstance(unknown_profile, dict) and isinstance(en_profile, dict) and isinstance(de_profile, dict): + result = main_py.detect_language(unknown_profile, en_profile, de_profile) + assert result, "Detection result is None" if __name__ == "__main__": main() From 9cda9f641ee5deeaa498a1412eb5f9573d6cb0d0 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 13:26:08 +0300 Subject: [PATCH 034/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 6 ++++-- lab_1_classify_by_unigrams/start.py | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 0689b7bd4..a76a35334 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -96,11 +96,13 @@ def compare_profiles( for key in predicted_tokens.keys(): if actual_tokens is not None and key not in actual_tokens: - actual_tokens[key] = 0 + value = actual_tokens.get(key, 0) + actual_tokens[key] = value for key in actual_tokens.keys(): if predicted_tokens is not None and key not in predicted_tokens: - predicted_tokens[key] = 0 + value = predicted_tokens.get(key, 0) + predicted_tokens[key] = value sorted_predicted_tokens = (sorted(predicted_tokens.items())) sorted_actual_tokens = (sorted(actual_tokens.items())) diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index d52b3aa1e..8104fe3d6 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -17,7 +17,9 @@ def main() -> None: with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() unknown_profile = main_py.create_language_profile('unknown', unknown_text) - if isinstance(unknown_profile, dict) and isinstance(en_profile, dict) and isinstance(de_profile, dict): + if (isinstance(unknown_profile, dict) and + isinstance(en_profile, dict) and + isinstance(de_profile, dict)): result = main_py.detect_language(unknown_profile, en_profile, de_profile) assert result, "Detection result is None" From 1a9e5e993b2c7edfa8759f2676de85c6482763de Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 13:31:58 +0300 Subject: [PATCH 035/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index a76a35334..426f141f1 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -95,12 +95,12 @@ def compare_profiles( actual_tokens = unknown_profile.get('freq') for key in predicted_tokens.keys(): - if actual_tokens is not None and key not in actual_tokens: + if actual_tokens is not None: value = actual_tokens.get(key, 0) actual_tokens[key] = value for key in actual_tokens.keys(): - if predicted_tokens is not None and key not in predicted_tokens: + if predicted_tokens is not None: value = predicted_tokens.get(key, 0) predicted_tokens[key] = value From a4aa4dc0fb49b13978cdd474ac15e942aa79bbe3 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 13:33:00 +0300 Subject: [PATCH 036/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 426f141f1..5ccfb27df 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -95,12 +95,12 @@ def compare_profiles( actual_tokens = unknown_profile.get('freq') for key in predicted_tokens.keys(): - if actual_tokens is not None: + if actual_tokens is not None and key not in actual_tokens: value = actual_tokens.get(key, 0) actual_tokens[key] = value for key in actual_tokens.keys(): - if predicted_tokens is not None: + if predicted_tokens is not None and key not in predicted_tokens: value = predicted_tokens.get(key, 0) predicted_tokens[key] = value From 2ef36140abeca3ecadb58605f1687bf94ee11ad4 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 15:08:08 +0300 Subject: [PATCH 037/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 43 +++++++++++++++++------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 5ccfb27df..84fb6cc32 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -91,29 +91,36 @@ def compare_profiles( if 'name' not in profile_to_compare or 'freq' not in profile_to_compare: return None - predicted_tokens = profile_to_compare.get('freq') - actual_tokens = unknown_profile.get('freq') + predicted_tokens = profile_to_compare.get('freq').keys() + actual_tokens = unknown_profile.get('freq').keys() - for key in predicted_tokens.keys(): - if actual_tokens is not None and key not in actual_tokens: - value = actual_tokens.get(key, 0) - actual_tokens[key] = value + #for key in predicted_tokens.keys(): + #if actual_tokens is not None and key not in actual_tokens: + #value = actual_tokens.get(key, 0) + #actual_tokens[key] = value - for key in actual_tokens.keys(): - if predicted_tokens is not None and key not in predicted_tokens: - value = predicted_tokens.get(key, 0) - predicted_tokens[key] = value - - sorted_predicted_tokens = (sorted(predicted_tokens.items())) - sorted_actual_tokens = (sorted(actual_tokens.items())) + #for key in actual_tokens.keys(): + #if predicted_tokens is not None and key not in predicted_tokens: + #value = predicted_tokens.get(key, 0) + #predicted_tokens[key] = value + tokens = set(actual_tokens).union(set(predicted_tokens)) predicted = [] - for item in sorted_predicted_tokens: - predicted.append(item[1]) - actual = [] - for item in sorted_actual_tokens: - actual.append(item[1]) + for token in tokens: + predicted.append(profile_to_compare.get('freq').get(token, 0)) + actual.append(unknown_profile.get('freq').get(token, 0)) + + #sorted_predicted_tokens = (sorted(predicted_tokens.items())) + #sorted_actual_tokens = (sorted(actual_tokens.items())) + + #predicted = [] + #for item in sorted_predicted_tokens: + #predicted.append(item[1]) + + #actual = [] + #for item in sorted_actual_tokens: + #actual.append(item[1]) mse = calculate_mse(predicted, actual) return mse From f7b17c37af2702ca3033d01f31720308e09b13d4 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 5 Oct 2023 15:14:56 +0300 Subject: [PATCH 038/107] fixing mypy checks --- lab_1_classify_by_unigrams/main.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 84fb6cc32..3f0c3894d 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -94,16 +94,6 @@ def compare_profiles( predicted_tokens = profile_to_compare.get('freq').keys() actual_tokens = unknown_profile.get('freq').keys() - #for key in predicted_tokens.keys(): - #if actual_tokens is not None and key not in actual_tokens: - #value = actual_tokens.get(key, 0) - #actual_tokens[key] = value - - #for key in actual_tokens.keys(): - #if predicted_tokens is not None and key not in predicted_tokens: - #value = predicted_tokens.get(key, 0) - #predicted_tokens[key] = value - tokens = set(actual_tokens).union(set(predicted_tokens)) predicted = [] actual = [] @@ -111,17 +101,6 @@ def compare_profiles( predicted.append(profile_to_compare.get('freq').get(token, 0)) actual.append(unknown_profile.get('freq').get(token, 0)) - #sorted_predicted_tokens = (sorted(predicted_tokens.items())) - #sorted_actual_tokens = (sorted(actual_tokens.items())) - - #predicted = [] - #for item in sorted_predicted_tokens: - #predicted.append(item[1]) - - #actual = [] - #for item in sorted_actual_tokens: - #actual.append(item[1]) - mse = calculate_mse(predicted, actual) return mse From 1bf2df16b875a26b28d2c5fa020b70459cba4561 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Tue, 10 Oct 2023 14:19:05 +0300 Subject: [PATCH 039/107] prepare_word and collect_frequencies functions done --- lab_2_tokenize_by_bpe/main.py | 40 ++++++++++++++++++++++++++ lab_2_tokenize_by_bpe/target_score.txt | 2 +- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 620a4d645..ee32a00c4 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -14,6 +14,32 @@ def prepare_word( :param end_of_word: a token that signifies the end of word :return: preprocessed word """ + if not isinstance(raw_word, str): + return None + if not isinstance(start_of_word, str) and start_of_word is not None: + return None + if not isinstance(end_of_word, str) and end_of_word is not None: + return None + + if start_of_word is None and end_of_word is None: + return tuple(raw_word) + elif start_of_word is None and end_of_word is not None: + symbol_list = [] + for symbol in raw_word: + symbol_list.append(symbol) + symbol_list.append(end_of_word) + return tuple(symbol_list) + elif start_of_word is not None and end_of_word is None: + symbol_list = [start_of_word] + for symbol in raw_word: + symbol_list.append(symbol) + return tuple(symbol_list) + else: + symbol_list = [start_of_word] + for symbol in raw_word: + symbol_list.append(symbol) + symbol_list.append(end_of_word) + return tuple(symbol_list) def collect_frequencies( @@ -26,6 +52,20 @@ def collect_frequencies( :param end_of_word: a token that signifies the end of word :return: dictionary in the form of """ + if not isinstance(text, str) or not isinstance(end_of_word, str): + return None + if not isinstance(start_of_word, str) and start_of_word is not None: + return None + + dict_freq = {} + words_list = text.split() + for word in words_list: + prepared_word = prepare_word(word, None, '') + if prepared_word is None: + return None + dict_freq[prepared_word] = words_list.count(word) + + return dict_freq def count_tokens_pairs( diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index 573541ac9..b8626c4cf 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -0 +4 From 0cd0fb3ceeabccc4d0db30302e61c084306ce930 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Tue, 10 Oct 2023 14:36:52 +0300 Subject: [PATCH 040/107] fixing check start.py --- lab_2_tokenize_by_bpe/start.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 798e957e0..07eb13bb1 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -2,7 +2,7 @@ BPE Tokenizer starter """ from pathlib import Path - +import lab_2_tokenize_by_bpe.main as main_py def main() -> None: """ @@ -11,6 +11,7 @@ def main() -> None: assets_path = Path(__file__).parent / 'assets' with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() + main_py.collect_frequencies(text, None, '') result = None assert result, "Encoding is not working" From e7285cef09eb23f0279a4032f900db682294b6d1 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Tue, 10 Oct 2023 14:39:15 +0300 Subject: [PATCH 041/107] fixing check start.py --- lab_2_tokenize_by_bpe/start.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 07eb13bb1..ac2c6d5d5 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -11,9 +11,8 @@ def main() -> None: assets_path = Path(__file__).parent / 'assets' with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - main_py.collect_frequencies(text, None, '') - result = None + result = main_py.collect_ngrams(text, None, '') assert result, "Encoding is not working" From 32b301a40b89c423bb098c96f63e3a56aaa08f8d Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Tue, 10 Oct 2023 14:41:09 +0300 Subject: [PATCH 042/107] fixing check start.py --- lab_2_tokenize_by_bpe/start.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index ac2c6d5d5..46d5a94c5 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -12,7 +12,7 @@ def main() -> None: with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - result = main_py.collect_ngrams(text, None, '') + result = main_py.collect_frequencies(text, None, '') assert result, "Encoding is not working" From 8cb26a72fe76f13cb67f3cdd3cd40c62a7c0fbc6 Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Wed, 11 Oct 2023 11:02:15 +0300 Subject: [PATCH 043/107] checkout labs from the origin repository --- lab_1_classify_by_unigrams/start.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index b9e11c06d..4a17442d0 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -1,7 +1,6 @@ """ Language detection starter """ -import lab_1_classify_by_unigrams.main as main_py from lab_1_classify_by_unigrams.main import (collect_profiles, create_language_profile, detect_language_advanced, print_report) @@ -13,10 +12,8 @@ def main() -> None: """ with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: en_text = file_to_read_en.read() - en_profile = main_py.create_language_profile('en', en_text) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() - de_profile = main_py.create_language_profile('de', de_text) with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() From 581905ef3fa70ba7b533938573b22380a50b0ea5 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 19 Oct 2023 06:28:37 +0300 Subject: [PATCH 044/107] steps 3-5 done --- lab_2_tokenize_by_bpe/main.py | 137 ++++++++++++++++++++----- lab_2_tokenize_by_bpe/start.py | 3 +- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 3 files changed, 112 insertions(+), 30 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index ee32a00c4..ed3928a8b 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -5,7 +5,7 @@ def prepare_word( - raw_word: str, start_of_word: str | None, end_of_word: str | None + raw_word: str, start_of_word: str | None, end_of_word: str | None ) -> tuple[str, ...] | None: """ Tokenizes word into unigrams and appends end-of-word token @@ -21,29 +21,25 @@ def prepare_word( if not isinstance(end_of_word, str) and end_of_word is not None: return None + symbol_list = [] + for symbol in raw_word: + symbol_list.append(symbol) + if start_of_word is None and end_of_word is None: return tuple(raw_word) elif start_of_word is None and end_of_word is not None: - symbol_list = [] - for symbol in raw_word: - symbol_list.append(symbol) symbol_list.append(end_of_word) return tuple(symbol_list) elif start_of_word is not None and end_of_word is None: - symbol_list = [start_of_word] - for symbol in raw_word: - symbol_list.append(symbol) - return tuple(symbol_list) - else: - symbol_list = [start_of_word] - for symbol in raw_word: - symbol_list.append(symbol) - symbol_list.append(end_of_word) + symbol_list.insert(0, start_of_word) return tuple(symbol_list) + symbol_list.insert(0, start_of_word) + symbol_list.append(end_of_word) + return tuple(symbol_list) def collect_frequencies( - text: str, start_of_word: str | None, end_of_word: str + text: str, start_of_word: str | None, end_of_word: str ) -> dict[tuple[str, ...], int] | None: """ Counts number of occurrences of each word @@ -60,26 +56,51 @@ def collect_frequencies( dict_freq = {} words_list = text.split() for word in words_list: - prepared_word = prepare_word(word, None, '') + prepared_word = prepare_word(word, start_of_word, end_of_word) if prepared_word is None: return None - dict_freq[prepared_word] = words_list.count(word) + if prepared_word not in dict_freq: + dict_freq[prepared_word] = words_list.count(word) return dict_freq def count_tokens_pairs( - word_frequencies: dict[tuple[str, ...], int] + word_frequencies: dict[tuple[str, ...], int] ) -> dict[tuple[str, str], int] | None: """ Counts number of occurrences of each pair of subsequent tokens :param word_frequencies: dictionary in the form of :return: dictionary in the form of """ + if not isinstance(word_frequencies, dict): + return None + + pair_dict = {} + + for word in word_frequencies: + for index, token in enumerate(word): + if index == len(word) - 1: + break + next_token_index = index + 1 + pair = (token, word[next_token_index]) + pair_dict[pair] = 0 + + for pair in pair_dict.copy(): + freq = 0 + for word in word_frequencies: + for index, token in enumerate(word): + if index == len(word) - 1: + break + if token == pair[0] and word[index + 1] == pair[1]: + freq += word_frequencies[word] + pair_dict[pair] = freq + + return pair_dict def merge_tokens( - word_frequencies: dict[tuple[str, ...], int], pair: tuple[str, str] + word_frequencies: dict[tuple[str, ...], int], pair: tuple[str, str] ) -> dict[tuple[str, ...], int] | None: """ Updates word frequency dictionary by replacing a pair of token with a merged one @@ -87,10 +108,32 @@ def merge_tokens( :param pair: a pair of tokens to be merged :return: dictionary in the form of """ + if not isinstance(word_frequencies, dict) or not isinstance(pair, tuple): + return None + + word_freq_updated = word_frequencies.copy() + + for word in word_frequencies: + new_word = [] + if pair[0] in word and pair[1] in word: + for index, token in enumerate(word): + if token == pair[1] and word[index - 1] == pair[0]: + pass + elif token != pair[0]: + new_word.append(token) + elif token == pair[0] and word[index + 1] != pair[1]: + new_word.append(token) + elif token == pair[0] and word[index + 1] == pair[1]: + new_word.append(pair[0] + pair[1]) + + value = word_freq_updated.pop(word) + word_freq_updated[tuple(new_word)] = value + + return word_freq_updated def train( - word_frequencies: dict[tuple[str, ...], int] | None, num_merges: int + word_frequencies: dict[tuple[str, ...], int] | None, num_merges: int ) -> dict[tuple[str, ...], int] | None: """ Creates required number of new tokens by merging existing ones @@ -98,10 +141,48 @@ def train( :param num_merges: required number of new tokens :return: dictionary in the form of """ + if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int): + return None + + for i in range(num_merges): + pair_dict = count_tokens_pairs(word_frequencies) + if pair_dict is None: + return None + max_value = max(list(pair_dict.values())) + pair_list = [key for key, value in pair_dict.items() if value == max_value] + if len(pair_list) > 1: + len_list = [] + for pair in pair_list: + keys = list(word_frequencies.keys()) + if pair[1] == keys[0][-1]: + pair_len = len(pair[0]) + 1 + elif keys[0][-1] in pair[0]: + pair_len = len(pair[0]) - len(keys[0][-1]) + 1 + len(pair[1]) + elif keys[0][-1] in pair[1]: + pair_len = len(pair[0]) + len(pair[1]) - len(keys[0][-1]) + 1 + else: + pair_len = len(pair[0]) + len(pair[1]) + len_list.append(pair_len) + if len_list.count(max(len_list)) > 1: + max_len_pairs = [] + for j in range(len_list.count(max(len_list))): + max_len_pairs.append(pair_list[len_list.index(max(len_list))]) + pair_list.pop(len_list.index(max(len_list))) + len_list.pop(len_list.index(max(len_list))) + max_len_pairs.sort() + word_frequencies = merge_tokens(word_frequencies, max_len_pairs[0]) + else: + word_frequencies = merge_tokens(word_frequencies, pair_list[len_list.index(max(len_list))]) + elif len(pair_list) == 1: + word_frequencies = merge_tokens(word_frequencies, pair_list[0]) + if word_frequencies is None: + return None + + return word_frequencies def get_vocabulary( - word_frequencies: dict[tuple[str, ...], int], unknown_token: str + word_frequencies: dict[tuple[str, ...], int], unknown_token: str ) -> dict[str, int] | None: """ Establishes correspondence between tokens and its integer identifier @@ -112,7 +193,7 @@ def get_vocabulary( def decode( - encoded_text: list[int] | None, vocabulary: dict[str, int] | None, end_of_word_token: str | None + encoded_text: list[int] | None, vocabulary: dict[str, int] | None, end_of_word_token: str | None ) -> str | None: """ Translates encoded sequence into decoded one @@ -124,7 +205,7 @@ def decode( def tokenize_word( - word: tuple[str, ...], vocabulary: dict[str, int], end_of_word: str | None, unknown_token: str + word: tuple[str, ...], vocabulary: dict[str, int], end_of_word: str | None, unknown_token: str ) -> list[int] | None: """ Splits word into tokens @@ -145,11 +226,11 @@ def load_vocabulary(vocab_path: str) -> dict[str, int] | None: def encode( - original_text: str, - vocabulary: dict[str, int] | None, - start_of_word_token: str | None, - end_of_word_token: str | None, - unknown_token: str, + original_text: str, + vocabulary: dict[str, int] | None, + start_of_word_token: str | None, + end_of_word_token: str | None, + unknown_token: str, ) -> list[int] | None: """ Translates decoded sequence into encoded one @@ -172,7 +253,7 @@ def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: def calculate_precision( - actual: list[tuple[str, ...]], reference: list[tuple[str, ...]] + actual: list[tuple[str, ...]], reference: list[tuple[str, ...]] ) -> float | None: """ Compares two sequences by virtue of Precision metric diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 46d5a94c5..0f0661d33 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -12,7 +12,8 @@ def main() -> None: with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - result = main_py.collect_frequencies(text, None, '') + word_frequencies = main_py.collect_frequencies(text, None, '') + result = main_py.train(word_frequencies, 100) assert result, "Encoding is not working" diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index b8626c4cf..1e8b31496 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -4 +6 From ae180553c2d4d86c6eb77b09f322aaca5daf6c21 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 19 Oct 2023 06:34:29 +0300 Subject: [PATCH 045/107] import style check fixed --- lab_2_tokenize_by_bpe/start.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 0f0661d33..b5e54dc69 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -4,6 +4,7 @@ from pathlib import Path import lab_2_tokenize_by_bpe.main as main_py + def main() -> None: """ Launches an implementation From 4a226cb500a8892013287223bf0d8c4839239a46 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 19 Oct 2023 06:36:38 +0300 Subject: [PATCH 046/107] import style check fixed --- lab_2_tokenize_by_bpe/start.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index b5e54dc69..b80ceb107 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -2,6 +2,7 @@ BPE Tokenizer starter """ from pathlib import Path + import lab_2_tokenize_by_bpe.main as main_py From 618d59d449172c20aaa90a2e810dbe3983f985a7 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 25 Oct 2023 16:58:35 +0300 Subject: [PATCH 047/107] rewrote train function --- lab_2_tokenize_by_bpe/main.py | 93 +++++++++++++---------------------- 1 file changed, 34 insertions(+), 59 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index ed3928a8b..6bf1c2d92 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -21,25 +21,17 @@ def prepare_word( if not isinstance(end_of_word, str) and end_of_word is not None: return None - symbol_list = [] - for symbol in raw_word: - symbol_list.append(symbol) + symbol_list = list(raw_word) - if start_of_word is None and end_of_word is None: - return tuple(raw_word) - elif start_of_word is None and end_of_word is not None: - symbol_list.append(end_of_word) - return tuple(symbol_list) - elif start_of_word is not None and end_of_word is None: + if start_of_word is not None: symbol_list.insert(0, start_of_word) - return tuple(symbol_list) - symbol_list.insert(0, start_of_word) - symbol_list.append(end_of_word) + if end_of_word is not None: + symbol_list.append(end_of_word) return tuple(symbol_list) def collect_frequencies( - text: str, start_of_word: str | None, end_of_word: str + text: str, start_of_word: str | None, end_of_word: str ) -> dict[tuple[str, ...], int] | None: """ Counts number of occurrences of each word @@ -79,22 +71,11 @@ def count_tokens_pairs( pair_dict = {} for word in word_frequencies: - for index, token in enumerate(word): - if index == len(word) - 1: - break - next_token_index = index + 1 - pair = (token, word[next_token_index]) - pair_dict[pair] = 0 - - for pair in pair_dict.copy(): - freq = 0 - for word in word_frequencies: - for index, token in enumerate(word): - if index == len(word) - 1: - break - if token == pair[0] and word[index + 1] == pair[1]: - freq += word_frequencies[word] - pair_dict[pair] = freq + for index in range(len(word) - 1): + pair = (word[index], word[index + 1]) + if pair not in pair_dict: + pair_dict[pair] = 0 + pair_dict[pair] += word_frequencies[word] return pair_dict @@ -144,40 +125,34 @@ def train( if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int): return None - for i in range(num_merges): - pair_dict = count_tokens_pairs(word_frequencies) - if pair_dict is None: - return None - max_value = max(list(pair_dict.values())) + pair_dict = count_tokens_pairs(word_frequencies) + if pair_dict is None: + return None + + for _ in range(num_merges): + max_value = max(pair_dict.values()) pair_list = [key for key, value in pair_dict.items() if value == max_value] - if len(pair_list) > 1: - len_list = [] - for pair in pair_list: - keys = list(word_frequencies.keys()) - if pair[1] == keys[0][-1]: - pair_len = len(pair[0]) + 1 - elif keys[0][-1] in pair[0]: - pair_len = len(pair[0]) - len(keys[0][-1]) + 1 + len(pair[1]) - elif keys[0][-1] in pair[1]: - pair_len = len(pair[0]) + len(pair[1]) - len(keys[0][-1]) + 1 - else: - pair_len = len(pair[0]) + len(pair[1]) - len_list.append(pair_len) - if len_list.count(max(len_list)) > 1: - max_len_pairs = [] - for j in range(len_list.count(max(len_list))): - max_len_pairs.append(pair_list[len_list.index(max(len_list))]) - pair_list.pop(len_list.index(max(len_list))) - len_list.pop(len_list.index(max(len_list))) - max_len_pairs.sort() - word_frequencies = merge_tokens(word_frequencies, max_len_pairs[0]) - else: - word_frequencies = merge_tokens(word_frequencies, pair_list[len_list.index(max(len_list))]) - elif len(pair_list) == 1: - word_frequencies = merge_tokens(word_frequencies, pair_list[0]) + + len_list = [] + for pair in pair_list: + pair_len = len(''.join(list(pair))) + len_list.append(pair_len) + max_len = max(len_list) + + max_len_pairs = [] + for pair in pair_list: + if max_len == len(''.join(pair)): + max_len_pairs.append(pair) + + word_frequencies = merge_tokens(word_frequencies, sorted(max_len_pairs)[0]) if word_frequencies is None: return None + pair_dict.pop(sorted(max_len_pairs)[0]) + pair_dict = count_tokens_pairs(word_frequencies) + if pair_dict is None: + return None + return word_frequencies From a524c36358a45bbfa58b2c0ea7ee6b3733bde99a Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 1 Nov 2023 23:45:28 +0300 Subject: [PATCH 048/107] steps 6 and 7 done, some changes done in train function --- lab_2_tokenize_by_bpe/main.py | 78 ++++++++++++++++++-------- lab_2_tokenize_by_bpe/start.py | 7 ++- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 3 files changed, 63 insertions(+), 24 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 6bf1c2d92..67134fcb4 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -5,7 +5,7 @@ def prepare_word( - raw_word: str, start_of_word: str | None, end_of_word: str | None + raw_word: str, start_of_word: str | None, end_of_word: str | None ) -> tuple[str, ...] | None: """ Tokenizes word into unigrams and appends end-of-word token @@ -130,28 +130,19 @@ def train( return None for _ in range(num_merges): - max_value = max(pair_dict.values()) - pair_list = [key for key, value in pair_dict.items() if value == max_value] - - len_list = [] - for pair in pair_list: - pair_len = len(''.join(list(pair))) - len_list.append(pair_len) - max_len = max(len_list) - - max_len_pairs = [] - for pair in pair_list: - if max_len == len(''.join(pair)): - max_len_pairs.append(pair) - - word_frequencies = merge_tokens(word_frequencies, sorted(max_len_pairs)[0]) - if word_frequencies is None: - return None + if pair_dict != {}: + max_value = max(pair_dict.values()) + pair_list = [key for key, value in pair_dict.items() if value == max_value] - pair_dict.pop(sorted(max_len_pairs)[0]) - pair_dict = count_tokens_pairs(word_frequencies) - if pair_dict is None: - return None + pair_list.sort() + pair_to_merge = max(pair_list, key=lambda x: len(x[0] + x[1])) + + word_frequencies = merge_tokens(word_frequencies, pair_to_merge) + + pair_dict.pop(pair_to_merge) + pair_dict = count_tokens_pairs(word_frequencies) + if pair_dict is None: + return None return word_frequencies @@ -165,6 +156,24 @@ def get_vocabulary( :param unknown_token: a token to signify an unknown token :return: dictionary in the form of """ + if not isinstance(word_frequencies, dict) or not isinstance(unknown_token, str): + return None + + tokens_list = [unknown_token] + for word in word_frequencies: + tokens_list.append(word[0]) + for token in word[0]: + if token not in tokens_list: + tokens_list.append(token) + + tokens_list.sort() + tokens_list.sort(key=len, reverse=True) + + tokens_dict = {} + for index, token in enumerate(tokens_list): + tokens_dict[token] = index + + return tokens_dict def decode( @@ -177,6 +186,31 @@ def decode( :param end_of_word_token: an end-of-word token :return: decoded sequence """ + if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict): + return None + if not isinstance(end_of_word_token, str) and end_of_word_token is not None: + return None + + decoded_text = '' + for identifier in encoded_text: + for token in vocabulary: + if end_of_word_token is not None: + if vocabulary[token] == identifier and end_of_word_token in token: + new_token = '' + for element in token: + if element not in end_of_word_token: + new_token += element + if element in end_of_word_token: + break + decoded_text += new_token + decoded_text += ' ' + elif vocabulary[token] == identifier and end_of_word_token not in token: + decoded_text += token + else: + if vocabulary[token] == identifier: + decoded_text += token + + return decoded_text def tokenize_word( diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index b80ceb107..e3a3c7f43 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -15,7 +15,12 @@ def main() -> None: text = text_file.read() word_frequencies = main_py.collect_frequencies(text, None, '') - result = main_py.train(word_frequencies, 100) + vocabulary = main_py.get_vocabulary(word_frequencies, '') + + with open(assets_path / 'secret.txt', 'r', encoding='utf-8') as secret_file: + secret = secret_file.read() + + result = main_py.decode(secret.split(' '), vocabulary, '') assert result, "Encoding is not working" diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index 1e8b31496..45a4fb75d 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -6 +8 From 9828edd32d238863fa919700b478e1dd13586f06 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 1 Nov 2023 23:50:51 +0300 Subject: [PATCH 049/107] start.py fixed --- lab_2_tokenize_by_bpe/start.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index e3a3c7f43..21b03b9fd 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -17,7 +17,7 @@ def main() -> None: word_frequencies = main_py.collect_frequencies(text, None, '') vocabulary = main_py.get_vocabulary(word_frequencies, '') - with open(assets_path / 'secret.txt', 'r', encoding='utf-8') as secret_file: + with open(assets_path / 'secrets' / 'secret.txt', 'r', encoding='utf-8') as secret_file: secret = secret_file.read() result = main_py.decode(secret.split(' '), vocabulary, '') From 554696c8ac2a5ac6ba056bd751b277d78db6c5c4 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 1 Nov 2023 23:53:57 +0300 Subject: [PATCH 050/107] start.py fixed --- lab_2_tokenize_by_bpe/start.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 21b03b9fd..52512aea4 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -17,7 +17,7 @@ def main() -> None: word_frequencies = main_py.collect_frequencies(text, None, '') vocabulary = main_py.get_vocabulary(word_frequencies, '') - with open(assets_path / 'secrets' / 'secret.txt', 'r', encoding='utf-8') as secret_file: + with open(assets_path / 'secrets' / 'secret_1.txt', 'r', encoding='utf-8') as secret_file: secret = secret_file.read() result = main_py.decode(secret.split(' '), vocabulary, '') From d19294b8178a8d047b926ded10cd524ef4371ed2 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 1 Nov 2023 23:58:04 +0300 Subject: [PATCH 051/107] start.py fixed --- lab_2_tokenize_by_bpe/start.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 52512aea4..32aaa9fbf 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -17,7 +17,7 @@ def main() -> None: word_frequencies = main_py.collect_frequencies(text, None, '') vocabulary = main_py.get_vocabulary(word_frequencies, '') - with open(assets_path / 'secrets' / 'secret_1.txt', 'r', encoding='utf-8') as secret_file: + with open(assets_path / 'secrets' / 'secret_1.txt', 'r') as secret_file: secret = secret_file.read() result = main_py.decode(secret.split(' '), vocabulary, '') From 840ef987924df4e621ea8160446b4677f53d412d Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 2 Nov 2023 00:09:27 +0300 Subject: [PATCH 052/107] start.py, mypy and code style fixed --- lab_2_tokenize_by_bpe/main.py | 44 ++++++++++++++++++---------------- lab_2_tokenize_by_bpe/start.py | 5 ++-- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 67134fcb4..c0a892a15 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -137,10 +137,12 @@ def train( pair_list.sort() pair_to_merge = max(pair_list, key=lambda x: len(x[0] + x[1])) - word_frequencies = merge_tokens(word_frequencies, pair_to_merge) + if isinstance(word_frequencies, dict): + word_frequencies = merge_tokens(word_frequencies, pair_to_merge) pair_dict.pop(pair_to_merge) - pair_dict = count_tokens_pairs(word_frequencies) + if isinstance(word_frequencies, dict): + pair_dict = count_tokens_pairs(word_frequencies) if pair_dict is None: return None @@ -177,7 +179,8 @@ def get_vocabulary( def decode( - encoded_text: list[int] | None, vocabulary: dict[str, int] | None, end_of_word_token: str | None + encoded_text: list[int] | None, vocabulary: dict[str, int] | None + , end_of_word_token: str | None ) -> str | None: """ Translates encoded sequence into decoded one @@ -186,7 +189,9 @@ def decode( :param end_of_word_token: an end-of-word token :return: decoded sequence """ - if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict): + if not isinstance(encoded_text, list) and encoded_text is not None: + return None + if not isinstance(vocabulary, dict): return None if not isinstance(end_of_word_token, str) and end_of_word_token is not None: return None @@ -194,27 +199,26 @@ def decode( decoded_text = '' for identifier in encoded_text: for token in vocabulary: - if end_of_word_token is not None: - if vocabulary[token] == identifier and end_of_word_token in token: - new_token = '' - for element in token: - if element not in end_of_word_token: - new_token += element - if element in end_of_word_token: - break - decoded_text += new_token - decoded_text += ' ' - elif vocabulary[token] == identifier and end_of_word_token not in token: - decoded_text += token - else: - if vocabulary[token] == identifier: - decoded_text += token + if end_of_word_token is None and vocabulary[token] == identifier: + decoded_text += token + elif vocabulary[token] == identifier and end_of_word_token in token: + new_token = '' + for element in token: + if element not in end_of_word_token: + new_token += element + if element in end_of_word_token: + break + decoded_text += new_token + decoded_text += ' ' + elif vocabulary[token] == identifier and end_of_word_token not in token: + decoded_text += token return decoded_text def tokenize_word( - word: tuple[str, ...], vocabulary: dict[str, int], end_of_word: str | None, unknown_token: str + word: tuple[str, ...], vocabulary: dict[str, int] + , end_of_word: str | None, unknown_token: str ) -> list[int] | None: """ Splits word into tokens diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 32aaa9fbf..9b2fae94d 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -15,9 +15,10 @@ def main() -> None: text = text_file.read() word_frequencies = main_py.collect_frequencies(text, None, '') - vocabulary = main_py.get_vocabulary(word_frequencies, '') + if isinstance(word_frequencies, dict): + vocabulary = main_py.get_vocabulary(word_frequencies, '') - with open(assets_path / 'secrets' / 'secret_1.txt', 'r') as secret_file: + with open(assets_path / 'secrets' / 'secret_1.txt', 'r', encoding='utf-8') as secret_file: secret = secret_file.read() result = main_py.decode(secret.split(' '), vocabulary, '') From 1179deb6b3e183c8e45931d38deb3c5832014827 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 2 Nov 2023 14:16:40 +0300 Subject: [PATCH 053/107] fixing unittests --- lab_2_tokenize_by_bpe/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index c0a892a15..aa7a99678 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -189,7 +189,7 @@ def decode( :param end_of_word_token: an end-of-word token :return: decoded sequence """ - if not isinstance(encoded_text, list) and encoded_text is not None: + if not isinstance(encoded_text, list): return None if not isinstance(vocabulary, dict): return None From d841924f6f7cd6e2a826a1220f4bc2325284dfe0 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 2 Nov 2023 14:56:56 +0300 Subject: [PATCH 054/107] start.py fixing --- lab_2_tokenize_by_bpe/start.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 9b2fae94d..c87e201bd 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -15,6 +15,10 @@ def main() -> None: text = text_file.read() word_frequencies = main_py.collect_frequencies(text, None, '') + token_pairs = main_py.count_tokens_pairs(word_frequencies) + num_merges = len(token_pairs) + word_frequencies = main_py.train(word_frequencies, num_merges) + if isinstance(word_frequencies, dict): vocabulary = main_py.get_vocabulary(word_frequencies, '') From 728c2579089465a2b2b9af89aae888ec1a35f39a Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 2 Nov 2023 21:28:26 +0300 Subject: [PATCH 055/107] start.py fixing --- lab_2_tokenize_by_bpe/main.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index aa7a99678..9cca774cc 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -5,7 +5,7 @@ def prepare_word( - raw_word: str, start_of_word: str | None, end_of_word: str | None + raw_word: str, start_of_word: str | None, end_of_word: str | None ) -> tuple[str, ...] | None: """ Tokenizes word into unigrams and appends end-of-word token @@ -31,7 +31,7 @@ def prepare_word( def collect_frequencies( - text: str, start_of_word: str | None, end_of_word: str + text: str, start_of_word: str | None, end_of_word: str ) -> dict[tuple[str, ...], int] | None: """ Counts number of occurrences of each word @@ -197,23 +197,27 @@ def decode( return None decoded_text = '' - for identifier in encoded_text: - for token in vocabulary: - if end_of_word_token is None and vocabulary[token] == identifier: - decoded_text += token - elif vocabulary[token] == identifier and end_of_word_token in token: + inv_vocabulary = {v: k for k, v in vocabulary.items()} + + if end_of_word_token is None: + for identifier in encoded_text: + decoded_text += inv_vocabulary[identifier] + return decoded_text + + elif end_of_word_token is not None: + for identifier in encoded_text: + if end_of_word_token in inv_vocabulary[identifier]: new_token = '' - for element in token: + for element in inv_vocabulary[identifier]: if element not in end_of_word_token: new_token += element if element in end_of_word_token: break decoded_text += new_token decoded_text += ' ' - elif vocabulary[token] == identifier and end_of_word_token not in token: - decoded_text += token - - return decoded_text + elif end_of_word_token not in inv_vocabulary[identifier]: + decoded_text += inv_vocabulary[identifier] + return decoded_text def tokenize_word( From c6924747c5b7b94ad021027ca0044cf364032663 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 2 Nov 2023 21:31:17 +0300 Subject: [PATCH 056/107] code style fixing --- lab_2_tokenize_by_bpe/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 9cca774cc..c5a3852bb 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -204,7 +204,7 @@ def decode( decoded_text += inv_vocabulary[identifier] return decoded_text - elif end_of_word_token is not None: + if end_of_word_token is not None: for identifier in encoded_text: if end_of_word_token in inv_vocabulary[identifier]: new_token = '' From 98daf2c547d3657e69d1e1b3ae370a4ad6acddfc Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 2 Nov 2023 21:34:40 +0300 Subject: [PATCH 057/107] code style fixing --- lab_2_tokenize_by_bpe/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index c5a3852bb..2922a7b44 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -219,6 +219,7 @@ def decode( decoded_text += inv_vocabulary[identifier] return decoded_text + return None def tokenize_word( word: tuple[str, ...], vocabulary: dict[str, int] From 5923e51501ddc6dc66d32110c2037b70332a5dca Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 2 Nov 2023 21:34:53 +0300 Subject: [PATCH 058/107] code style fixing --- lab_2_tokenize_by_bpe/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 2922a7b44..0ab39e426 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -221,6 +221,7 @@ def decode( return None + def tokenize_word( word: tuple[str, ...], vocabulary: dict[str, int] , end_of_word: str | None, unknown_token: str From da4b6fd37f6665da0e41059d5a6a97dc3928d33b Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 2 Nov 2023 21:52:50 +0300 Subject: [PATCH 059/107] mypy fixing --- lab_2_tokenize_by_bpe/main.py | 6 +++--- lab_2_tokenize_by_bpe/start.py | 13 +++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 0ab39e426..0b5394ddd 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -189,7 +189,7 @@ def decode( :param end_of_word_token: an end-of-word token :return: decoded sequence """ - if not isinstance(encoded_text, list): + if not isinstance(encoded_text, list) and encoded_text is not None: return None if not isinstance(vocabulary, dict): return None @@ -199,12 +199,12 @@ def decode( decoded_text = '' inv_vocabulary = {v: k for k, v in vocabulary.items()} - if end_of_word_token is None: + if end_of_word_token is None and encoded_text is not None: for identifier in encoded_text: decoded_text += inv_vocabulary[identifier] return decoded_text - if end_of_word_token is not None: + if end_of_word_token is not None and encoded_text is not None: for identifier in encoded_text: if end_of_word_token in inv_vocabulary[identifier]: new_token = '' diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index c87e201bd..43219f210 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -15,14 +15,19 @@ def main() -> None: text = text_file.read() word_frequencies = main_py.collect_frequencies(text, None, '') - token_pairs = main_py.count_tokens_pairs(word_frequencies) - num_merges = len(token_pairs) - word_frequencies = main_py.train(word_frequencies, num_merges) + vocabulary = {} + token_pairs = {} + num_merges = 0 + if isinstance(word_frequencies, dict): + token_pairs = main_py.count_tokens_pairs(word_frequencies) + if isinstance(token_pairs, dict): + num_merges = len(token_pairs) + word_frequencies = main_py.train(word_frequencies, num_merges) if isinstance(word_frequencies, dict): vocabulary = main_py.get_vocabulary(word_frequencies, '') - with open(assets_path / 'secrets' / 'secret_1.txt', 'r', encoding='utf-8') as secret_file: + with open(assets_path / 'secrets' / 'secret_2.txt', 'r', encoding='utf-8') as secret_file: secret = secret_file.read() result = main_py.decode(secret.split(' '), vocabulary, '') From a7715e09abd457a778b505c0e86947812b4f632d Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 2 Nov 2023 23:14:17 +0300 Subject: [PATCH 060/107] mypy fixing --- lab_2_tokenize_by_bpe/main.py | 23 +++++++++++++---------- lab_2_tokenize_by_bpe/start.py | 20 ++++++++------------ 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 0b5394ddd..d9026d1c3 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -163,16 +163,19 @@ def get_vocabulary( tokens_list = [unknown_token] for word in word_frequencies: - tokens_list.append(word[0]) - for token in word[0]: - if token not in tokens_list: - tokens_list.append(token) + for token in word: + tokens_list.append(token) + prep_word = prepare_word(token, None, None) + if prep_word is None: + return None + tokens_list += list(prep_word) - tokens_list.sort() - tokens_list.sort(key=len, reverse=True) + tokens_set = set(tokens_list) + alph_sorted = sorted(tokens_set) + tokens_sorted = sorted(alph_sorted, key=len, reverse=True) tokens_dict = {} - for index, token in enumerate(tokens_list): + for index, token in enumerate(tokens_sorted): tokens_dict[token] = index return tokens_dict @@ -189,7 +192,7 @@ def decode( :param end_of_word_token: an end-of-word token :return: decoded sequence """ - if not isinstance(encoded_text, list) and encoded_text is not None: + if not isinstance(encoded_text, list): return None if not isinstance(vocabulary, dict): return None @@ -199,12 +202,12 @@ def decode( decoded_text = '' inv_vocabulary = {v: k for k, v in vocabulary.items()} - if end_of_word_token is None and encoded_text is not None: + if end_of_word_token is None: for identifier in encoded_text: decoded_text += inv_vocabulary[identifier] return decoded_text - if end_of_word_token is not None and encoded_text is not None: + if end_of_word_token is not None: for identifier in encoded_text: if end_of_word_token in inv_vocabulary[identifier]: new_token = '' diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 43219f210..cdfcda219 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -15,22 +15,18 @@ def main() -> None: text = text_file.read() word_frequencies = main_py.collect_frequencies(text, None, '') + word_frequencies = main_py.train(word_frequencies, 100) + secret = [] vocabulary = {} - token_pairs = {} - num_merges = 0 - if isinstance(word_frequencies, dict): - token_pairs = main_py.count_tokens_pairs(word_frequencies) - if isinstance(token_pairs, dict): - num_merges = len(token_pairs) - word_frequencies = main_py.train(word_frequencies, num_merges) - if isinstance(word_frequencies, dict): + if word_frequencies is not None: + with open(assets_path / 'secrets' / 'secret_2.txt', 'r', encoding='utf-8') as secret_file: + secret = secret_file.read() vocabulary = main_py.get_vocabulary(word_frequencies, '') + secret = [int(num) for num in secret.split(' ')] - with open(assets_path / 'secrets' / 'secret_2.txt', 'r', encoding='utf-8') as secret_file: - secret = secret_file.read() - - result = main_py.decode(secret.split(' '), vocabulary, '') + result = main_py.decode(secret, vocabulary, '') + print(result) assert result, "Encoding is not working" From 30fa70feb22aefdf35de0f2797c104cd70f5e783 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 2 Nov 2023 23:19:33 +0300 Subject: [PATCH 061/107] start.py fixing --- lab_2_tokenize_by_bpe/start.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index cdfcda219..cfde3c8d0 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -21,9 +21,10 @@ def main() -> None: vocabulary = {} if word_frequencies is not None: with open(assets_path / 'secrets' / 'secret_2.txt', 'r', encoding='utf-8') as secret_file: - secret = secret_file.read() - vocabulary = main_py.get_vocabulary(word_frequencies, '') - secret = [int(num) for num in secret.split(' ')] + secret_str = secret_file.read() + if vocabulary is not None: + vocabulary = main_py.get_vocabulary(word_frequencies, '') + secret = [int(num) for num in secret_str.split(' ')] result = main_py.decode(secret, vocabulary, '') print(result) From c2c2aafdf337d06691a7864fdda7485edea0b9d5 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 2 Nov 2023 23:23:51 +0300 Subject: [PATCH 062/107] start.py fixing --- lab_2_tokenize_by_bpe/start.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index cfde3c8d0..0563e06db 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -18,13 +18,12 @@ def main() -> None: word_frequencies = main_py.train(word_frequencies, 100) secret = [] - vocabulary = {} + vocabulary = None if word_frequencies is not None: with open(assets_path / 'secrets' / 'secret_2.txt', 'r', encoding='utf-8') as secret_file: secret_str = secret_file.read() - if vocabulary is not None: - vocabulary = main_py.get_vocabulary(word_frequencies, '') - secret = [int(num) for num in secret_str.split(' ')] + vocabulary = main_py.get_vocabulary(word_frequencies, '') + secret = [int(num) for num in secret_str.split(' ')] result = main_py.decode(secret, vocabulary, '') print(result) From 4ee29c10be6f9ed01a68060f8c53621dd134fd71 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Fri, 3 Nov 2023 12:02:07 +0300 Subject: [PATCH 063/107] merge_tokens fixing --- lab_2_tokenize_by_bpe/main.py | 17 +++++++++++------ lab_2_tokenize_by_bpe/start.py | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index d9026d1c3..22f0be633 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -98,14 +98,19 @@ def merge_tokens( new_word = [] if pair[0] in word and pair[1] in word: for index, token in enumerate(word): - if token == pair[1] and word[index - 1] == pair[0]: + if token == pair[0] and word[index + 1] == pair[1]: + new_word.append(pair[0] + pair[1]) + elif token == pair[1] and word[index - 1] == pair[0]: pass - elif token != pair[0]: - new_word.append(token) - elif token == pair[0] and word[index + 1] != pair[1]: + else: new_word.append(token) - elif token == pair[0] and word[index + 1] == pair[1]: - new_word.append(pair[0] + pair[1]) + + # elif token != pair[0]: + # new_word.append(token) + # elif token == pair[0] and word[index + 1] != pair[1]: + # new_word.append(token) + # elif token == pair[0] and word[index + 1] == pair[1]: + # new_word.append(pair[0] + pair[1]) value = word_freq_updated.pop(word) word_freq_updated[tuple(new_word)] = value diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 0563e06db..7fa2a1703 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -22,8 +22,8 @@ def main() -> None: if word_frequencies is not None: with open(assets_path / 'secrets' / 'secret_2.txt', 'r', encoding='utf-8') as secret_file: secret_str = secret_file.read() - vocabulary = main_py.get_vocabulary(word_frequencies, '') secret = [int(num) for num in secret_str.split(' ')] + vocabulary = main_py.get_vocabulary(word_frequencies, '') result = main_py.decode(secret, vocabulary, '') print(result) From 9a8118dd83379cb92be4ee5118f6b49ee6dbf73d Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Fri, 3 Nov 2023 17:13:41 +0300 Subject: [PATCH 064/107] checkout labs from the origin repository --- lab_2_tokenize_by_bpe/main.py | 298 +++++++++---------------- lab_2_tokenize_by_bpe/start.py | 50 ++--- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 3 files changed, 125 insertions(+), 225 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 203a69a98..22f0be633 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -2,8 +2,6 @@ Lab 2 BPE and machine translation evaluation """ -import json -import math def prepare_word( @@ -16,16 +14,20 @@ def prepare_word( :param end_of_word: a token that signifies the end of word :return: preprocessed word """ - if not isinstance(raw_word, str) or not (isinstance( - start_of_word, str) or start_of_word is None) or not ( - isinstance(end_of_word, str) or end_of_word is None): + if not isinstance(raw_word, str): return None - list_of_tokens = list(raw_word) - if end_of_word: - list_of_tokens.append(end_of_word) - if start_of_word: - list_of_tokens.insert(0, start_of_word) - return tuple(list_of_tokens) + if not isinstance(start_of_word, str) and start_of_word is not None: + return None + if not isinstance(end_of_word, str) and end_of_word is not None: + return None + + symbol_list = list(raw_word) + + if start_of_word is not None: + symbol_list.insert(0, start_of_word) + if end_of_word is not None: + symbol_list.append(end_of_word) + return tuple(symbol_list) def collect_frequencies( @@ -38,20 +40,21 @@ def collect_frequencies( :param end_of_word: a token that signifies the end of word :return: dictionary in the form of """ - if not isinstance(text, str) or not isinstance(end_of_word, str) or not ( - isinstance(start_of_word, str) or start_of_word is None): + if not isinstance(text, str) or not isinstance(end_of_word, str): + return None + if not isinstance(start_of_word, str) and start_of_word is not None: return None - dict_frequencies = {} - - splitted_text = text.split() - for i in set(splitted_text): - word = prepare_word(i, start_of_word, end_of_word) - if not word: + dict_freq = {} + words_list = text.split() + for word in words_list: + prepared_word = prepare_word(word, start_of_word, end_of_word) + if prepared_word is None: return None - dict_frequencies[word] = splitted_text.count(i) + if prepared_word not in dict_freq: + dict_freq[prepared_word] = words_list.count(word) - return dict_frequencies + return dict_freq def count_tokens_pairs( @@ -65,16 +68,16 @@ def count_tokens_pairs( if not isinstance(word_frequencies, dict): return None - dict_with_pairs = {} + pair_dict = {} for word in word_frequencies: for index in range(len(word) - 1): pair = (word[index], word[index + 1]) - if pair not in dict_with_pairs: - dict_with_pairs[pair] = 0 - dict_with_pairs[pair] += word_frequencies[word] + if pair not in pair_dict: + pair_dict[pair] = 0 + pair_dict[pair] += word_frequencies[word] - return dict_with_pairs + return pair_dict def merge_tokens( @@ -88,22 +91,31 @@ def merge_tokens( """ if not isinstance(word_frequencies, dict) or not isinstance(pair, tuple): return None - dict_merged_tokens = {} - for i in word_frequencies: - list_word = list(i) - for index in range(len(list_word) - 1): - if (i[index], i[index + 1]) == pair: - list_word[index + 1] = pair[0] + pair[1] - list_word[index] = '' + word_freq_updated = word_frequencies.copy() + + for word in word_frequencies: + new_word = [] + if pair[0] in word and pair[1] in word: + for index, token in enumerate(word): + if token == pair[0] and word[index + 1] == pair[1]: + new_word.append(pair[0] + pair[1]) + elif token == pair[1] and word[index - 1] == pair[0]: + pass + else: + new_word.append(token) + + # elif token != pair[0]: + # new_word.append(token) + # elif token == pair[0] and word[index + 1] != pair[1]: + # new_word.append(token) + # elif token == pair[0] and word[index + 1] == pair[1]: + # new_word.append(pair[0] + pair[1]) - if '' in list_word: - list_word.remove('') - dict_merged_tokens.update({tuple(list_word): word_frequencies[i]}) - else: - dict_merged_tokens.update({i: word_frequencies[i]}) + value = word_freq_updated.pop(word) + word_freq_updated[tuple(new_word)] = value - return dict_merged_tokens + return word_freq_updated def train( @@ -117,30 +129,27 @@ def train( """ if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int): return None - dict_with_pairs = count_tokens_pairs(word_frequencies) - if not dict_with_pairs: + pair_dict = count_tokens_pairs(word_frequencies) + if pair_dict is None: return None - merges = min(num_merges, len(dict_with_pairs)) - for i in range(merges): + for _ in range(num_merges): + if pair_dict != {}: + max_value = max(pair_dict.values()) + pair_list = [key for key, value in pair_dict.items() if value == max_value] - max_values = max(dict_with_pairs.values()) - pairs_max_values = [i for i in dict_with_pairs if dict_with_pairs[i] == max_values] + pair_list.sort() + pair_to_merge = max(pair_list, key=lambda x: len(x[0] + x[1])) - max_len = max(len(str(pair)) for pair in pairs_max_values) - pairs_max_len = [i for i in pairs_max_values if len(str(i)) == max_len] + if isinstance(word_frequencies, dict): + word_frequencies = merge_tokens(word_frequencies, pair_to_merge) - sorted_pairs = sorted(pairs_max_len) - word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0]) - - if not word_frequencies: - return None - - dict_with_pairs = count_tokens_pairs(word_frequencies) - - if not dict_with_pairs: - return None + pair_dict.pop(pair_to_merge) + if isinstance(word_frequencies, dict): + pair_dict = count_tokens_pairs(word_frequencies) + if pair_dict is None: + return None return word_frequencies @@ -157,23 +166,24 @@ def get_vocabulary( if not isinstance(word_frequencies, dict) or not isinstance(unknown_token, str): return None - dict_ident = {} - unique_tokens = set() - - for tuple_tokens in word_frequencies.keys(): - for word in tuple_tokens: - unique_tokens.update(tuple_tokens, word) + tokens_list = [unknown_token] + for word in word_frequencies: + for token in word: + tokens_list.append(token) + prep_word = prepare_word(token, None, None) + if prep_word is None: + return None + tokens_list += list(prep_word) - unique_tokens.add(unknown_token) - lex_sorted = sorted(unique_tokens) - len_sorted = sorted(lex_sorted, key=len, reverse=True) - index = 0 + tokens_set = set(tokens_list) + alph_sorted = sorted(tokens_set) + tokens_sorted = sorted(alph_sorted, key=len, reverse=True) - for token in len_sorted: - dict_ident[token] = index - index += 1 + tokens_dict = {} + for index, token in enumerate(tokens_sorted): + tokens_dict[token] = index - return dict_ident + return tokens_dict def decode( @@ -187,20 +197,37 @@ def decode( :param end_of_word_token: an end-of-word token :return: decoded sequence """ - if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict) or not (isinstance( - end_of_word_token, str) or end_of_word_token is None): + if not isinstance(encoded_text, list): + return None + if not isinstance(vocabulary, dict): + return None + if not isinstance(end_of_word_token, str) and end_of_word_token is not None: return None - decoded = '' - for identifier in encoded_text: - token_list = [key for key in vocabulary if vocabulary[key] == identifier] - for token in token_list: - decoded += token + decoded_text = '' + inv_vocabulary = {v: k for k, v in vocabulary.items()} - if end_of_word_token: - decoded = decoded.replace(end_of_word_token, ' ') + if end_of_word_token is None: + for identifier in encoded_text: + decoded_text += inv_vocabulary[identifier] + return decoded_text - return decoded + if end_of_word_token is not None: + for identifier in encoded_text: + if end_of_word_token in inv_vocabulary[identifier]: + new_token = '' + for element in inv_vocabulary[identifier]: + if element not in end_of_word_token: + new_token += element + if element in end_of_word_token: + break + decoded_text += new_token + decoded_text += ' ' + elif end_of_word_token not in inv_vocabulary[identifier]: + decoded_text += inv_vocabulary[identifier] + return decoded_text + + return None def tokenize_word( @@ -215,27 +242,6 @@ def tokenize_word( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ - if not isinstance(word, tuple) or not isinstance(vocabulary, dict) or not (isinstance( - end_of_word, str) or end_of_word is None) or not isinstance(unknown_token, str): - return None - - word_copy = ''.join(word) - sorted_vocabulary = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x)) - result = [] - - for key in sorted_vocabulary: - while key in word_copy: - index = word_copy.count(' ', 0, word_copy.find(key)) - result.insert(index, vocabulary[key]) - word_copy = word_copy.replace(key, ' ', 1) - - for unk in word_copy: - if unk != ' ': - index = word_copy.find(unk) - word_copy = word_copy.replace(unk, ' ') - result.insert(index, vocabulary[unknown_token]) - - return result def load_vocabulary(vocab_path: str) -> dict[str, int] | None: @@ -244,16 +250,6 @@ def load_vocabulary(vocab_path: str) -> dict[str, int] | None: :param vocab_path: path to the saved vocabulary :return: dictionary in the form of """ - if not isinstance(vocab_path, str): - return None - - with open(vocab_path, 'r', encoding='utf-8') as f: - vocab = json.load(f) - - if not isinstance(vocab, dict): - return None - - return vocab def encode( @@ -272,26 +268,6 @@ def encode( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ - if not isinstance(original_text, str) or not isinstance( - vocabulary, dict) or not (isinstance( - start_of_word_token, str) or start_of_word_token is None) or not (isinstance( - end_of_word_token, str) or end_of_word_token is None) or not isinstance( - unknown_token, str): - return None - - encoded = [] - split_text = original_text.split() - - for word in split_text: - prepared = prepare_word(word, start_of_word_token, end_of_word_token) - if not prepared: - return None - result = tokenize_word(prepared, vocabulary, end_of_word_token, unknown_token) - if not result: - return None - encoded.extend(result) - - return encoded def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: @@ -301,14 +277,6 @@ def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: :param order: required number of elements in a single n-gram :return: sequence of n-grams """ - if not isinstance(text, str) or not isinstance(order, int): - return None - - n_grams = [] - for index in range(len(text) + 1 - order): - n_grams.append(tuple(text[index: index + order])) - - return n_grams def calculate_precision( @@ -320,17 +288,6 @@ def calculate_precision( :param reference: expected sequence of n-grams :return: value of Precision metric """ - if not isinstance(actual, list) or not isinstance(reference, list): - return None - - unique_ngrams = set(reference) - matches = 0 - - for n_gram in unique_ngrams: - if n_gram in actual: - matches += 1 - - return matches / len(unique_ngrams) def geo_mean(precisions: list[float], max_order: int) -> float | None: @@ -340,17 +297,6 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None: :param max_order: maximum length of n-gram considered :return: value of geometric mean of Precision metric """ - if not isinstance(precisions, list) or not isinstance(max_order, int): - return None - - summation = float(0) - - for order in range(max_order): - if precisions[order] < 0: - return 0 - summation += math.log(precisions[order]) - - return math.exp(1 / max_order * summation) def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None: @@ -361,31 +307,3 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl :param max_order: max length of n-gram to consider for comparison :return: value of BLEU metric """ - if not isinstance(actual, str) or not isinstance( - reference, str) or max_order != 3: - return None - - actual_ngrams = [] - reference_ngrams = [] - - for order in range(max_order): - actual_ngram = collect_ngrams(actual, order + 1) - reference_ngram = collect_ngrams(reference, order + 1) - if actual_ngram is None or reference_ngram is None: - return None - actual_ngrams.append(actual_ngram) - reference_ngrams.append(reference_ngram) - - precisions = [] - - for i, j in zip(actual_ngrams, reference_ngrams): - precision = calculate_precision(i, j) - if precision is None: - return None - precisions.append(precision) - - average = geo_mean(precisions, max_order) - if average is None: - return None - - return average * 100 diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index d71b1c9c4..7fa2a1703 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -1,11 +1,9 @@ """ BPE Tokenizer starter """ -import json from pathlib import Path -from lab_2_tokenize_by_bpe.main import (calculate_bleu, collect_frequencies, decode, encode, - get_vocabulary, train) +import lab_2_tokenize_by_bpe.main as main_py def main() -> None: @@ -15,37 +13,21 @@ def main() -> None: assets_path = Path(__file__).parent / 'assets' with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - with open(assets_path / 'secrets/secret_2.txt', 'r', encoding='utf-8') as text_file: - encoded_secret = text_file.read() - dict_frequencies = collect_frequencies(text, None, '') - merged_tokens = train(dict_frequencies, 100) - if merged_tokens: - vocabulary = get_vocabulary(merged_tokens, '') - secret = [int(num) for num in encoded_secret.split()] - result = decode(secret, vocabulary, '') - print(result) - assert result, "Encoding is not working" - - with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file: - predicted = file.read() - with open(assets_path / 'vocab.json', 'r', encoding='utf-8') as file: - vocabulary = json.load(file) - with open(assets_path / 'for_translation_ru_encoded.txt', 'r', encoding='utf-8') as file: - actual = file.read() - - if [int(token) for token in actual.split()] == encode( - predicted, vocabulary, '\u2581', None, ''): - print("Encoding is successful!") - - with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file: - encoded_en = file.read() - with open(assets_path / 'for_translation_en_raw.txt', 'r', encoding='utf-8') as file: - decoded_en = file.read() - - decoded = decode([int(num) for num in encoded_en.split()], vocabulary, None) - decoded = decoded.replace('\u2581', ' ') - - print(calculate_bleu(decoded, decoded_en)) + + word_frequencies = main_py.collect_frequencies(text, None, '') + word_frequencies = main_py.train(word_frequencies, 100) + + secret = [] + vocabulary = None + if word_frequencies is not None: + with open(assets_path / 'secrets' / 'secret_2.txt', 'r', encoding='utf-8') as secret_file: + secret_str = secret_file.read() + secret = [int(num) for num in secret_str.split(' ')] + vocabulary = main_py.get_vocabulary(word_frequencies, '') + + result = main_py.decode(secret, vocabulary, '') + print(result) + assert result, "Encoding is not working" if __name__ == "__main__": diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index f599e28b8..45a4fb75d 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -10 +8 From 4dcbcb136412db0d16d9df9e09fb727f95950ba7 Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Fri, 3 Nov 2023 17:23:31 +0300 Subject: [PATCH 065/107] checkout labs from the origin repository --- lab_2_tokenize_by_bpe/main.py | 328 +++++++++++++++---------- lab_2_tokenize_by_bpe/start.py | 50 ++-- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 3 files changed, 239 insertions(+), 141 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 22f0be633..19a72913f 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -2,10 +2,12 @@ Lab 2 BPE and machine translation evaluation """ +import json +import math def prepare_word( - raw_word: str, start_of_word: str | None, end_of_word: str | None + raw_word: str, start_of_word: str | None, end_of_word: str | None ) -> tuple[str, ...] | None: """ Tokenizes word into unigrams and appends end-of-word token @@ -14,24 +16,20 @@ def prepare_word( :param end_of_word: a token that signifies the end of word :return: preprocessed word """ - if not isinstance(raw_word, str): + if not isinstance(raw_word, str) or not (isinstance( + start_of_word, str) or start_of_word is None) or not ( + isinstance(end_of_word, str) or end_of_word is None): return None - if not isinstance(start_of_word, str) and start_of_word is not None: - return None - if not isinstance(end_of_word, str) and end_of_word is not None: - return None - - symbol_list = list(raw_word) - - if start_of_word is not None: - symbol_list.insert(0, start_of_word) - if end_of_word is not None: - symbol_list.append(end_of_word) - return tuple(symbol_list) + list_of_tokens = list(raw_word) + if end_of_word: + list_of_tokens.append(end_of_word) + if start_of_word: + list_of_tokens.insert(0, start_of_word) + return tuple(list_of_tokens) def collect_frequencies( - text: str, start_of_word: str | None, end_of_word: str + text: str, start_of_word: str | None, end_of_word: str ) -> dict[tuple[str, ...], int] | None: """ Counts number of occurrences of each word @@ -40,25 +38,24 @@ def collect_frequencies( :param end_of_word: a token that signifies the end of word :return: dictionary in the form of """ - if not isinstance(text, str) or not isinstance(end_of_word, str): - return None - if not isinstance(start_of_word, str) and start_of_word is not None: + if not isinstance(text, str) or not isinstance(end_of_word, str) or not ( + isinstance(start_of_word, str) or start_of_word is None): return None - dict_freq = {} - words_list = text.split() - for word in words_list: - prepared_word = prepare_word(word, start_of_word, end_of_word) - if prepared_word is None: + dict_frequencies = {} + + splitted_text = text.split() + for i in set(splitted_text): + word = prepare_word(i, start_of_word, end_of_word) + if not word: return None - if prepared_word not in dict_freq: - dict_freq[prepared_word] = words_list.count(word) + dict_frequencies[word] = splitted_text.count(i) - return dict_freq + return dict_frequencies def count_tokens_pairs( - word_frequencies: dict[tuple[str, ...], int] + word_frequencies: dict[tuple[str, ...], int] ) -> dict[tuple[str, str], int] | None: """ Counts number of occurrences of each pair of subsequent tokens @@ -68,20 +65,20 @@ def count_tokens_pairs( if not isinstance(word_frequencies, dict): return None - pair_dict = {} + dict_with_pairs = {} for word in word_frequencies: for index in range(len(word) - 1): pair = (word[index], word[index + 1]) - if pair not in pair_dict: - pair_dict[pair] = 0 - pair_dict[pair] += word_frequencies[word] + if pair not in dict_with_pairs: + dict_with_pairs[pair] = 0 + dict_with_pairs[pair] += word_frequencies[word] - return pair_dict + return dict_with_pairs def merge_tokens( - word_frequencies: dict[tuple[str, ...], int], pair: tuple[str, str] + word_frequencies: dict[tuple[str, ...], int], pair: tuple[str, str] ) -> dict[tuple[str, ...], int] | None: """ Updates word frequency dictionary by replacing a pair of token with a merged one @@ -91,35 +88,26 @@ def merge_tokens( """ if not isinstance(word_frequencies, dict) or not isinstance(pair, tuple): return None + dict_merged_tokens = {} + for i in word_frequencies: + list_word = list(i) - word_freq_updated = word_frequencies.copy() - - for word in word_frequencies: - new_word = [] - if pair[0] in word and pair[1] in word: - for index, token in enumerate(word): - if token == pair[0] and word[index + 1] == pair[1]: - new_word.append(pair[0] + pair[1]) - elif token == pair[1] and word[index - 1] == pair[0]: - pass - else: - new_word.append(token) - - # elif token != pair[0]: - # new_word.append(token) - # elif token == pair[0] and word[index + 1] != pair[1]: - # new_word.append(token) - # elif token == pair[0] and word[index + 1] == pair[1]: - # new_word.append(pair[0] + pair[1]) + for index in range(len(list_word) - 1): + if (i[index], i[index + 1]) == pair: + list_word[index + 1] = pair[0] + pair[1] + list_word[index] = '' - value = word_freq_updated.pop(word) - word_freq_updated[tuple(new_word)] = value + if '' in list_word: + list_word.remove('') + dict_merged_tokens.update({tuple(list_word): word_frequencies[i]}) + else: + dict_merged_tokens.update({i: word_frequencies[i]}) - return word_freq_updated + return dict_merged_tokens def train( - word_frequencies: dict[tuple[str, ...], int] | None, num_merges: int + word_frequencies: dict[tuple[str, ...], int] | None, num_merges: int ) -> dict[tuple[str, ...], int] | None: """ Creates required number of new tokens by merging existing ones @@ -129,33 +117,36 @@ def train( """ if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int): return None + dict_with_pairs = count_tokens_pairs(word_frequencies) - pair_dict = count_tokens_pairs(word_frequencies) - if pair_dict is None: + if not dict_with_pairs: return None + merges = min(num_merges, len(dict_with_pairs)) - for _ in range(num_merges): - if pair_dict != {}: - max_value = max(pair_dict.values()) - pair_list = [key for key, value in pair_dict.items() if value == max_value] + for i in range(merges): - pair_list.sort() - pair_to_merge = max(pair_list, key=lambda x: len(x[0] + x[1])) + max_values = max(dict_with_pairs.values()) + pairs_max_values = [i for i in dict_with_pairs if dict_with_pairs[i] == max_values] - if isinstance(word_frequencies, dict): - word_frequencies = merge_tokens(word_frequencies, pair_to_merge) + max_len = max(len(str(pair)) for pair in pairs_max_values) + pairs_max_len = [i for i in pairs_max_values if len(str(i)) == max_len] - pair_dict.pop(pair_to_merge) - if isinstance(word_frequencies, dict): - pair_dict = count_tokens_pairs(word_frequencies) - if pair_dict is None: - return None + sorted_pairs = sorted(pairs_max_len) + word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0]) + + if not word_frequencies: + return None + + dict_with_pairs = count_tokens_pairs(word_frequencies) + + if not dict_with_pairs: + return None return word_frequencies def get_vocabulary( - word_frequencies: dict[tuple[str, ...], int], unknown_token: str + word_frequencies: dict[tuple[str, ...], int], unknown_token: str ) -> dict[str, int] | None: """ Establishes correspondence between tokens and its integer identifier @@ -166,29 +157,27 @@ def get_vocabulary( if not isinstance(word_frequencies, dict) or not isinstance(unknown_token, str): return None - tokens_list = [unknown_token] - for word in word_frequencies: - for token in word: - tokens_list.append(token) - prep_word = prepare_word(token, None, None) - if prep_word is None: - return None - tokens_list += list(prep_word) + dict_ident = {} + unique_tokens = set() + + for tuple_tokens in word_frequencies.keys(): + for word in tuple_tokens: + unique_tokens.update(tuple_tokens, word) - tokens_set = set(tokens_list) - alph_sorted = sorted(tokens_set) - tokens_sorted = sorted(alph_sorted, key=len, reverse=True) + unique_tokens.add(unknown_token) + lex_sorted = sorted(unique_tokens) + len_sorted = sorted(lex_sorted, key=len, reverse=True) + index = 0 - tokens_dict = {} - for index, token in enumerate(tokens_sorted): - tokens_dict[token] = index + for token in len_sorted: + dict_ident[token] = index + index += 1 - return tokens_dict + return dict_ident def decode( - encoded_text: list[int] | None, vocabulary: dict[str, int] | None - , end_of_word_token: str | None + encoded_text: list[int] | None, vocabulary: dict[str, int] | None, end_of_word_token: str | None ) -> str | None: """ Translates encoded sequence into decoded one @@ -197,42 +186,24 @@ def decode( :param end_of_word_token: an end-of-word token :return: decoded sequence """ - if not isinstance(encoded_text, list): - return None - if not isinstance(vocabulary, dict): - return None - if not isinstance(end_of_word_token, str) and end_of_word_token is not None: + if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict) or not (isinstance( + end_of_word_token, str) or end_of_word_token is None): return None + decoded = '' + for identifier in encoded_text: + token_list = [key for key in vocabulary if vocabulary[key] == identifier] - decoded_text = '' - inv_vocabulary = {v: k for k, v in vocabulary.items()} + for token in token_list: + decoded += token - if end_of_word_token is None: - for identifier in encoded_text: - decoded_text += inv_vocabulary[identifier] - return decoded_text + if end_of_word_token: + decoded = decoded.replace(end_of_word_token, ' ') - if end_of_word_token is not None: - for identifier in encoded_text: - if end_of_word_token in inv_vocabulary[identifier]: - new_token = '' - for element in inv_vocabulary[identifier]: - if element not in end_of_word_token: - new_token += element - if element in end_of_word_token: - break - decoded_text += new_token - decoded_text += ' ' - elif end_of_word_token not in inv_vocabulary[identifier]: - decoded_text += inv_vocabulary[identifier] - return decoded_text - - return None + return decoded def tokenize_word( - word: tuple[str, ...], vocabulary: dict[str, int] - , end_of_word: str | None, unknown_token: str + word: tuple[str, ...], vocabulary: dict[str, int], end_of_word: str | None, unknown_token: str ) -> list[int] | None: """ Splits word into tokens @@ -242,6 +213,27 @@ def tokenize_word( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ + if not isinstance(word, tuple) or not isinstance(vocabulary, dict) or not (isinstance( + end_of_word, str) or end_of_word is None) or not isinstance(unknown_token, str): + return None + + word_copy = ''.join(word) + sorted_vocabulary = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x)) + result = [] + + for key in sorted_vocabulary: + while key in word_copy: + index = word_copy.count(' ', 0, word_copy.find(key)) + result.insert(index, vocabulary[key]) + word_copy = word_copy.replace(key, ' ', 1) + + for unk in word_copy: + if unk != ' ': + index = word_copy.find(unk) + word_copy = word_copy.replace(unk, ' ') + result.insert(index, vocabulary[unknown_token]) + + return result def load_vocabulary(vocab_path: str) -> dict[str, int] | None: @@ -250,14 +242,24 @@ def load_vocabulary(vocab_path: str) -> dict[str, int] | None: :param vocab_path: path to the saved vocabulary :return: dictionary in the form of """ + if not isinstance(vocab_path, str): + return None + + with open(vocab_path, 'r', encoding='utf-8') as f: + vocab = json.load(f) + + if not isinstance(vocab, dict): + return None + + return vocab def encode( - original_text: str, - vocabulary: dict[str, int] | None, - start_of_word_token: str | None, - end_of_word_token: str | None, - unknown_token: str, + original_text: str, + vocabulary: dict[str, int] | None, + start_of_word_token: str | None, + end_of_word_token: str | None, + unknown_token: str, ) -> list[int] | None: """ Translates decoded sequence into encoded one @@ -268,6 +270,26 @@ def encode( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ + if not isinstance(original_text, str) or not isinstance( + vocabulary, dict) or not (isinstance( + start_of_word_token, str) or start_of_word_token is None) or not (isinstance( + end_of_word_token, str) or end_of_word_token is None) or not isinstance( + unknown_token, str): + return None + + encoded = [] + split_text = original_text.split() + + for word in split_text: + prepared = prepare_word(word, start_of_word_token, end_of_word_token) + if not prepared: + return None + result = tokenize_word(prepared, vocabulary, end_of_word_token, unknown_token) + if not result: + return None + encoded.extend(result) + + return encoded def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: @@ -277,10 +299,18 @@ def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: :param order: required number of elements in a single n-gram :return: sequence of n-grams """ + if not isinstance(text, str) or not isinstance(order, int): + return None + + n_grams = [] + for index in range(len(text) + 1 - order): + n_grams.append(tuple(text[index: index + order])) + + return n_grams def calculate_precision( - actual: list[tuple[str, ...]], reference: list[tuple[str, ...]] + actual: list[tuple[str, ...]], reference: list[tuple[str, ...]] ) -> float | None: """ Compares two sequences by virtue of Precision metric @@ -288,6 +318,17 @@ def calculate_precision( :param reference: expected sequence of n-grams :return: value of Precision metric """ + if not isinstance(actual, list) or not isinstance(reference, list): + return None + + unique_ngrams = set(reference) + matches = 0 + + for n_gram in unique_ngrams: + if n_gram in actual: + matches += 1 + + return matches / len(unique_ngrams) def geo_mean(precisions: list[float], max_order: int) -> float | None: @@ -297,6 +338,17 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None: :param max_order: maximum length of n-gram considered :return: value of geometric mean of Precision metric """ + if not isinstance(precisions, list) or not isinstance(max_order, int): + return None + + summation = float(0) + + for order in range(max_order): + if precisions[order] < 0: + return 0 + summation += math.log(precisions[order]) + + return math.exp(1 / max_order * summation) def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None: @@ -307,3 +359,31 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl :param max_order: max length of n-gram to consider for comparison :return: value of BLEU metric """ + if not isinstance(actual, str) or not isinstance( + reference, str) or max_order != 3: + return None + + actual_ngrams = [] + reference_ngrams = [] + + for order in range(max_order): + actual_ngram = collect_ngrams(actual, order + 1) + reference_ngram = collect_ngrams(reference, order + 1) + if actual_ngram is None or reference_ngram is None: + return None + actual_ngrams.append(actual_ngram) + reference_ngrams.append(reference_ngram) + + precisions = [] + + for i, j in zip(actual_ngrams, reference_ngrams): + precision = calculate_precision(i, j) + if precision is None: + return None + precisions.append(precision) + + average = geo_mean(precisions, max_order) + if average is None: + return None + + return average * 100 diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 7fa2a1703..d71b1c9c4 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -1,9 +1,11 @@ """ BPE Tokenizer starter """ +import json from pathlib import Path -import lab_2_tokenize_by_bpe.main as main_py +from lab_2_tokenize_by_bpe.main import (calculate_bleu, collect_frequencies, decode, encode, + get_vocabulary, train) def main() -> None: @@ -13,21 +15,37 @@ def main() -> None: assets_path = Path(__file__).parent / 'assets' with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - - word_frequencies = main_py.collect_frequencies(text, None, '') - word_frequencies = main_py.train(word_frequencies, 100) - - secret = [] - vocabulary = None - if word_frequencies is not None: - with open(assets_path / 'secrets' / 'secret_2.txt', 'r', encoding='utf-8') as secret_file: - secret_str = secret_file.read() - secret = [int(num) for num in secret_str.split(' ')] - vocabulary = main_py.get_vocabulary(word_frequencies, '') - - result = main_py.decode(secret, vocabulary, '') - print(result) - assert result, "Encoding is not working" + with open(assets_path / 'secrets/secret_2.txt', 'r', encoding='utf-8') as text_file: + encoded_secret = text_file.read() + dict_frequencies = collect_frequencies(text, None, '') + merged_tokens = train(dict_frequencies, 100) + if merged_tokens: + vocabulary = get_vocabulary(merged_tokens, '') + secret = [int(num) for num in encoded_secret.split()] + result = decode(secret, vocabulary, '') + print(result) + assert result, "Encoding is not working" + + with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file: + predicted = file.read() + with open(assets_path / 'vocab.json', 'r', encoding='utf-8') as file: + vocabulary = json.load(file) + with open(assets_path / 'for_translation_ru_encoded.txt', 'r', encoding='utf-8') as file: + actual = file.read() + + if [int(token) for token in actual.split()] == encode( + predicted, vocabulary, '\u2581', None, ''): + print("Encoding is successful!") + + with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file: + encoded_en = file.read() + with open(assets_path / 'for_translation_en_raw.txt', 'r', encoding='utf-8') as file: + decoded_en = file.read() + + decoded = decode([int(num) for num in encoded_en.split()], vocabulary, None) + decoded = decoded.replace('\u2581', ' ') + + print(calculate_bleu(decoded, decoded_en)) if __name__ == "__main__": diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index 45a4fb75d..f599e28b8 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -8 +10 From c75223c4f7df0826fb3249d9b62738fd4185ce32 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 8 Nov 2023 10:18:53 +0300 Subject: [PATCH 066/107] step 1 done --- lab_3_generate_by_ngrams/main.py | 124 +++++++++++++++++++--- lab_3_generate_by_ngrams/start.py | 6 +- lab_3_generate_by_ngrams/target_score.txt | 2 +- 3 files changed, 116 insertions(+), 16 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index dcf4e8af9..840d16eb4 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -23,6 +23,8 @@ def __init__(self, end_of_word_token: str) -> None: Args: end_of_word_token (str): A token denoting word boundary """ + self.end_of_word_token = end_of_word_token + self._storage = {} def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: """ @@ -41,6 +43,27 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ + if not isinstance(text, str): + return None + + tokenized_text = [] + for index, element in enumerate(text.lower()): + if element.isalpha(): + tokenized_text.append(element) + elif element.isdigit(): + pass + else: + if index == 0: + pass + elif tokenized_text[-1] != self.end_of_word_token: + tokenized_text.append(self.end_of_word_token) + else: + pass + + if len(tokenized_text) == 0: + return None + + return tuple(tokenized_text) def get_id(self, element: str) -> Optional[int]: """ @@ -55,6 +78,10 @@ def get_id(self, element: str) -> Optional[int]: In case of corrupt input arguments or arguments not included in storage, None is returned """ + if not isinstance(element, str) or element not in self._storage: + return None + + return self._storage[element] def get_end_of_word_token(self) -> str: """ @@ -63,6 +90,7 @@ def get_end_of_word_token(self) -> str: Returns: str: EoW token """ + return self.end_of_word_token def get_token(self, element_id: int) -> Optional[str]: """ @@ -76,6 +104,12 @@ def get_token(self, element_id: int) -> Optional[str]: In case of corrupt input arguments or arguments not included in storage, None is returned """ + inv_storage = {identifier: element for element, identifier in self._storage.items()} + + if not isinstance(element_id, int) or element_id not in inv_storage: + return None + + return inv_storage[element_id] def encode(self, text: str) -> Optional[tuple[int, ...]]: """ @@ -93,6 +127,23 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ + if not isinstance(text, str) or len(text) == 0: + return None + + tokenized_text = self._tokenize(text) + if tokenized_text is None: + return None + + for token in tokenized_text: + self._put(token) + + processed_text = [] + for token in tokenized_text: + if self.get_id(token) is None: + return None + processed_text.append(self.get_id(token)) + + return tuple(processed_text) def _put(self, element: str) -> None: """ @@ -104,6 +155,11 @@ def _put(self, element: str) -> None: In case of corrupt input arguments or invalid argument length, an element is not added to storage """ + if self.end_of_word_token not in self._storage: + self._storage[self.end_of_word_token] = 0 + + if isinstance(element, str) and len(element) == 1 and element not in self._storage: + self._storage[element] = len(self._storage) def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: """ @@ -121,6 +177,18 @@ def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ + if not isinstance(encoded_corpus, tuple) or len(encoded_corpus) == 0: + return None + + decoded_corpus = self._decode(encoded_corpus) + if decoded_corpus is None: + return None + + postpocessed_text = self._postprocess_decoded_text(decoded_corpus) + if postpocessed_text is None: + return None + + return postpocessed_text def fill_from_ngrams(self, content: dict) -> None: """ @@ -143,6 +211,18 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ + if not isinstance(corpus, tuple) or len(corpus) == 0: + return None + + decoded_corpus = [] + for identifier in corpus: + if self.get_token(identifier) is not None: + decoded_corpus.append(self.get_token(identifier)) + + if decoded_corpus is None or len(decoded_corpus) == 0: + return None + + return tuple(decoded_corpus) def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional[str]: """ @@ -159,6 +239,22 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional In case of corrupt input arguments, None is returned """ + if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0: + return None + + postprocessed_text = '' + for index, token in enumerate(decoded_corpus): + if index == 0: + postprocessed_text += token.upper() + elif token == self.end_of_word_token: + if index == len(decoded_corpus) - 1: + postprocessed_text += '.' + else: + postprocessed_text += ' ' + else: + postprocessed_text += token + + return postprocessed_text class NGramLanguageModel: @@ -223,7 +319,7 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: """ def _extract_n_grams( - self, encoded_corpus: tuple[int, ...] + self, encoded_corpus: tuple[int, ...] ) -> Optional[tuple[tuple[int, ...], ...]]: """ Split encoded sequence into n-grams. @@ -311,10 +407,10 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, """ def continue_sequence( - self, - sequence: tuple[int, ...], - next_tokens: list[tuple[int, float]], - sequence_candidates: dict[tuple[int, ...], float], + self, + sequence: tuple[int, ...], + next_tokens: list[tuple[int, float]], + sequence_candidates: dict[tuple[int, ...], float], ) -> Optional[dict[tuple[int, ...], float]]: """ Generate new sequences from the base sequence with next tokens provided. @@ -333,7 +429,7 @@ def continue_sequence( """ def prune_sequence_candidates( - self, sequence_candidates: dict[tuple[int, ...], float] + self, sequence_candidates: dict[tuple[int, ...], float] ) -> Optional[dict[tuple[int, ...], float]]: """ Remove those sequence candidates that do not make top-N most probable sequences. @@ -360,10 +456,10 @@ class BeamSearchTextGenerator: """ def __init__( - self, - language_model: NGramLanguageModel, - text_processor: TextProcessor, - beam_width: int, + self, + language_model: NGramLanguageModel, + text_processor: TextProcessor, + beam_width: int, ): """ Initializes an instance of BeamSearchTextGenerator. @@ -390,7 +486,7 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: """ def _get_next_token( - self, sequence_to_continue: tuple[int, ...] + self, sequence_to_continue: tuple[int, ...] ) -> Optional[list[tuple[int, float]]]: """ Retrieve next tokens for sequence continuation. @@ -460,9 +556,9 @@ class BackOffGenerator: """ def __init__( - self, - language_models: tuple[NGramLanguageModel, ...], - text_processor: TextProcessor, + self, + language_models: tuple[NGramLanguageModel, ...], + text_processor: TextProcessor, ): """ Initializes an instance of BackOffGenerator. diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index b9bcbd999..62b8236bf 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -1,6 +1,7 @@ """ Generation by NGrams starter """ +import lab_3_generate_by_ngrams.main as main_py def main() -> None: @@ -11,7 +12,10 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - result = None + text_processor = main_py.TextProcessor('_') + encoded_corpus = text_processor.encode(text) + decoded_text = text_processor.decode(encoded_corpus) + result = decoded_text assert result diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt index 573541ac9..b8626c4cf 100644 --- a/lab_3_generate_by_ngrams/target_score.txt +++ b/lab_3_generate_by_ngrams/target_score.txt @@ -1 +1 @@ -0 +4 From 85e30a1697e7652fa8bbfb405b327d505c5e2193 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 9 Nov 2023 10:59:10 +0300 Subject: [PATCH 067/107] step 2 done --- lab_3_generate_by_ngrams/main.py | 49 +++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 840d16eb4..bbf7bc489 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -275,6 +275,9 @@ def __init__(self, encoded_corpus: tuple | None, n_gram_size: int) -> None: encoded_corpus (tuple): Encoded text n_gram_size (int): A size of n-grams to use for language modelling """ + self._encoded_corpus = encoded_corpus + self._n_gram_size = n_gram_size + self._n_gram_frequencies = {} def get_n_gram_size(self) -> int: """ @@ -283,6 +286,7 @@ def get_n_gram_size(self) -> int: Returns: int: Size of stored n_grams """ + return self._n_gram_size def set_n_grams(self, frequencies: dict) -> None: """ @@ -304,6 +308,25 @@ def build(self) -> int: In case of corrupt input arguments or methods used return None, 1 is returned """ + if not isinstance(self._encoded_corpus, tuple) or len(self._encoded_corpus) == 0: + return 1 + + n_grams = self._extract_n_grams(self._encoded_corpus) + if n_grams is None: + return 1 + + for n_gram in n_grams: + abs_freq = n_grams.count(n_gram) + rel_freq = 0 + for n_gram_to_compare in n_grams: + if n_gram_to_compare[:self._n_gram_size - 1] == n_gram[:self._n_gram_size - 1]: + rel_freq += 1 + self._n_gram_frequencies[n_gram] = abs_freq / rel_freq + + if len(self._n_gram_frequencies) == 0: + return 1 + + return 0 def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: """ @@ -317,9 +340,20 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: In case of corrupt input arguments, None is returned """ + if not isinstance(sequence, tuple) or len(sequence) == 0 or len(sequence) < self._n_gram_size - 1: + return None + + possible_tokens = {} + + context = sequence[-(self._n_gram_size - 1)::] + for n_gram in self._n_gram_frequencies: + if n_gram[:self._n_gram_size - 1] == context: + possible_tokens[n_gram[-1]] = self._n_gram_frequencies[n_gram] + + return possible_tokens def _extract_n_grams( - self, encoded_corpus: tuple[int, ...] + self, encoded_corpus: tuple[int, ...] ) -> Optional[tuple[tuple[int, ...], ...]]: """ Split encoded sequence into n-grams. @@ -332,6 +366,19 @@ def _extract_n_grams( In case of corrupt input arguments, None is returned """ + if not isinstance(encoded_corpus, tuple) or len(encoded_corpus) == 0: + return None + + n_grams_list = [] + for index, number in enumerate(encoded_corpus): + if index == len(encoded_corpus) - 1: + break + n_gram = [number] + for i in range(1, self._n_gram_size): + n_gram.append(encoded_corpus[index + i]) + n_grams_list.append(tuple(n_gram)) + + return tuple(n_grams_list) class GreedyTextGenerator: From fe10f81f0155a697bfa7827872759db4d952f7bb Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 9 Nov 2023 12:40:28 +0300 Subject: [PATCH 068/107] step 3 done --- lab_3_generate_by_ngrams/main.py | 33 +++++++++++++++++++++-- lab_3_generate_by_ngrams/start.py | 7 ++++- lab_3_generate_by_ngrams/target_score.txt | 2 +- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index bbf7bc489..a94111ff4 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -23,7 +23,7 @@ def __init__(self, end_of_word_token: str) -> None: Args: end_of_word_token (str): A token denoting word boundary """ - self.end_of_word_token = end_of_word_token + self._end_of_word_token = end_of_word_token self._storage = {} def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: @@ -371,7 +371,7 @@ def _extract_n_grams( n_grams_list = [] for index, number in enumerate(encoded_corpus): - if index == len(encoded_corpus) - 1: + if index == len(encoded_corpus) - (self._n_gram_size - 1): break n_gram = [number] for i in range(1, self._n_gram_size): @@ -398,6 +398,8 @@ def __init__(self, language_model: NGramLanguageModel, text_processor: TextProce language_model (NGramLanguageModel): A language model to use for text generation text_processor (TextProcessor): A TextProcessor instance to handle text processing """ + self._model = language_model + self._text_processor = text_processor def run(self, seq_len: int, prompt: str) -> Optional[str]: """ @@ -413,6 +415,33 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ + if not isinstance(seq_len, int) or not isinstance(prompt, str) or len(prompt) == 0: + return None + + encoded_text_tuple = self._text_processor.encode(prompt) + if encoded_text_tuple is None: + return None + # context_len = self._model.get_n_gram_size() - 1 + encoded_text_list = list(encoded_text_tuple) + + for i in range(seq_len): + possible_tokens = self._model.generate_next_token(encoded_text_tuple) + if possible_tokens is None: + return prompt + '.' + if len(possible_tokens) == 0: + break + # max_value = sorted(list(possible_tokens.values()), reverse=True)[0] + # max_freq_tokens = [(key, value) for key, value in possible_tokens.items() if value == max_value] + possible_tokens_list = list(possible_tokens.items()) + possible_tokens_list.sort(key=lambda x: x[0], reverse=True) + possible_tokens_list.sort(key=lambda x: x[1], reverse=True) + + encoded_text_list.append(possible_tokens_list[0][0]) + encoded_text_tuple = tuple(encoded_text_list) + + decoded_text = self._text_processor.decode(encoded_text_tuple) + '.' + + return decoded_text class BeamSearcher: diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index 62b8236bf..105c4359b 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -15,7 +15,12 @@ def main() -> None: text_processor = main_py.TextProcessor('_') encoded_corpus = text_processor.encode(text) decoded_text = text_processor.decode(encoded_corpus) - result = decoded_text + + language_model = main_py.NGramLanguageModel(encoded_corpus, 7) + greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) + generated_text = greedy_generator.run(51, 'Vernon') + + result = generated_text assert result diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt index b8626c4cf..1e8b31496 100644 --- a/lab_3_generate_by_ngrams/target_score.txt +++ b/lab_3_generate_by_ngrams/target_score.txt @@ -1 +1 @@ -4 +6 From 60f02073ff703a1220631230353a4a64c12b2c8e Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 9 Nov 2023 12:43:59 +0300 Subject: [PATCH 069/107] TextProcessor arg _end_of_word_token fixed --- lab_3_generate_by_ngrams/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index a94111ff4..c637628f3 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -90,7 +90,7 @@ def get_end_of_word_token(self) -> str: Returns: str: EoW token """ - return self.end_of_word_token + return self._end_of_word_token def get_token(self, element_id: int) -> Optional[str]: """ @@ -155,8 +155,8 @@ def _put(self, element: str) -> None: In case of corrupt input arguments or invalid argument length, an element is not added to storage """ - if self.end_of_word_token not in self._storage: - self._storage[self.end_of_word_token] = 0 + if self._end_of_word_token not in self._storage: + self._storage[self._end_of_word_token] = 0 if isinstance(element, str) and len(element) == 1 and element not in self._storage: self._storage[element] = len(self._storage) @@ -246,7 +246,7 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional for index, token in enumerate(decoded_corpus): if index == 0: postprocessed_text += token.upper() - elif token == self.end_of_word_token: + elif token == self._end_of_word_token: if index == len(decoded_corpus) - 1: postprocessed_text += '.' else: From fe72367d701de8dc4f60a4386a04f61caf266df0 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 9 Nov 2023 12:46:10 +0300 Subject: [PATCH 070/107] TextProcessor arg _end_of_word_token fixed --- lab_3_generate_by_ngrams/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index c637628f3..d030f9121 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -55,8 +55,8 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: else: if index == 0: pass - elif tokenized_text[-1] != self.end_of_word_token: - tokenized_text.append(self.end_of_word_token) + elif tokenized_text[-1] != self._end_of_word_token: + tokenized_text.append(self._end_of_word_token) else: pass From 2df756017ceac412b1e0e3dc913f5ade702da66a Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 9 Nov 2023 13:00:49 +0300 Subject: [PATCH 071/107] TextProcessor arg _storage fixed --- lab_3_generate_by_ngrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index d030f9121..7e6918718 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -24,7 +24,7 @@ def __init__(self, end_of_word_token: str) -> None: end_of_word_token (str): A token denoting word boundary """ self._end_of_word_token = end_of_word_token - self._storage = {} + self._storage = {self._end_of_word_token: 0} def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: """ From 6d3833f384b740af6e8303d8789c506e575b5106 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Fri, 17 Nov 2023 11:02:36 +0300 Subject: [PATCH 072/107] commit --- lab_3_generate_by_ngrams/main.py | 95 +++++++++++++++++++------------- 1 file changed, 56 insertions(+), 39 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 7e6918718..d075cf5e7 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -57,10 +57,8 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: pass elif tokenized_text[-1] != self._end_of_word_token: tokenized_text.append(self._end_of_word_token) - else: - pass - if len(tokenized_text) == 0: + if not tokenized_text: return None return tuple(tokenized_text) @@ -104,9 +102,12 @@ def get_token(self, element_id: int) -> Optional[str]: In case of corrupt input arguments or arguments not included in storage, None is returned """ + if not isinstance(element_id, int): + return None + inv_storage = {identifier: element for element, identifier in self._storage.items()} - if not isinstance(element_id, int) or element_id not in inv_storage: + if element_id not in inv_storage: return None return inv_storage[element_id] @@ -127,7 +128,7 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ - if not isinstance(text, str) or len(text) == 0: + if not isinstance(text, str) or not len(text): return None tokenized_text = self._tokenize(text) @@ -155,9 +156,6 @@ def _put(self, element: str) -> None: In case of corrupt input arguments or invalid argument length, an element is not added to storage """ - if self._end_of_word_token not in self._storage: - self._storage[self._end_of_word_token] = 0 - if isinstance(element, str) and len(element) == 1 and element not in self._storage: self._storage[element] = len(self._storage) @@ -177,7 +175,7 @@ def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ - if not isinstance(encoded_corpus, tuple) or len(encoded_corpus) == 0: + if not isinstance(encoded_corpus, tuple) or not encoded_corpus: return None decoded_corpus = self._decode(encoded_corpus) @@ -211,7 +209,7 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ - if not isinstance(corpus, tuple) or len(corpus) == 0: + if not isinstance(corpus, tuple) or not corpus: return None decoded_corpus = [] @@ -219,7 +217,7 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: if self.get_token(identifier) is not None: decoded_corpus.append(self.get_token(identifier)) - if decoded_corpus is None or len(decoded_corpus) == 0: + if decoded_corpus is None or not decoded_corpus: return None return tuple(decoded_corpus) @@ -239,7 +237,7 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional In case of corrupt input arguments, None is returned """ - if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0: + if not isinstance(decoded_corpus, tuple) or not decoded_corpus: return None postprocessed_text = '' @@ -253,6 +251,8 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional postprocessed_text += ' ' else: postprocessed_text += token + if postprocessed_text[-1] != '.': + postprocessed_text += '.' return postprocessed_text @@ -308,7 +308,7 @@ def build(self) -> int: In case of corrupt input arguments or methods used return None, 1 is returned """ - if not isinstance(self._encoded_corpus, tuple) or len(self._encoded_corpus) == 0: + if not isinstance(self._encoded_corpus, tuple) or not self._encoded_corpus: return 1 n_grams = self._extract_n_grams(self._encoded_corpus) @@ -323,7 +323,7 @@ def build(self) -> int: rel_freq += 1 self._n_gram_frequencies[n_gram] = abs_freq / rel_freq - if len(self._n_gram_frequencies) == 0: + if not self._n_gram_frequencies: return 1 return 0 @@ -340,7 +340,7 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: In case of corrupt input arguments, None is returned """ - if not isinstance(sequence, tuple) or len(sequence) == 0 or len(sequence) < self._n_gram_size - 1: + if not isinstance(sequence, tuple) or not sequence or len(sequence) < self._n_gram_size - 1: return None possible_tokens = {} @@ -366,16 +366,15 @@ def _extract_n_grams( In case of corrupt input arguments, None is returned """ - if not isinstance(encoded_corpus, tuple) or len(encoded_corpus) == 0: + if not isinstance(encoded_corpus, tuple) or not encoded_corpus: return None n_grams_list = [] for index, number in enumerate(encoded_corpus): if index == len(encoded_corpus) - (self._n_gram_size - 1): break - n_gram = [number] - for i in range(1, self._n_gram_size): - n_gram.append(encoded_corpus[index + i]) + n_gram = [] + n_gram.extend(encoded_corpus[index:index + self._n_gram_size]) n_grams_list.append(tuple(n_gram)) return tuple(n_grams_list) @@ -415,31 +414,27 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ - if not isinstance(seq_len, int) or not isinstance(prompt, str) or len(prompt) == 0: + if not isinstance(seq_len, int) or not isinstance(prompt, str) or not prompt: return None encoded_text_tuple = self._text_processor.encode(prompt) if encoded_text_tuple is None: return None - # context_len = self._model.get_n_gram_size() - 1 encoded_text_list = list(encoded_text_tuple) for i in range(seq_len): possible_tokens = self._model.generate_next_token(encoded_text_tuple) if possible_tokens is None: return prompt + '.' - if len(possible_tokens) == 0: + if not possible_tokens: break - # max_value = sorted(list(possible_tokens.values()), reverse=True)[0] - # max_freq_tokens = [(key, value) for key, value in possible_tokens.items() if value == max_value] possible_tokens_list = list(possible_tokens.items()) - possible_tokens_list.sort(key=lambda x: x[0], reverse=True) possible_tokens_list.sort(key=lambda x: x[1], reverse=True) encoded_text_list.append(possible_tokens_list[0][0]) encoded_text_tuple = tuple(encoded_text_list) - decoded_text = self._text_processor.decode(encoded_text_tuple) + '.' + decoded_text = self._text_processor.decode(encoded_text_tuple) return decoded_text @@ -461,6 +456,8 @@ def __init__(self, beam_width: int, language_model: NGramLanguageModel) -> None: beam_width (int): Number of candidates to consider at each step language_model (NGramLanguageModel): A language model to use for next token prediction """ + self._beam_width = beam_width + self._model = language_model def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, float]]]: """ @@ -481,12 +478,31 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, In case of corrupt input arguments or methods used return None. """ + if not isinstance(sequence, tuple) or not sequence: + return None + + possible_tokens = self._model.generate_next_token(sequence) + if possible_tokens is None: + return None + if not possible_tokens: + return [] + + possible_tokens_list = list(possible_tokens.items()) + possible_tokens_list.sort(key=lambda x: x[0], reverse=True) + possible_tokens_list.sort(key=lambda x: x[1], reverse=True) + + best_tokens = [] + for _ in range(self._beam_width): + best_tokens.append(possible_tokens_list[0]) + possible_tokens_list.pop(0) + + return best_tokens def continue_sequence( - self, - sequence: tuple[int, ...], - next_tokens: list[tuple[int, float]], - sequence_candidates: dict[tuple[int, ...], float], + self, + sequence: tuple[int, ...], + next_tokens: list[tuple[int, float]], + sequence_candidates: dict[tuple[int, ...], float], ) -> Optional[dict[tuple[int, ...], float]]: """ Generate new sequences from the base sequence with next tokens provided. @@ -504,8 +520,9 @@ def continue_sequence( In case of corrupt input arguments or unexpected behaviour of methods used return None. """ + def prune_sequence_candidates( - self, sequence_candidates: dict[tuple[int, ...], float] + self, sequence_candidates: dict[tuple[int, ...], float] ) -> Optional[dict[tuple[int, ...], float]]: """ Remove those sequence candidates that do not make top-N most probable sequences. @@ -532,10 +549,10 @@ class BeamSearchTextGenerator: """ def __init__( - self, - language_model: NGramLanguageModel, - text_processor: TextProcessor, - beam_width: int, + self, + language_model: NGramLanguageModel, + text_processor: TextProcessor, + beam_width: int, ): """ Initializes an instance of BeamSearchTextGenerator. @@ -562,7 +579,7 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: """ def _get_next_token( - self, sequence_to_continue: tuple[int, ...] + self, sequence_to_continue: tuple[int, ...] ) -> Optional[list[tuple[int, float]]]: """ Retrieve next tokens for sequence continuation. @@ -632,9 +649,9 @@ class BackOffGenerator: """ def __init__( - self, - language_models: tuple[NGramLanguageModel, ...], - text_processor: TextProcessor, + self, + language_models: tuple[NGramLanguageModel, ...], + text_processor: TextProcessor, ): """ Initializes an instance of BackOffGenerator. From 7ef974ae154468eb7e0f6a8e5624ac4e19fdfcb7 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Tue, 21 Nov 2023 07:20:33 +0300 Subject: [PATCH 073/107] step 4.3 done --- lab_3_generate_by_ngrams/main.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index d075cf5e7..6076c38f0 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -5,6 +5,7 @@ """ # pylint:disable=too-few-public-methods from typing import Optional +import math class TextProcessor: @@ -519,7 +520,29 @@ def continue_sequence( In case of corrupt input arguments or unexpected behaviour of methods used return None. """ + if not isinstance(sequence, tuple) or not isinstance(next_tokens, list) \ + or not isinstance(sequence_candidates, dict): + return None + if not sequence or not next_tokens or not sequence_candidates: + return None + if sequence not in sequence_candidates: + return None + + copy_seq_candidates = sequence_candidates.copy() + list_sequence = list(sequence) + + for token in next_tokens: + list_sequence.append(token[0]) + possible_seq = tuple(list_sequence) + freq = sequence_candidates[sequence] - math.log(token[-1]) + copy_seq_candidates[possible_seq] = freq + copy_seq_candidates.pop(sequence) + + if len(copy_seq_candidates) > self._beam_width: + return None + + return copy_seq_candidates def prune_sequence_candidates( self, sequence_candidates: dict[tuple[int, ...], float] @@ -535,6 +558,8 @@ def prune_sequence_candidates( In case of corrupt input arguments return None. """ + if not isinstance(sequence_candidates, dict) or not sequence_candidates: + return None class BeamSearchTextGenerator: From 8def004773913f60dda5374449bba242b9bc99e0 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 23 Nov 2023 01:44:16 +0300 Subject: [PATCH 074/107] step 5.2 done --- lab_3_generate_by_ngrams/main.py | 63 +++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 6076c38f0..0a78fe90b 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -6,6 +6,7 @@ # pylint:disable=too-few-public-methods from typing import Optional import math +import operator class TextProcessor: @@ -54,7 +55,7 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: elif element.isdigit(): pass else: - if index == 0: + if not index: pass elif tokenized_text[-1] != self._end_of_word_token: tokenized_text.append(self._end_of_word_token) @@ -243,7 +244,7 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional postprocessed_text = '' for index, token in enumerate(decoded_corpus): - if index == 0: + if not index: postprocessed_text += token.upper() elif token == self._end_of_word_token: if index == len(decoded_corpus) - 1: @@ -430,7 +431,7 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: if not possible_tokens: break possible_tokens_list = list(possible_tokens.items()) - possible_tokens_list.sort(key=lambda x: x[1], reverse=True) + possible_tokens_list.sort(key=operator.itemgetter(1,0), reverse=True) encoded_text_list.append(possible_tokens_list[0][0]) encoded_text_tuple = tuple(encoded_text_list) @@ -489,13 +490,8 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, return [] possible_tokens_list = list(possible_tokens.items()) - possible_tokens_list.sort(key=lambda x: x[0], reverse=True) - possible_tokens_list.sort(key=lambda x: x[1], reverse=True) - - best_tokens = [] - for _ in range(self._beam_width): - best_tokens.append(possible_tokens_list[0]) - possible_tokens_list.pop(0) + sorted_tokens_list = sorted(possible_tokens_list, key=operator.itemgetter(1,0), reverse=True) + best_tokens = sorted_tokens_list[:self._beam_width] return best_tokens @@ -561,6 +557,13 @@ def prune_sequence_candidates( if not isinstance(sequence_candidates, dict) or not sequence_candidates: return None + sorted_sequences = sorted(tuple(sequence_candidates.items()), key=operator.itemgetter(0,1), reverse=True) + result = {} + for sequence in sorted_sequences[:self._beam_width]: + result[sequence[0]] = sequence[1] + + return result + class BeamSearchTextGenerator: """ @@ -587,6 +590,10 @@ def __init__( text_processor (TextProcessor): A TextProcessor instance to handle text processing beam_width (int): Beam width parameter for generation """ + self._text_processor = text_processor + self._beam_width = beam_width + self._language_model = language_model + self.beam_searcher = BeamSearcher(self._beam_width, self._language_model) def run(self, prompt: str, seq_len: int) -> Optional[str]: """ @@ -602,6 +609,34 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ + if not isinstance(seq_len, int) or seq_len <= 0 or not isinstance(prompt, str) or not prompt: + return None + + encoded_prompt = self._text_processor.encode(prompt) + if encoded_prompt is None: + return None + seq_candidates = {encoded_prompt: 0.0} + + for _ in range(seq_len): + for sequence in seq_candidates: + next_tokens = self._get_next_token(sequence) + if not next_tokens: + return None + + seq_candidates = self.beam_searcher.continue_sequence(encoded_prompt, next_tokens, seq_candidates) + if not seq_candidates: + return None + + best_sequences = self.beam_searcher.prune_sequence_candidates(seq_candidates) + if not best_sequences: + return None + + seq_candidates = best_sequences + + sorted_candidates = sorted(seq_candidates, key=operator.itemgetter(0, 1), reverse=True) + decoded_sequence = self._text_processor.decode(sorted_candidates[0]) + + return decoded_sequence def _get_next_token( self, sequence_to_continue: tuple[int, ...] @@ -618,6 +653,14 @@ def _get_next_token( In case of corrupt input arguments return None. """ + if not isinstance(sequence_to_continue, tuple) or not sequence_to_continue: + return None + + next_token = self.beam_searcher.get_next_token(sequence_to_continue) + if next_token is None: + return None + + return next_token class NGramLanguageModelReader: From 55816fca5c14c78743e3bb57688052750808b0b7 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 23 Nov 2023 12:49:57 +0300 Subject: [PATCH 075/107] beamsearchtextgenerator run() --- lab_3_generate_by_ngrams/main.py | 34 ++++++++++++++++---------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 0a78fe90b..8583bdbf5 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -523,6 +523,8 @@ def continue_sequence( return None if sequence not in sequence_candidates: return None + if len(next_tokens) > self._beam_width: + return None copy_seq_candidates = sequence_candidates.copy() list_sequence = list(sequence) @@ -532,12 +534,10 @@ def continue_sequence( possible_seq = tuple(list_sequence) freq = sequence_candidates[sequence] - math.log(token[-1]) copy_seq_candidates[possible_seq] = freq + list_sequence.remove(token[0]) copy_seq_candidates.pop(sequence) - if len(copy_seq_candidates) > self._beam_width: - return None - return copy_seq_candidates def prune_sequence_candidates( @@ -557,7 +557,7 @@ def prune_sequence_candidates( if not isinstance(sequence_candidates, dict) or not sequence_candidates: return None - sorted_sequences = sorted(tuple(sequence_candidates.items()), key=operator.itemgetter(0,1), reverse=True) + sorted_sequences = sorted(sequence_candidates.items(), key=operator.itemgetter(1, 0)) result = {} for sequence in sorted_sequences[:self._beam_width]: result[sequence[0]] = sequence[1] @@ -615,28 +615,28 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: encoded_prompt = self._text_processor.encode(prompt) if encoded_prompt is None: return None - seq_candidates = {encoded_prompt: 0.0} + candidates = {encoded_prompt: 0.0} for _ in range(seq_len): - for sequence in seq_candidates: + updated_candidates = candidates.copy() + + for sequence in candidates: next_tokens = self._get_next_token(sequence) if not next_tokens: return None - seq_candidates = self.beam_searcher.continue_sequence(encoded_prompt, next_tokens, seq_candidates) - if not seq_candidates: - return None - - best_sequences = self.beam_searcher.prune_sequence_candidates(seq_candidates) - if not best_sequences: - return None + possible_sequences = self.beam_searcher.continue_sequence(sequence, next_tokens, updated_candidates) + if not possible_sequences: + return self._text_processor.decode(sorted(tuple( + candidates), key=lambda x: x[1])[0]) - seq_candidates = best_sequences + sorted_candidates = self.beam_searcher.prune_sequence_candidates(updated_candidates) + if not sorted_candidates: + return None - sorted_candidates = sorted(seq_candidates, key=operator.itemgetter(0, 1), reverse=True) - decoded_sequence = self._text_processor.decode(sorted_candidates[0]) + candidates = sorted_candidates - return decoded_sequence + return self._text_processor.decode(sorted(tuple(candidates), key=lambda x: x[1])[0]) def _get_next_token( self, sequence_to_continue: tuple[int, ...] From 35ff145d4d22a1def21cd44af1095e26a2d270f2 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Wed, 29 Nov 2023 13:58:24 +0300 Subject: [PATCH 076/107] BeamSearchTextGenerator fixed --- lab_3_generate_by_ngrams/main.py | 37 +++++++++++++++-------- lab_3_generate_by_ngrams/start.py | 4 ++- lab_3_generate_by_ngrams/target_score.txt | 2 +- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 8583bdbf5..371082fae 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -347,10 +347,13 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: possible_tokens = {} - context = sequence[-(self._n_gram_size - 1)::] - for n_gram in self._n_gram_frequencies: + context = sequence[-(self._n_gram_size - 1):] + + sort_data = dict(sorted(self._n_gram_frequencies.items(), key=lambda x: (x[1], list(x[0])))) + + for n_gram, freq in sort_data.items(): if n_gram[:self._n_gram_size - 1] == context: - possible_tokens[n_gram[-1]] = self._n_gram_frequencies[n_gram] + possible_tokens[n_gram[-1]] = freq return possible_tokens @@ -427,7 +430,7 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: for i in range(seq_len): possible_tokens = self._model.generate_next_token(encoded_text_tuple) if possible_tokens is None: - return prompt + '.' + return None if not possible_tokens: break possible_tokens_list = list(possible_tokens.items()) @@ -534,7 +537,7 @@ def continue_sequence( possible_seq = tuple(list_sequence) freq = sequence_candidates[sequence] - math.log(token[-1]) copy_seq_candidates[possible_seq] = freq - list_sequence.remove(token[0]) + list_sequence = list_sequence[:-1] copy_seq_candidates.pop(sequence) @@ -617,8 +620,11 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: return None candidates = {encoded_prompt: 0.0} + updated_candidates = candidates.copy() + for _ in range(seq_len): - updated_candidates = candidates.copy() + candidates = updated_candidates + not_sorted_candidates = {} for sequence in candidates: next_tokens = self._get_next_token(sequence) @@ -627,16 +633,21 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: possible_sequences = self.beam_searcher.continue_sequence(sequence, next_tokens, updated_candidates) if not possible_sequences: - return self._text_processor.decode(sorted(tuple( - candidates), key=lambda x: x[1])[0]) + return self._text_processor.decode(sorted(tuple(updated_candidates), key=lambda x: x[1])[0]) - sorted_candidates = self.beam_searcher.prune_sequence_candidates(updated_candidates) - if not sorted_candidates: - return None + not_sorted_candidates.update(possible_sequences) + + for candidate in candidates: + if candidate in not_sorted_candidates: + del not_sorted_candidates[candidate] + + sorted_candidates = self.beam_searcher.prune_sequence_candidates(not_sorted_candidates) + if not sorted_candidates: + return None - candidates = sorted_candidates + updated_candidates = sorted_candidates - return self._text_processor.decode(sorted(tuple(candidates), key=lambda x: x[1])[0]) + return self._text_processor.decode(sorted(tuple(updated_candidates), key=lambda x: x[1])[0]) def _get_next_token( self, sequence_to_continue: tuple[int, ...] diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index 105c4359b..b8ff775c6 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -19,8 +19,10 @@ def main() -> None: language_model = main_py.NGramLanguageModel(encoded_corpus, 7) greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) generated_text = greedy_generator.run(51, 'Vernon') + beam_search_generator = main_py.BeamSearchTextGenerator(language_model,text_processor,3) + beam_search_generated_text = beam_search_generator.run("Vernon", 56) - result = generated_text + result = beam_search_generated_text assert result diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt index 1e8b31496..45a4fb75d 100644 --- a/lab_3_generate_by_ngrams/target_score.txt +++ b/lab_3_generate_by_ngrams/target_score.txt @@ -1 +1 @@ -6 +8 From 50c9f16df14281caafcd52016d41bdd9f272032a Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 30 Nov 2023 12:00:55 +0300 Subject: [PATCH 077/107] fill_from_ngrams, set_n_grams, reader init and load --- lab_3_generate_by_ngrams/main.py | 51 ++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 371082fae..2fe335465 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -7,6 +7,7 @@ from typing import Optional import math import operator +import json class TextProcessor: @@ -197,6 +198,13 @@ def fill_from_ngrams(self, content: dict) -> None: Args: content (dict): ngrams from external JSON """ + if not isinstance(content, dict) or not content: + return None + + for key in content['freq']: + for n_gram in key: + if n_gram.isalpha(): + self._put(n_gram) def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: """ @@ -297,6 +305,10 @@ def set_n_grams(self, frequencies: dict) -> None: Args: frequencies (dict): Computed in advance frequencies for n-grams """ + if not isinstance(frequencies, dict) or not frequencies: + return None + + self._n_gram_frequencies.update(frequencies) def build(self) -> int: """ @@ -692,6 +704,14 @@ def __init__(self, json_path: str, eow_token: str) -> None: json_path (str): Local path to assets file eow_token (str): Special token for text processor """ + self._json_path = json_path + self._eow_token = eow_token + self._text_processor = TextProcessor(self._eow_token) + + with open(self._json_path, 'r', encoding='utf-8') as file: + json_text = json.load(file) + self._content = json_text + self._text_processor.fill_from_ngrams(self._content) def load(self, n_gram_size: int) -> Optional[NGramLanguageModel]: """ @@ -708,6 +728,36 @@ def load(self, n_gram_size: int) -> Optional[NGramLanguageModel]: In case of corrupt input arguments or unexpected behaviour of methods used, return 1. """ + if not isinstance(n_gram_size, int) or not n_gram_size or n_gram_size < 2: + return None + + frequencies = {} + for key in self._content['freq']: + encoded = [] + for element in key: + if element == ' ': + encoded.append(0) + elif element.isalpha(): + encoded.append(self._text_processor.get_id(element.lower())) + + if tuple(encoded) not in frequencies: + frequencies[tuple(encoded)] = 0 + frequencies[tuple(encoded)] += self._content['freq'][key] + + right_ngrams = {} + for ngram in frequencies: + if len(ngram) == n_gram_size: + abs_freq = frequencies[ngram] + rel_freq = 0 + for ngram_to_compare in frequencies: + if ngram_to_compare[:n_gram_size - 1] == ngram[:n_gram_size - 1]: + rel_freq += 1 + freq = abs_freq / rel_freq + right_ngrams[ngram] = freq + + lang_model = NGramLanguageModel(None, n_gram_size) + lang_model.set_n_grams(right_ngrams) + return lang_model def get_text_processor(self) -> TextProcessor: # type: ignore[empty-body] """ @@ -740,6 +790,7 @@ def __init__( text_processor (TextProcessor): A TextProcessor instance to handle text processing """ + def run(self, seq_len: int, prompt: str) -> Optional[str]: """ Generate sequence based on NGram language model and prompt provided. From 661dd904957d2c17eac347a75fd951888603f82f Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 30 Nov 2023 12:12:48 +0300 Subject: [PATCH 078/107] reader load fixed --- lab_3_generate_by_ngrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 2fe335465..5f5e062b6 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -751,7 +751,7 @@ def load(self, n_gram_size: int) -> Optional[NGramLanguageModel]: rel_freq = 0 for ngram_to_compare in frequencies: if ngram_to_compare[:n_gram_size - 1] == ngram[:n_gram_size - 1]: - rel_freq += 1 + rel_freq += frequencies[ngram_to_compare] freq = abs_freq / rel_freq right_ngrams[ngram] = freq From ad2890cdf19b22613f561f97efebf54555399776 Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Thu, 30 Nov 2023 15:46:17 +0300 Subject: [PATCH 079/107] checkout labs from the origin repository --- lab_3_generate_by_ngrams/main.py | 55 -------------------------------- 1 file changed, 55 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 2b6c1eec2..5f5e062b6 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -4,8 +4,6 @@ Beam-search and natural language generation evaluation """ # pylint:disable=too-few-public-methods -import json -import math from typing import Optional import math import operator @@ -768,7 +766,6 @@ def get_text_processor(self) -> TextProcessor: # type: ignore[empty-body] Returns: TextProcessor: processor created for the current JSON file. """ - return self._text_processor class BackOffGenerator: @@ -792,8 +789,6 @@ def __init__( language_models (tuple[NGramLanguageModel]): Language models to use for text generation text_processor (TextProcessor): A TextProcessor instance to handle text processing """ - self._language_models = {model.get_n_gram_size(): model for model in language_models} - self._text_processor = text_processor def run(self, seq_len: int, prompt: str) -> Optional[str]: @@ -810,39 +805,6 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ - if not ( - isinstance(seq_len, int) and isinstance(prompt, str) and prompt - ): - return None - - encoded_prompt = self._text_processor.encode(prompt) - - if encoded_prompt is None: - return None - - iteration = 1 - generated_sequence = list(encoded_prompt) - while iteration <= seq_len: - next_token_candidates = None - for n_gram_size in sorted(self._language_models.keys(), reverse=True): - next_token_candidates = self._get_next_token( - tuple(generated_sequence[-(n_gram_size - 1):])) - if next_token_candidates is not None and len(next_token_candidates) > 0: - break - - if next_token_candidates is None or len(next_token_candidates) == 0: - break - - max_prob = max(next_token_candidates.values()) - max_probability_token = [token for token, prob in next_token_candidates.items() - if prob == max_prob] - generated_sequence.append(max_probability_token[0]) - - iteration += 1 - - decoded_sequence = self._text_processor.decode(tuple(generated_sequence)) - - return decoded_sequence def _get_next_token(self, sequence_to_continue: tuple[int, ...]) -> Optional[dict[int, float]]: """ @@ -857,20 +819,3 @@ def _get_next_token(self, sequence_to_continue: tuple[int, ...]) -> Optional[dic In case of corrupt input arguments return None. """ - if not (isinstance(sequence_to_continue, tuple) and sequence_to_continue - and self._language_models): - return None - - n_gram_sizes = sorted(self._language_models.keys(), reverse=True) - - for n_gram_size in n_gram_sizes: - n_gram_model = self._language_models[n_gram_size] - - token_candidates = n_gram_model.generate_next_token(sequence_to_continue) - - if token_candidates is not None and len(token_candidates) > 0: - token_probabilities = {token: freq / sum(token_candidates.values()) - for token, freq in token_candidates.items()} - return token_probabilities - - return None From c61225601dc2f1f10f690f867b2aab941d901f98 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 30 Nov 2023 21:19:51 +0300 Subject: [PATCH 080/107] GreedyTextGenerator run() fixed, BackOffGenerator --- lab_3_generate_by_ngrams/main.py | 105 +++++++++++++--------- lab_3_generate_by_ngrams/start.py | 20 +++-- lab_3_generate_by_ngrams/target_score.txt | 2 +- 3 files changed, 78 insertions(+), 49 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 5f5e062b6..378b61597 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -55,11 +55,10 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: tokenized_text.append(element) elif element.isdigit(): pass - else: - if not index: - pass - elif tokenized_text[-1] != self._end_of_word_token: - tokenized_text.append(self._end_of_word_token) + elif not index: + pass + elif tokenized_text[-1] != self._end_of_word_token: + tokenized_text.append(self._end_of_word_token) if not tokenized_text: return None @@ -202,9 +201,9 @@ def fill_from_ngrams(self, content: dict) -> None: return None for key in content['freq']: - for n_gram in key: - if n_gram.isalpha(): - self._put(n_gram) + for element in key: + if element.isalpha(): + self._put(element.lower()) def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: """ @@ -385,16 +384,8 @@ def _extract_n_grams( """ if not isinstance(encoded_corpus, tuple) or not encoded_corpus: return None - - n_grams_list = [] - for index, number in enumerate(encoded_corpus): - if index == len(encoded_corpus) - (self._n_gram_size - 1): - break - n_gram = [] - n_gram.extend(encoded_corpus[index:index + self._n_gram_size]) - n_grams_list.append(tuple(n_gram)) - - return tuple(n_grams_list) + return tuple(encoded_corpus[index:index + self._n_gram_size] + for index in range(len(encoded_corpus) - 1)) class GreedyTextGenerator: @@ -434,26 +425,25 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: if not isinstance(seq_len, int) or not isinstance(prompt, str) or not prompt: return None - encoded_text_tuple = self._text_processor.encode(prompt) - if encoded_text_tuple is None: + n_gram_size = self._model.get_n_gram_size() + encoded = self._text_processor.encode(prompt) + if not encoded or not n_gram_size: return None - encoded_text_list = list(encoded_text_tuple) - for i in range(seq_len): - possible_tokens = self._model.generate_next_token(encoded_text_tuple) - if possible_tokens is None: - return None - if not possible_tokens: + max_freq = [] + for _ in range(seq_len): + tokens = self._model.generate_next_token(encoded[-n_gram_size + 1:]) + if not tokens: break - possible_tokens_list = list(possible_tokens.items()) - possible_tokens_list.sort(key=operator.itemgetter(1,0), reverse=True) + max_freq.append(max(tokens.values())) + candidates_max = filter(lambda token_freq: token_freq[1] == max_freq[-1], + tokens.items()) + encoded += (sorted(candidates_max)[0][0],) - encoded_text_list.append(possible_tokens_list[0][0]) - encoded_text_tuple = tuple(encoded_text_list) - - decoded_text = self._text_processor.decode(encoded_text_tuple) - - return decoded_text + text = self._text_processor.decode(encoded) + if not text: + return None + return text class BeamSearcher: @@ -679,11 +669,7 @@ def _get_next_token( if not isinstance(sequence_to_continue, tuple) or not sequence_to_continue: return None - next_token = self.beam_searcher.get_next_token(sequence_to_continue) - if next_token is None: - return None - - return next_token + return self.beam_searcher.get_next_token(sequence_to_continue) class NGramLanguageModelReader: @@ -766,6 +752,7 @@ def get_text_processor(self) -> TextProcessor: # type: ignore[empty-body] Returns: TextProcessor: processor created for the current JSON file. """ + return self._text_processor class BackOffGenerator: @@ -789,7 +776,8 @@ def __init__( language_models (tuple[NGramLanguageModel]): Language models to use for text generation text_processor (TextProcessor): A TextProcessor instance to handle text processing """ - + self._language_models = {lang_model.get_n_gram_size(): lang_model for lang_model in language_models} + self._text_processor = text_processor def run(self, seq_len: int, prompt: str) -> Optional[str]: """ @@ -805,6 +793,26 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ + if not isinstance(seq_len, int) or not isinstance(prompt, str) or not prompt: + return None + + encoded_sequence = self._text_processor.encode(prompt) + if not encoded_sequence: + return None + for _ in range(seq_len): + candidates = self._get_next_token(encoded_sequence) + if not candidates: + break + max_probability = max(candidates.values()) + best_candidates = [] + for element, freq in candidates.items(): + if freq == max_probability: + best_candidates.append(element) + encoded_sequence += (best_candidates[0],) + decoded = self._text_processor.decode(encoded_sequence) + if not decoded: + return None + return decoded def _get_next_token(self, sequence_to_continue: tuple[int, ...]) -> Optional[dict[int, float]]: """ @@ -819,3 +827,20 @@ def _get_next_token(self, sequence_to_continue: tuple[int, ...]) -> Optional[dic In case of corrupt input arguments return None. """ + if not isinstance(sequence_to_continue, tuple) or not sequence_to_continue: + return None + + ngram_size_list = list(self._language_models.keys()) + if not ngram_size_list: + return None + ngram_size_list.sort(reverse=True) + for ngram_size in ngram_size_list: + candidates = self._language_models[ngram_size].generate_next_token(sequence_to_continue) + if candidates is None: + return None + if not candidates: + continue + else: + return candidates + + return None diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index b8ff775c6..72909973e 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -12,18 +12,22 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - text_processor = main_py.TextProcessor('_') - encoded_corpus = text_processor.encode(text) - decoded_text = text_processor.decode(encoded_corpus) + text_processor = main_py.TextProcessor('_') + encoded_corpus = text_processor.encode(text) + if isinstance(encoded_corpus, tuple) and encoded_corpus: + # decoded_text = str(text_processor.decode(encoded_corpus)) + + # language_model = main_py.NGramLanguageModel(encoded_corpus, 3) language_model = main_py.NGramLanguageModel(encoded_corpus, 7) - greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) - generated_text = greedy_generator.run(51, 'Vernon') - beam_search_generator = main_py.BeamSearchTextGenerator(language_model,text_processor,3) + # greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) + # generated_text = greedy_generator.run(51, 'Vernon') + + beam_search_generator = main_py.BeamSearchTextGenerator(language_model, text_processor,7) beam_search_generated_text = beam_search_generator.run("Vernon", 56) - result = beam_search_generated_text - assert result + result = beam_search_generated_text + assert result if __name__ == "__main__": diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt index 45a4fb75d..f599e28b8 100644 --- a/lab_3_generate_by_ngrams/target_score.txt +++ b/lab_3_generate_by_ngrams/target_score.txt @@ -1 +1 @@ -8 +10 From 3d6af1376ddab75ddb1d86e4de68ca44583a2fdf Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 30 Nov 2023 21:33:38 +0300 Subject: [PATCH 081/107] import fixing --- lab_3_generate_by_ngrams/main.py | 8 ++++---- lab_3_generate_by_ngrams/start.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 378b61597..a320f3671 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -184,11 +184,11 @@ def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: if decoded_corpus is None: return None - postpocessed_text = self._postprocess_decoded_text(decoded_corpus) - if postpocessed_text is None: + postprocessed_text = self._postprocess_decoded_text(decoded_corpus) + if postprocessed_text is None: return None - return postpocessed_text + return postprocessed_text def fill_from_ngrams(self, content: dict) -> None: """ @@ -495,7 +495,7 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, return [] possible_tokens_list = list(possible_tokens.items()) - sorted_tokens_list = sorted(possible_tokens_list, key=operator.itemgetter(1,0), reverse=True) + sorted_tokens_list = sorted(possible_tokens_list, key=operator.itemgetter(1, 0), reverse=True) best_tokens = sorted_tokens_list[:self._beam_width] return best_tokens diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index 72909973e..342ecb03d 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -1,7 +1,7 @@ """ Generation by NGrams starter """ -import lab_3_generate_by_ngrams.main as main_py +from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, NGramLanguageModel, TextProcessor) def main() -> None: @@ -12,18 +12,18 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - text_processor = main_py.TextProcessor('_') + text_processor = TextProcessor('_') encoded_corpus = text_processor.encode(text) if isinstance(encoded_corpus, tuple) and encoded_corpus: # decoded_text = str(text_processor.decode(encoded_corpus)) # language_model = main_py.NGramLanguageModel(encoded_corpus, 3) - language_model = main_py.NGramLanguageModel(encoded_corpus, 7) + language_model = NGramLanguageModel(encoded_corpus, 7) # greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) # generated_text = greedy_generator.run(51, 'Vernon') - beam_search_generator = main_py.BeamSearchTextGenerator(language_model, text_processor,7) + beam_search_generator = BeamSearchTextGenerator(language_model, text_processor, 7) beam_search_generated_text = beam_search_generator.run("Vernon", 56) result = beam_search_generated_text From 7210c6573df29d2e86a48841f15d71ca4ddb269b Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 30 Nov 2023 21:38:35 +0300 Subject: [PATCH 082/107] import fixing --- lab_3_generate_by_ngrams/start.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index 342ecb03d..24c2b50c1 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -1,7 +1,8 @@ """ Generation by NGrams starter """ -from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, NGramLanguageModel, TextProcessor) +from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, + NGramLanguageModel, TextProcessor) def main() -> None: @@ -12,21 +13,21 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - text_processor = TextProcessor('_') - encoded_corpus = text_processor.encode(text) - if isinstance(encoded_corpus, tuple) and encoded_corpus: - # decoded_text = str(text_processor.decode(encoded_corpus)) - # language_model = main_py.NGramLanguageModel(encoded_corpus, 3) + processor = TextProcessor(end_of_word_token='_') + encoded = processor.encode(text) + if isinstance(encoded, tuple) and encoded: + decoded = str(processor.decode(encoded)) + result = decoded - language_model = NGramLanguageModel(encoded_corpus, 7) - # greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) - # generated_text = greedy_generator.run(51, 'Vernon') + ng_model = NGramLanguageModel(encoded, n_gram_size=3) - beam_search_generator = BeamSearchTextGenerator(language_model, text_processor, 7) - beam_search_generated_text = beam_search_generator.run("Vernon", 56) + model_6 = NGramLanguageModel(encoded, 7) + greedy_text_generator = GreedyTextGenerator(model_6, processor) - result = beam_search_generated_text + beam_search_generator = BeamSearchTextGenerator(model_6, processor, 7) + print(beam_search_generator.run('Vernon', 56)) + result = beam_search_generator.run('Vernon', 56) assert result From a7a14e56cb81c704620cd7cf648d01fe85fd9c24 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 30 Nov 2023 21:52:17 +0300 Subject: [PATCH 083/107] start.py fixing --- lab_3_generate_by_ngrams/start.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index 24c2b50c1..e081b9334 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -27,7 +27,7 @@ def main() -> None: beam_search_generator = BeamSearchTextGenerator(model_6, processor, 7) print(beam_search_generator.run('Vernon', 56)) - result = beam_search_generator.run('Vernon', 56) + # result = beam_search_generator.run('Vernon', 56) assert result From 362c9e39e8e578cadb4b0d3d030e0e02ab0a43a6 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 30 Nov 2023 22:01:59 +0300 Subject: [PATCH 084/107] code style fixing --- lab_3_generate_by_ngrams/main.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index a320f3671..3baf38417 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -130,7 +130,7 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ - if not isinstance(text, str) or not len(text): + if not isinstance(text, str) or not text: return None tokenized_text = self._tokenize(text) @@ -205,6 +205,8 @@ def fill_from_ngrams(self, content: dict) -> None: if element.isalpha(): self._put(element.lower()) + return None + def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: """ Decode sentence by replacing ids with corresponding letters. @@ -308,6 +310,7 @@ def set_n_grams(self, frequencies: dict) -> None: return None self._n_gram_frequencies.update(frequencies) + return None def build(self) -> int: """ @@ -495,7 +498,8 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, return [] possible_tokens_list = list(possible_tokens.items()) - sorted_tokens_list = sorted(possible_tokens_list, key=operator.itemgetter(1, 0), reverse=True) + sorted_tokens_list = sorted(possible_tokens_list, + key=operator.itemgetter(1, 0), reverse=True) best_tokens = sorted_tokens_list[:self._beam_width] return best_tokens @@ -614,7 +618,8 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ - if not isinstance(seq_len, int) or seq_len <= 0 or not isinstance(prompt, str) or not prompt: + if (not isinstance(seq_len, int) or seq_len <= 0 or + not isinstance(prompt, str) or not prompt): return None encoded_prompt = self._text_processor.encode(prompt) @@ -633,9 +638,11 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: if not next_tokens: return None - possible_sequences = self.beam_searcher.continue_sequence(sequence, next_tokens, updated_candidates) + possible_sequences = self.beam_searcher.continue_sequence(sequence, + next_tokens, updated_candidates) if not possible_sequences: - return self._text_processor.decode(sorted(tuple(updated_candidates), key=lambda x: x[1])[0]) + return self._text_processor.decode(sorted(tuple(updated_candidates), + key=lambda x: x[1])[0]) not_sorted_candidates.update(possible_sequences) @@ -776,7 +783,8 @@ def __init__( language_models (tuple[NGramLanguageModel]): Language models to use for text generation text_processor (TextProcessor): A TextProcessor instance to handle text processing """ - self._language_models = {lang_model.get_n_gram_size(): lang_model for lang_model in language_models} + self._language_models = {lang_model.get_n_gram_size(): lang_model + for lang_model in language_models} self._text_processor = text_processor def run(self, seq_len: int, prompt: str) -> Optional[str]: From 0ac0542c814c4f5536ec89975c468c4e8cabd763 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 30 Nov 2023 22:11:28 +0300 Subject: [PATCH 085/107] code style fixing --- lab_3_generate_by_ngrams/main.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 3baf38417..f71fd60a5 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -639,7 +639,8 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: return None possible_sequences = self.beam_searcher.continue_sequence(sequence, - next_tokens, updated_candidates) + next_tokens, + updated_candidates) if not possible_sequences: return self._text_processor.decode(sorted(tuple(updated_candidates), key=lambda x: x[1])[0]) @@ -738,15 +739,15 @@ def load(self, n_gram_size: int) -> Optional[NGramLanguageModel]: frequencies[tuple(encoded)] += self._content['freq'][key] right_ngrams = {} - for ngram in frequencies: - if len(ngram) == n_gram_size: - abs_freq = frequencies[ngram] + for key, value in frequencies.items(): + if len(key) == n_gram_size: + abs_freq = value rel_freq = 0 - for ngram_to_compare in frequencies: - if ngram_to_compare[:n_gram_size - 1] == ngram[:n_gram_size - 1]: - rel_freq += frequencies[ngram_to_compare] + for ngram_to_compare, frequency in frequencies.items(): + if ngram_to_compare[:n_gram_size - 1] == key[:n_gram_size - 1]: + rel_freq += frequency freq = abs_freq / rel_freq - right_ngrams[ngram] = freq + right_ngrams[key] = freq lang_model = NGramLanguageModel(None, n_gram_size) lang_model.set_n_grams(right_ngrams) @@ -848,7 +849,6 @@ def _get_next_token(self, sequence_to_continue: tuple[int, ...]) -> Optional[dic return None if not candidates: continue - else: - return candidates + return candidates return None From 007e542b0e64d24f0c7ea1a94eac19699100f3ea Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 30 Nov 2023 22:37:10 +0300 Subject: [PATCH 086/107] MyPy check fixing --- lab_3_generate_by_ngrams/main.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index f71fd60a5..cb3777d7c 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -146,7 +146,7 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: return None processed_text.append(self.get_id(token)) - return tuple(processed_text) + return (*processed_text, ) def _put(self, element: str) -> None: """ @@ -231,7 +231,7 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: if decoded_corpus is None or not decoded_corpus: return None - return tuple(decoded_corpus) + return (*decoded_corpus, ) def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional[str]: """ @@ -732,7 +732,9 @@ def load(self, n_gram_size: int) -> Optional[NGramLanguageModel]: if element == ' ': encoded.append(0) elif element.isalpha(): - encoded.append(self._text_processor.get_id(element.lower())) + ident = self._text_processor.get_id(element.lower()) + if isinstance(ident, int): + encoded.append(ident) if tuple(encoded) not in frequencies: frequencies[tuple(encoded)] = 0 From 266740d1b59d82dc588239a3273d4adabfeba2ce Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 30 Nov 2023 22:41:24 +0300 Subject: [PATCH 087/107] MyPy check fixing --- lab_3_generate_by_ngrams/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index cb3777d7c..1e9bec289 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -144,7 +144,8 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: for token in tokenized_text: if self.get_id(token) is None: return None - processed_text.append(self.get_id(token)) + elif isinstance(self.get_id(token), int): + processed_text.append(self.get_id(token)) return (*processed_text, ) @@ -225,7 +226,7 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: decoded_corpus = [] for identifier in corpus: - if self.get_token(identifier) is not None: + if isinstance(self.get_token(identifier), str): decoded_corpus.append(self.get_token(identifier)) if decoded_corpus is None or not decoded_corpus: From daa3a9421dbf318513c89a34c00fb47a9005585f Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 30 Nov 2023 22:48:16 +0300 Subject: [PATCH 088/107] MyPy check fixing --- lab_3_generate_by_ngrams/main.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 1e9bec289..e7964c4df 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -144,10 +144,13 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: for token in tokenized_text: if self.get_id(token) is None: return None - elif isinstance(self.get_id(token), int): - processed_text.append(self.get_id(token)) + processed_text.append(self.get_id(token)) - return (*processed_text, ) + for ident in processed_text: + if not isinstance(ident, int): + return None + + return tuple(processed_text) def _put(self, element: str) -> None: """ @@ -226,12 +229,16 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: decoded_corpus = [] for identifier in corpus: - if isinstance(self.get_token(identifier), str): + if self.get_token(identifier) is not None: decoded_corpus.append(self.get_token(identifier)) if decoded_corpus is None or not decoded_corpus: return None + for token in decoded_corpus: + if not isinstance(token, str): + return None + return (*decoded_corpus, ) def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional[str]: From f82708b97a48694f8190636186cb434628b52d48 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Thu, 30 Nov 2023 23:02:09 +0300 Subject: [PATCH 089/107] MyPy check fixing --- lab_3_generate_by_ngrams/main.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index e7964c4df..6eb3d184e 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -134,21 +134,16 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: return None tokenized_text = self._tokenize(text) - if tokenized_text is None: + if tokenized_text is None or not tokenized_text: return None - for token in tokenized_text: - self._put(token) - processed_text = [] for token in tokenized_text: - if self.get_id(token) is None: - return None - processed_text.append(self.get_id(token)) - - for ident in processed_text: - if not isinstance(ident, int): + self._put(token) + identifier = self.get_id(token) + if not isinstance(identifier, int): return None + processed_text.append(identifier) return tuple(processed_text) @@ -229,17 +224,14 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: decoded_corpus = [] for identifier in corpus: - if self.get_token(identifier) is not None: - decoded_corpus.append(self.get_token(identifier)) - - if decoded_corpus is None or not decoded_corpus: - return None - - for token in decoded_corpus: - if not isinstance(token, str): + if not isinstance(identifier, int): + return None + token = self.get_token(identifier) + if not token: return None + decoded_corpus.append(token) - return (*decoded_corpus, ) + return tuple(decoded_corpus) def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional[str]: """ From c8b445cc65be3dc6d8d1d6c016039ba360190957 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Fri, 1 Dec 2023 12:19:10 +0300 Subject: [PATCH 090/107] fixed --- lab_3_generate_by_ngrams/main.py | 43 ++++++++++++++----------------- lab_3_generate_by_ngrams/start.py | 8 +++--- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 6eb3d184e..653e7723d 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -3,11 +3,11 @@ Beam-search and natural language generation evaluation """ -# pylint:disable=too-few-public-methods -from typing import Optional +import json import math import operator -import json +# pylint:disable=too-few-public-methods +from typing import Optional class TextProcessor: @@ -46,24 +46,21 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ - if not isinstance(text, str): + if not isinstance(text, str) or not text: return None - tokenized_text = [] - for index, element in enumerate(text.lower()): - if element.isalpha(): - tokenized_text.append(element) - elif element.isdigit(): - pass - elif not index: - pass - elif tokenized_text[-1] != self._end_of_word_token: - tokenized_text.append(self._end_of_word_token) + tokens = [] + for token in text.lower(): + if token.isspace() and tokens[-1] != self._end_of_word_token: + tokens.append(self._end_of_word_token) + elif token.isalpha(): + tokens.append(token) + if not text[-1].isalnum() and tokens[-1] != self._end_of_word_token: + tokens.append(self._end_of_word_token) - if not tokenized_text: + if not tokens: return None - - return tuple(tokenized_text) + return tuple(tokens) def get_id(self, element: str) -> Optional[int]: """ @@ -838,7 +835,8 @@ def _get_next_token(self, sequence_to_continue: tuple[int, ...]) -> Optional[dic In case of corrupt input arguments return None. """ - if not isinstance(sequence_to_continue, tuple) or not sequence_to_continue: + if not isinstance(sequence_to_continue, tuple) or not sequence_to_continue or \ + not self._language_models: return None ngram_size_list = list(self._language_models.keys()) @@ -847,10 +845,9 @@ def _get_next_token(self, sequence_to_continue: tuple[int, ...]) -> Optional[dic ngram_size_list.sort(reverse=True) for ngram_size in ngram_size_list: candidates = self._language_models[ngram_size].generate_next_token(sequence_to_continue) - if candidates is None: - return None - if not candidates: - continue - return candidates + if candidates is not None and len(candidates) > 0: + probabilities = {token: freq / sum(candidates.values()) + for token, freq in candidates.items()} + return probabilities return None diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index e081b9334..fbb5f03d1 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -20,12 +20,12 @@ def main() -> None: decoded = str(processor.decode(encoded)) result = decoded - ng_model = NGramLanguageModel(encoded, n_gram_size=3) + # ng_model = NGramLanguageModel(encoded, n_gram_size=3) - model_6 = NGramLanguageModel(encoded, 7) - greedy_text_generator = GreedyTextGenerator(model_6, processor) + model = NGramLanguageModel(encoded, 7) + # greedy_text_generator = GreedyTextGenerator(model_6, processor) - beam_search_generator = BeamSearchTextGenerator(model_6, processor, 7) + beam_search_generator = BeamSearchTextGenerator(model, processor, 7) print(beam_search_generator.run('Vernon', 56)) # result = beam_search_generator.run('Vernon', 56) assert result From ff7301d19fe96150071263c85e2606ac1c7d3026 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Fri, 1 Dec 2023 12:21:59 +0300 Subject: [PATCH 091/107] fixed --- lab_3_generate_by_ngrams/start.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index fbb5f03d1..713c9ebdb 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -1,7 +1,7 @@ """ Generation by NGrams starter """ -from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, +from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, NGramLanguageModel, TextProcessor) From a42431b3cdd15d4c9c28ddd791643fa0dac51f1c Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Fri, 1 Dec 2023 12:24:07 +0300 Subject: [PATCH 092/107] fixed --- lab_3_generate_by_ngrams/start.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index 713c9ebdb..826fe2720 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -1,7 +1,7 @@ """ Generation by NGrams starter """ -from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, +from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) @@ -20,10 +20,10 @@ def main() -> None: decoded = str(processor.decode(encoded)) result = decoded - # ng_model = NGramLanguageModel(encoded, n_gram_size=3) + ng_model = NGramLanguageModel(encoded, n_gram_size=3) model = NGramLanguageModel(encoded, 7) - # greedy_text_generator = GreedyTextGenerator(model_6, processor) + greedy_text_generator = GreedyTextGenerator(model_6, processor) beam_search_generator = BeamSearchTextGenerator(model, processor, 7) print(beam_search_generator.run('Vernon', 56)) From 72c0ae3bb1ba8c3d1b257d8345dcad187446bad1 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Fri, 1 Dec 2023 12:27:38 +0300 Subject: [PATCH 093/107] fixed --- lab_3_generate_by_ngrams/start.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index 826fe2720..e081b9334 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -22,10 +22,10 @@ def main() -> None: ng_model = NGramLanguageModel(encoded, n_gram_size=3) - model = NGramLanguageModel(encoded, 7) + model_6 = NGramLanguageModel(encoded, 7) greedy_text_generator = GreedyTextGenerator(model_6, processor) - beam_search_generator = BeamSearchTextGenerator(model, processor, 7) + beam_search_generator = BeamSearchTextGenerator(model_6, processor, 7) print(beam_search_generator.run('Vernon', 56)) # result = beam_search_generator.run('Vernon', 56) assert result From 6559d6b8837e2b215d27d879891135489ae8c5ad Mon Sep 17 00:00:00 2001 From: Artyom Tugaryov Date: Sat, 2 Dec 2023 18:11:56 +0300 Subject: [PATCH 094/107] Merge with main --- lab_3_generate_by_ngrams/main.py | 435 +++++++++++++++--------------- lab_3_generate_by_ngrams/start.py | 22 +- 2 files changed, 229 insertions(+), 228 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 653e7723d..b0d43684c 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -3,10 +3,9 @@ Beam-search and natural language generation evaluation """ +# pylint:disable=too-few-public-methods import json import math -import operator -# pylint:disable=too-few-public-methods from typing import Optional @@ -27,7 +26,7 @@ def __init__(self, end_of_word_token: str) -> None: end_of_word_token (str): A token denoting word boundary """ self._end_of_word_token = end_of_word_token - self._storage = {self._end_of_word_token: 0} + self._storage = {end_of_word_token: 0} def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: """ @@ -46,20 +45,23 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ - if not isinstance(text, str) or not text: + if not isinstance(text, str) or len(text) == 0: return None + text_words = text.lower().split() tokens = [] - for token in text.lower(): - if token.isspace() and tokens[-1] != self._end_of_word_token: + for word in text_words: + word_tokens = [alpha for alpha in word if alpha.isalpha()] + tokens.extend(word_tokens) + if word_tokens: tokens.append(self._end_of_word_token) - elif token.isalpha(): - tokens.append(token) - if not text[-1].isalnum() and tokens[-1] != self._end_of_word_token: - tokens.append(self._end_of_word_token) if not tokens: return None + + if text[-1].isdigit() or text[-1].isalpha(): + tokens.pop(-1) + return tuple(tokens) def get_id(self, element: str) -> Optional[int]: @@ -75,7 +77,7 @@ def get_id(self, element: str) -> Optional[int]: In case of corrupt input arguments or arguments not included in storage, None is returned """ - if not isinstance(element, str) or element not in self._storage: + if not (isinstance(element, str) and element in self._storage): return None return self._storage[element] @@ -104,12 +106,12 @@ def get_token(self, element_id: int) -> Optional[str]: if not isinstance(element_id, int): return None - inv_storage = {identifier: element for element, identifier in self._storage.items()} + filtered_items = filter(lambda item: item[1] == element_id, self._storage.items()) + token = next(filtered_items, None) + if token: + return token[0] - if element_id not in inv_storage: - return None - - return inv_storage[element_id] + return None def encode(self, text: str) -> Optional[tuple[int, ...]]: """ @@ -127,22 +129,22 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ - if not isinstance(text, str) or not text: + if not (isinstance(text, str) and text): return None tokenized_text = self._tokenize(text) - if tokenized_text is None or not tokenized_text: + if not tokenized_text: return None - processed_text = [] + encoded_corpus = [] for token in tokenized_text: self._put(token) - identifier = self.get_id(token) - if not isinstance(identifier, int): + token_id = self.get_id(token) + if not isinstance(token_id, int): return None - processed_text.append(identifier) + encoded_corpus.append(token_id) - return tuple(processed_text) + return tuple(encoded_corpus) def _put(self, element: str) -> None: """ @@ -154,8 +156,15 @@ def _put(self, element: str) -> None: In case of corrupt input arguments or invalid argument length, an element is not added to storage """ - if isinstance(element, str) and len(element) == 1 and element not in self._storage: - self._storage[element] = len(self._storage) + if not isinstance(element, str) or len(element) != 1: + return None + + if element in self._storage: + return None + + self._storage[element] = len(self._storage) + + return None def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: """ @@ -173,18 +182,18 @@ def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ - if not isinstance(encoded_corpus, tuple) or not encoded_corpus: + if not isinstance(encoded_corpus, tuple): return None decoded_corpus = self._decode(encoded_corpus) - if decoded_corpus is None: + if not decoded_corpus: return None - postprocessed_text = self._postprocess_decoded_text(decoded_corpus) - if postprocessed_text is None: + decoded_text = self._postprocess_decoded_text(decoded_corpus) + if not decoded_text: return None - return postprocessed_text + return decoded_text def fill_from_ngrams(self, content: dict) -> None: """ @@ -193,13 +202,12 @@ def fill_from_ngrams(self, content: dict) -> None: Args: content (dict): ngrams from external JSON """ - if not isinstance(content, dict) or not content: + if not isinstance(content, dict) or len(content) == 0: return None - for key in content['freq']: - for element in key: - if element.isalpha(): - self._put(element.lower()) + for token in (char for n_gram in content['freq'] + for char in n_gram.lower() if char.isalpha()): + self._put(token) return None @@ -216,14 +224,12 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ - if not isinstance(corpus, tuple) or not corpus: + if not isinstance(corpus, tuple) or len(corpus) == 0: return None decoded_corpus = [] - for identifier in corpus: - if not isinstance(identifier, int): - return None - token = self.get_token(identifier) + for ident in corpus: + token = self.get_token(ident) if not token: return None decoded_corpus.append(token) @@ -245,24 +251,15 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional In case of corrupt input arguments, None is returned """ - if not isinstance(decoded_corpus, tuple) or not decoded_corpus: + if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0: return None - postprocessed_text = '' - for index, token in enumerate(decoded_corpus): - if not index: - postprocessed_text += token.upper() - elif token == self._end_of_word_token: - if index == len(decoded_corpus) - 1: - postprocessed_text += '.' - else: - postprocessed_text += ' ' - else: - postprocessed_text += token - if postprocessed_text[-1] != '.': - postprocessed_text += '.' + decoded_text = ''.join(decoded_corpus).replace('_', ' ').capitalize() + + if decoded_text[-1] == ' ': + return f"{decoded_text[:-1]}." - return postprocessed_text + return f"{decoded_text}." class NGramLanguageModel: @@ -303,10 +300,11 @@ def set_n_grams(self, frequencies: dict) -> None: Args: frequencies (dict): Computed in advance frequencies for n-grams """ - if not isinstance(frequencies, dict) or not frequencies: + if not isinstance(frequencies, dict) or len(frequencies) == 0: return None - self._n_gram_frequencies.update(frequencies) + self._n_gram_frequencies = frequencies + return None def build(self) -> int: @@ -321,23 +319,24 @@ def build(self) -> int: In case of corrupt input arguments or methods used return None, 1 is returned """ - if not isinstance(self._encoded_corpus, tuple) or not self._encoded_corpus: + if not isinstance(self._encoded_corpus, tuple) or len(self._encoded_corpus) == 0: return 1 n_grams = self._extract_n_grams(self._encoded_corpus) - if n_grams is None: + if not n_grams or not isinstance(n_grams, tuple): return 1 - for n_gram in n_grams: - abs_freq = n_grams.count(n_gram) - rel_freq = 0 - for n_gram_to_compare in n_grams: - if n_gram_to_compare[:self._n_gram_size - 1] == n_gram[:self._n_gram_size - 1]: - rel_freq += 1 - self._n_gram_frequencies[n_gram] = abs_freq / rel_freq + context_freq_dict = {} - if not self._n_gram_frequencies: - return 1 + for n_gram in set(n_grams): + absolute_freq = n_grams.count(n_gram) + context = n_gram[:-1] + if context not in context_freq_dict: + context_freq_dict[context] = [n_gram_1[:-1] == context + for n_gram_1 in n_grams].count(True) + context_freq = context_freq_dict[context] + + self._n_gram_frequencies[n_gram] = absolute_freq / context_freq return 0 @@ -353,20 +352,22 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: In case of corrupt input arguments, None is returned """ - if not isinstance(sequence, tuple) or not sequence or len(sequence) < self._n_gram_size - 1: + if (not isinstance(sequence, tuple) or len(sequence) == 0 + or len(sequence) < self._n_gram_size - 1): return None - possible_tokens = {} - - context = sequence[-(self._n_gram_size - 1):] + token_frequencies = {} - sort_data = dict(sorted(self._n_gram_frequencies.items(), key=lambda x: (x[1], list(x[0])))) + context_size = self._n_gram_size - 1 + context = sequence[-context_size:] - for n_gram, freq in sort_data.items(): - if n_gram[:self._n_gram_size - 1] == context: - possible_tokens[n_gram[-1]] = freq + for n_gram, freq in self._n_gram_frequencies.items(): + if n_gram[:-1] == context: + token = n_gram[-1] + if token not in token_frequencies: + token_frequencies[token] = freq - return possible_tokens + return token_frequencies def _extract_n_grams( self, encoded_corpus: tuple[int, ...] @@ -382,10 +383,16 @@ def _extract_n_grams( In case of corrupt input arguments, None is returned """ - if not isinstance(encoded_corpus, tuple) or not encoded_corpus: + if not isinstance(encoded_corpus, tuple) or len(encoded_corpus) == 0: return None - return tuple(encoded_corpus[index:index + self._n_gram_size] - for index in range(len(encoded_corpus) - 1)) + + n_gram_size = self._n_gram_size + n_grams = [] + for i in range(len(encoded_corpus) - n_gram_size + 1): + n_gram = tuple(encoded_corpus[i:i + n_gram_size]) + n_grams.append(n_gram) + + return tuple(n_grams) class GreedyTextGenerator: @@ -422,28 +429,31 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ - if not isinstance(seq_len, int) or not isinstance(prompt, str) or not prompt: + if not (isinstance(seq_len, int) and isinstance(prompt, str)) or len(prompt) == 0: return None + encoded_prompt = self._text_processor.encode(prompt) n_gram_size = self._model.get_n_gram_size() - encoded = self._text_processor.encode(prompt) - if not encoded or not n_gram_size: + if not (encoded_prompt and n_gram_size): return None - max_freq = [] - for _ in range(seq_len): - tokens = self._model.generate_next_token(encoded[-n_gram_size + 1:]) + while seq_len > 0: + tokens = self._model.generate_next_token(encoded_prompt[-n_gram_size + 1:]) if not tokens: break - max_freq.append(max(tokens.values())) - candidates_max = filter(lambda token_freq: token_freq[1] == max_freq[-1], - tokens.items()) - encoded += (sorted(candidates_max)[0][0],) - text = self._text_processor.decode(encoded) - if not text: + max_freq = max(tokens.values()) + max_freq_tokens = [token for token, freq in tokens.items() if freq == max_freq] + max_freq_tokens = sorted(max_freq_tokens) + encoded_prompt += (max_freq_tokens[0],) + + seq_len -= 1 + + decoded_text = self._text_processor.decode(encoded_prompt) + if not decoded_text: return None - return text + + return decoded_text class BeamSearcher: @@ -488,18 +498,14 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, if not isinstance(sequence, tuple) or not sequence: return None - possible_tokens = self._model.generate_next_token(sequence) - if possible_tokens is None: + tokens = self._model.generate_next_token(sequence) + if tokens is None: return None - if not possible_tokens: + if not tokens: return [] - possible_tokens_list = list(possible_tokens.items()) - sorted_tokens_list = sorted(possible_tokens_list, - key=operator.itemgetter(1, 0), reverse=True) - best_tokens = sorted_tokens_list[:self._beam_width] - - return best_tokens + return sorted([(token, float(freq)) for token, freq in tokens.items()], + key=lambda pair: pair[1], reverse=True)[:self._beam_width] def continue_sequence( self, @@ -522,29 +528,20 @@ def continue_sequence( In case of corrupt input arguments or unexpected behaviour of methods used return None. """ - if not isinstance(sequence, tuple) or not isinstance(next_tokens, list) \ - or not isinstance(sequence_candidates, dict): - return None - if not sequence or not next_tokens or not sequence_candidates: - return None - if sequence not in sequence_candidates: - return None - if len(next_tokens) > self._beam_width: + if not (isinstance(sequence, tuple) and isinstance(next_tokens, list) + and isinstance(sequence_candidates, dict) and sequence + and next_tokens and sequence_candidates and len(next_tokens) <= self._beam_width + and sequence in sequence_candidates): return None - copy_seq_candidates = sequence_candidates.copy() - list_sequence = list(sequence) - for token in next_tokens: - list_sequence.append(token[0]) - possible_seq = tuple(list_sequence) - freq = sequence_candidates[sequence] - math.log(token[-1]) - copy_seq_candidates[possible_seq] = freq - list_sequence = list_sequence[:-1] + new_sequence = sequence + (token[0],) + new_freq = sequence_candidates[sequence] - math.log(token[1]) + sequence_candidates[new_sequence] = new_freq - copy_seq_candidates.pop(sequence) + del sequence_candidates[sequence] - return copy_seq_candidates + return sequence_candidates def prune_sequence_candidates( self, sequence_candidates: dict[tuple[int, ...], float] @@ -560,15 +557,11 @@ def prune_sequence_candidates( In case of corrupt input arguments return None. """ - if not isinstance(sequence_candidates, dict) or not sequence_candidates: + if not (isinstance(sequence_candidates, dict) and sequence_candidates): return None - sorted_sequences = sorted(sequence_candidates.items(), key=operator.itemgetter(1, 0)) - result = {} - for sequence in sorted_sequences[:self._beam_width]: - result[sequence[0]] = sequence[1] - - return result + return dict(sorted(list(sequence_candidates.items()), + key=lambda x: x[1])[:self._beam_width]) class BeamSearchTextGenerator: @@ -598,8 +591,8 @@ def __init__( """ self._text_processor = text_processor self._beam_width = beam_width + self.beam_searcher = BeamSearcher(self._beam_width, language_model) self._language_model = language_model - self.beam_searcher = BeamSearcher(self._beam_width, self._language_model) def run(self, prompt: str, seq_len: int) -> Optional[str]: """ @@ -615,46 +608,42 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ - if (not isinstance(seq_len, int) or seq_len <= 0 or - not isinstance(prompt, str) or not prompt): + if not (isinstance(prompt, str) and isinstance(seq_len, int) and prompt and seq_len): return None encoded_prompt = self._text_processor.encode(prompt) - if encoded_prompt is None: + if not encoded_prompt: return None - candidates = {encoded_prompt: 0.0} - updated_candidates = candidates.copy() + sequence_candidates = {encoded_prompt: 0.0} - for _ in range(seq_len): - candidates = updated_candidates - not_sorted_candidates = {} + for i in range(seq_len): + new_sequence_candidates = sequence_candidates.copy() - for sequence in candidates: - next_tokens = self._get_next_token(sequence) - if not next_tokens: + for sequence in sequence_candidates: + tokens = self._get_next_token(sequence) + if not tokens: return None - possible_sequences = self.beam_searcher.continue_sequence(sequence, - next_tokens, - updated_candidates) - if not possible_sequences: - return self._text_processor.decode(sorted(tuple(updated_candidates), + continuation_candidates = self.beam_searcher.continue_sequence( + sequence, tokens, new_sequence_candidates) + if not continuation_candidates: + return self._text_processor.decode(sorted(tuple(sequence_candidates), key=lambda x: x[1])[0]) - not_sorted_candidates.update(possible_sequences) + if not new_sequence_candidates: + break - for candidate in candidates: - if candidate in not_sorted_candidates: - del not_sorted_candidates[candidate] + best_sequence_candidates = self.beam_searcher.prune_sequence_candidates( + new_sequence_candidates) - sorted_candidates = self.beam_searcher.prune_sequence_candidates(not_sorted_candidates) - if not sorted_candidates: + if not best_sequence_candidates: return None + sequence_candidates = best_sequence_candidates - updated_candidates = sorted_candidates - - return self._text_processor.decode(sorted(tuple(updated_candidates), key=lambda x: x[1])[0]) + decoded = self._text_processor.decode(min(sequence_candidates, + key=lambda x: sequence_candidates[x])) + return decoded def _get_next_token( self, sequence_to_continue: tuple[int, ...] @@ -671,10 +660,14 @@ def _get_next_token( In case of corrupt input arguments return None. """ - if not isinstance(sequence_to_continue, tuple) or not sequence_to_continue: + if not isinstance(sequence_to_continue, tuple) or len(sequence_to_continue) == 0: return None - return self.beam_searcher.get_next_token(sequence_to_continue) + tokens = self.beam_searcher.get_next_token(sequence_to_continue) + if not tokens: + return None + + return tokens class NGramLanguageModelReader: @@ -697,11 +690,9 @@ def __init__(self, json_path: str, eow_token: str) -> None: """ self._json_path = json_path self._eow_token = eow_token + with open(json_path, 'r', encoding="utf-8") as file: + self._content = json.load(file) self._text_processor = TextProcessor(self._eow_token) - - with open(self._json_path, 'r', encoding='utf-8') as file: - json_text = json.load(file) - self._content = json_text self._text_processor.fill_from_ngrams(self._content) def load(self, n_gram_size: int) -> Optional[NGramLanguageModel]: @@ -721,36 +712,33 @@ def load(self, n_gram_size: int) -> Optional[NGramLanguageModel]: """ if not isinstance(n_gram_size, int) or not n_gram_size or n_gram_size < 2: return None - - frequencies = {} - for key in self._content['freq']: + n_grams = {} + for n_gram in self._content['freq']: encoded = [] - for element in key: - if element == ' ': + for token in n_gram: + if token.isspace(): encoded.append(0) - elif element.isalpha(): - ident = self._text_processor.get_id(element.lower()) - if isinstance(ident, int): - encoded.append(ident) - - if tuple(encoded) not in frequencies: - frequencies[tuple(encoded)] = 0 - frequencies[tuple(encoded)] += self._content['freq'][key] - - right_ngrams = {} - for key, value in frequencies.items(): - if len(key) == n_gram_size: - abs_freq = value - rel_freq = 0 - for ngram_to_compare, frequency in frequencies.items(): - if ngram_to_compare[:n_gram_size - 1] == key[:n_gram_size - 1]: - rel_freq += frequency - freq = abs_freq / rel_freq - right_ngrams[key] = freq - - lang_model = NGramLanguageModel(None, n_gram_size) - lang_model.set_n_grams(right_ngrams) - return lang_model + elif token.isalpha(): + token_id = self._text_processor.get_id(token.lower()) + if not token_id: + continue + encoded.append(token_id) + + if tuple(encoded) not in n_grams: + n_grams[tuple(encoded)] = 0.0 + n_grams[tuple(encoded)] += self._content['freq'][n_gram] + + n_grams_cleared_by_size = {} + for n_gram, freq in n_grams.items(): + if len(n_gram) == n_gram_size and isinstance(n_gram, tuple): + same_context = [context_freq for context, context_freq in n_grams.items() + if context[-n_gram_size:-1] == n_gram[-n_gram_size:-1]] + n_grams_cleared_by_size[n_gram] = freq / sum(same_context) + + n_gram_model = NGramLanguageModel(None, n_gram_size) + n_gram_model.set_n_grams(n_grams_cleared_by_size) + + return n_gram_model def get_text_processor(self) -> TextProcessor: # type: ignore[empty-body] """ @@ -783,8 +771,7 @@ def __init__( language_models (tuple[NGramLanguageModel]): Language models to use for text generation text_processor (TextProcessor): A TextProcessor instance to handle text processing """ - self._language_models = {lang_model.get_n_gram_size(): lang_model - for lang_model in language_models} + self._language_models = {model.get_n_gram_size(): model for model in language_models} self._text_processor = text_processor def run(self, seq_len: int, prompt: str) -> Optional[str]: @@ -801,26 +788,39 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ - if not isinstance(seq_len, int) or not isinstance(prompt, str) or not prompt: + if not ( + isinstance(seq_len, int) and isinstance(prompt, str) and prompt + ): return None - encoded_sequence = self._text_processor.encode(prompt) - if not encoded_sequence: + encoded_prompt = self._text_processor.encode(prompt) + + if encoded_prompt is None: return None - for _ in range(seq_len): - candidates = self._get_next_token(encoded_sequence) - if not candidates: + + iteration = 1 + generated_sequence = list(encoded_prompt) + while iteration <= seq_len: + next_token_candidates = None + for n_gram_size in sorted(self._language_models.keys(), reverse=True): + next_token_candidates = self._get_next_token( + tuple(generated_sequence[-(n_gram_size - 1):])) + if next_token_candidates is not None and len(next_token_candidates) > 0: + break + + if next_token_candidates is None or len(next_token_candidates) == 0: break - max_probability = max(candidates.values()) - best_candidates = [] - for element, freq in candidates.items(): - if freq == max_probability: - best_candidates.append(element) - encoded_sequence += (best_candidates[0],) - decoded = self._text_processor.decode(encoded_sequence) - if not decoded: - return None - return decoded + + max_prob = max(next_token_candidates.values()) + max_probability_token = [token for token, prob in next_token_candidates.items() + if prob == max_prob] + generated_sequence.append(max_probability_token[0]) + + iteration += 1 + + decoded_sequence = self._text_processor.decode(tuple(generated_sequence)) + + return decoded_sequence def _get_next_token(self, sequence_to_continue: tuple[int, ...]) -> Optional[dict[int, float]]: """ @@ -835,19 +835,20 @@ def _get_next_token(self, sequence_to_continue: tuple[int, ...]) -> Optional[dic In case of corrupt input arguments return None. """ - if not isinstance(sequence_to_continue, tuple) or not sequence_to_continue or \ - not self._language_models: + if not (isinstance(sequence_to_continue, tuple) and sequence_to_continue + and self._language_models): return None - ngram_size_list = list(self._language_models.keys()) - if not ngram_size_list: - return None - ngram_size_list.sort(reverse=True) - for ngram_size in ngram_size_list: - candidates = self._language_models[ngram_size].generate_next_token(sequence_to_continue) - if candidates is not None and len(candidates) > 0: - probabilities = {token: freq / sum(candidates.values()) - for token, freq in candidates.items()} - return probabilities + n_gram_sizes = sorted(self._language_models.keys(), reverse=True) + + for n_gram_size in n_gram_sizes: + n_gram_model = self._language_models[n_gram_size] + + token_candidates = n_gram_model.generate_next_token(sequence_to_continue) + + if token_candidates is not None and len(token_candidates) > 0: + token_probabilities = {token: freq / sum(token_candidates.values()) + for token, freq in token_candidates.items()} + return token_probabilities return None diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index e081b9334..a4ec25e0f 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -13,22 +13,22 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - processor = TextProcessor(end_of_word_token='_') encoded = processor.encode(text) - if isinstance(encoded, tuple) and encoded: - decoded = str(processor.decode(encoded)) - result = decoded + if not(isinstance(encoded, tuple) and encoded): + return - ng_model = NGramLanguageModel(encoded, n_gram_size=3) + decoded = str(processor.decode(encoded)) + result = decoded - model_6 = NGramLanguageModel(encoded, 7) - greedy_text_generator = GreedyTextGenerator(model_6, processor) + n_gram_model = NGramLanguageModel(encoded[:100], n_gram_size=3) + model_7 = NGramLanguageModel(encoded, 7) + greedy_text_generator = GreedyTextGenerator(model_7, processor) + print(greedy_text_generator.run(51, 'Vernon')) - beam_search_generator = BeamSearchTextGenerator(model_6, processor, 7) - print(beam_search_generator.run('Vernon', 56)) - # result = beam_search_generator.run('Vernon', 56) - assert result + beam_search_generator = BeamSearchTextGenerator(model_7, processor, 7) + print(beam_search_generator.run('Vernon', 56)) + assert result if __name__ == "__main__": From bdb05b4c660ccd5666a1f23812bd3314cde31b1c Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Tue, 5 Dec 2023 17:58:15 +0300 Subject: [PATCH 095/107] steps 1-3 done --- lab_4_fill_words_by_ngrams/main.py | 78 ++++++++++++++++++++- lab_4_fill_words_by_ngrams/start.py | 8 ++- lab_4_fill_words_by_ngrams/target_score.txt | 2 +- 3 files changed, 83 insertions(+), 5 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index b739ae182..5693fe574 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -6,6 +6,8 @@ # pylint:disable=too-few-public-methods, too-many-arguments from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) +from string import punctuation +from random import choice class WordProcessor(TextProcessor): @@ -28,6 +30,20 @@ def _tokenize(self, text: str) -> tuple[str, ...]: # type: ignore Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ + if not isinstance(text, str) or not text: + raise ValueError('Type input is inappropriate or input argument is empty.') + preprocessed_text = '' + for element in text.lower(): + if element in '?!.': + preprocessed_text += ' ' + preprocessed_text += self.get_end_of_word_token() + elif element.isdigit() or (element.isspace() and preprocessed_text[-1].isspace()) \ + or element in punctuation: + pass + elif element.isalpha() or element.isspace(): + preprocessed_text += element + + return tuple(preprocessed_text.split(' ')) def _put(self, element: str) -> None: """ @@ -39,6 +55,11 @@ def _put(self, element: str) -> None: Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ + if not isinstance(element, str): + raise ValueError('Type input is inappropriate.') + if not element: + raise ValueError('Input argument is empty.') + self._storage[element] = len(self._storage) def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # type: ignore """ @@ -56,6 +77,25 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ + if not isinstance(decoded_corpus, tuple) or not decoded_corpus: + raise ValueError('Type input is inappropriate or input argument is empty.') + result = '' + for word in decoded_corpus: + if word == self.get_end_of_word_token(): + result = result[:-1] + result += '.' + else: + for letter in word: + if not result or (len(result) > 2 and result[-2] == '.'): + result += letter.upper() + else: + result += letter + result += ' ' + if result[-1] == ' ': + result = result[:-1] + if result[-1] != '.': + result += '.' + return result class TopPGenerator: @@ -69,7 +109,7 @@ class TopPGenerator: """ def __init__( - self, language_model: NGramLanguageModel, word_processor: WordProcessor, p_value: float + self, language_model: NGramLanguageModel, word_processor: WordProcessor, p_value: float ) -> None: """ Initialize an instance of TopPGenerator. @@ -80,6 +120,9 @@ def __init__( word_processor (WordProcessor): WordProcessor instance to handle text processing p_value (float): Collective probability mass threshold """ + self._model = language_model + self._word_processor = word_processor + self._p_value = p_value def run(self, seq_len: int, prompt: str) -> str: # type: ignore """ @@ -98,6 +141,35 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore or if sequence has inappropriate length, or if methods used return None. """ + if not isinstance(seq_len, int) or seq_len < 0 \ + or not isinstance(prompt, str) or not prompt: + raise ValueError('Type input is inappropriate or input argument is empty.') + encoded_prompt = self._word_processor.encode(prompt) + if encoded_prompt is None: + raise ValueError('None is returned') + encoded_list = list(encoded_prompt) + for _ in range(seq_len): + candidates = self._model.generate_next_token(encoded_prompt) + if candidates is None: + raise ValueError('None is returned') + if not candidates: + break + tuple_candidates = tuple(candidates.items()) + sorted_candidates = sorted(tuple_candidates, key=lambda tup: (tup[0], -tup[1])) + sum_freq = 0 + num_candidates = 0 + for candidate in sorted_candidates: + if sum_freq >= self._p_value: + break + sum_freq += candidate[1] + num_candidates += 1 + rand_token = choice(sorted_candidates[:num_candidates])[0] + encoded_list.append(rand_token) + encoded_prompt = tuple(encoded_list) + decoded = self._word_processor.decode(encoded_prompt) + if decoded is None: + raise ValueError('None is returned') + return decoded class GeneratorTypes: @@ -192,7 +264,7 @@ class QualityChecker: """ def __init__( - self, generators: dict, language_model: NGramLanguageModel, word_processor: WordProcessor + self, generators: dict, language_model: NGramLanguageModel, word_processor: WordProcessor ) -> None: """ Initialize an instance of QualityChecker. @@ -307,7 +379,7 @@ class GeneratorRuleStudent: _generator_type: int def __init__( - self, generator_type: int, language_model: NGramLanguageModel, word_processor: WordProcessor + self, generator_type: int, language_model: NGramLanguageModel, word_processor: WordProcessor ) -> None: """ Initialize an instance of GeneratorRuleStudent. diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index c41386377..df9b49246 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,6 +2,8 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import +from lab_4_fill_words_by_ngrams.main import (NGramLanguageModel, TopPGenerator, + WordProcessor) def main() -> None: @@ -10,7 +12,11 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - result = None + word_processor = WordProcessor('') + encoded_text = word_processor.encode(text) + lang_model = NGramLanguageModel(encoded_text, 2) + top_p_generator = TopPGenerator(lang_model, word_processor, 0.5) + result = top_p_generator.run(51, 'Vernon') assert result diff --git a/lab_4_fill_words_by_ngrams/target_score.txt b/lab_4_fill_words_by_ngrams/target_score.txt index 573541ac9..1e8b31496 100644 --- a/lab_4_fill_words_by_ngrams/target_score.txt +++ b/lab_4_fill_words_by_ngrams/target_score.txt @@ -1 +1 @@ -0 +6 From a491f5bb9cad416d6781520d79dba11031e62659 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Fri, 8 Dec 2023 11:12:43 +0300 Subject: [PATCH 096/107] step 4 done --- lab_4_fill_words_by_ngrams/main.py | 82 +++++++++++++++++++++++++++-- lab_4_fill_words_by_ngrams/start.py | 14 +++-- 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 5693fe574..c14b14b13 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -8,6 +8,7 @@ NGramLanguageModel, TextProcessor) from string import punctuation from random import choice +import math class WordProcessor(TextProcessor): @@ -32,13 +33,15 @@ def _tokenize(self, text: str) -> tuple[str, ...]: # type: ignore """ if not isinstance(text, str) or not text: raise ValueError('Type input is inappropriate or input argument is empty.') + text = repr(text) + text = text.replace('\\n', ' ') preprocessed_text = '' for element in text.lower(): if element in '?!.': preprocessed_text += ' ' preprocessed_text += self.get_end_of_word_token() - elif element.isdigit() or (element.isspace() and preprocessed_text[-1].isspace()) \ - or element in punctuation: + elif element.isdigit() or (element.isspace() and len(preprocessed_text) > 1 + and preprocessed_text[-1].isspace()) or element in punctuation: pass elif element.isalpha() or element.isspace(): preprocessed_text += element @@ -59,7 +62,8 @@ def _put(self, element: str) -> None: raise ValueError('Type input is inappropriate.') if not element: raise ValueError('Input argument is empty.') - self._storage[element] = len(self._storage) + if element not in self._storage: + self._storage[element] = len(self._storage) def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # type: ignore """ @@ -155,7 +159,7 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore if not candidates: break tuple_candidates = tuple(candidates.items()) - sorted_candidates = sorted(tuple_candidates, key=lambda tup: (tup[0], -tup[1])) + sorted_candidates = sorted(tuple_candidates, key=lambda tup: (-tup[1], -tup[0])) sum_freq = 0 num_candidates = 0 for candidate in sorted_candidates: @@ -186,6 +190,9 @@ def __init__(self) -> None: """ Initialize an instance of GeneratorTypes. """ + self.greedy = 0 + self.top_p = 1 + self.beam_search = 2 def get_conversion_generator_type(self, generator_type: int) -> str: # type: ignore """ @@ -197,6 +204,12 @@ def get_conversion_generator_type(self, generator_type: int) -> str: # type: ig Returns: (str): Name of the generator. """ + if generator_type == self.greedy: + return 'Greedy Generator' + if generator_type == self.top_p: + return 'Top-P Generator' + if generator_type == self.beam_search: + return 'Beam Search Generator' class GenerationResultDTO: @@ -219,6 +232,9 @@ def __init__(self, text: str, perplexity: float, generation_type: int): generation_type (int): Numeric type of the generator for which perplexity was calculated """ + self.__text = text + self.__perplexity = perplexity + self.__type = generation_type def get_perplexity(self) -> float: # type: ignore """ @@ -227,6 +243,7 @@ def get_perplexity(self) -> float: # type: ignore Returns: (float): Perplexity value """ + return self.__perplexity def get_text(self) -> str: # type: ignore """ @@ -235,6 +252,7 @@ def get_text(self) -> str: # type: ignore Returns: (str): Text for which the perplexity was count """ + return self.__text def get_type(self) -> int: # type: ignore """ @@ -243,6 +261,7 @@ def get_type(self) -> int: # type: ignore Returns: (int): Numeric type of the generator """ + return self.__type def __str__(self) -> str: # type: ignore """ @@ -251,6 +270,10 @@ def __str__(self) -> str: # type: ignore Returns: (str): String with report """ + generator_types = GeneratorTypes() + return (f'Perplexity score: {self.__perplexity}\n' + f'{generator_types.get_conversion_generator_type(self.__type)}\n' + f'Text: {self.__text}\n') class QualityChecker: @@ -275,6 +298,9 @@ def __init__( NGramLanguageModel instance to use for text generation word_processor (WordProcessor): WordProcessor instance to handle text processing """ + self._generators = generators + self._language_model = language_model + self._word_processor = word_processor def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore """ @@ -292,6 +318,29 @@ def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore or if methods used return None, or if nothing was generated. """ + if not generated_text: + raise ValueError('Input argument is empty') + if not isinstance(generated_text, str): + raise ValueError('Inappropriate type argument') + encoded_text = self._word_processor.encode(generated_text) + if encoded_text is None: + raise ValueError('self._word_processor.encode() returned None') + self._language_model.build() + # freq_dict = self._language_model._n_gram_frequencies + l_sum = 0 + num_ngrams = 0 + # ngrams = self._language_model._extract_n_grams(encoded_text) + ngrams = self._language_model.generate_next_token(encoded_text) + if ngrams is None: + # raise ValueError('self._language_model._extract_n_grams() returned None') + raise ValueError('self._language_model.generate_next_token() returned None') + for ngram in ngrams: + # l_sum += math.log(freq_dict[ngram]) + l_sum += math.log(ngrams[ngram]) + num_ngrams += 1 + l_sum = -(l_sum / num_ngrams) + result = pow(math.e, l_sum) + return result def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type: ignore """ @@ -311,6 +360,31 @@ def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type: or if sequence has inappropriate length, or if methods used return None. """ + if not seq_len: + raise ValueError('Input argument seq_len is empty') + if not prompt: + raise ValueError('Input argument prompt is empty') + if not isinstance(seq_len, int): + raise ValueError('Inappropriate type argument seq_len') + if not isinstance(prompt, str): + raise ValueError('Inappropriate type argument prompt') + + generators_inv = {value: key for key, value in self._generators.items()} + results_list = [] + + for generator, num_type in generators_inv.items(): + text = generator.run(seq_len, prompt) + if text is None: + raise ValueError(f'{generator} methode run() returned None') + perplexity = self._calculate_perplexity(text) + if perplexity is None: + raise ValueError(f'{generator} perplexity is None') + result = GenerationResultDTO(text, perplexity, num_type) + results_list.append((result, result.get_perplexity(), result.get_type())) + + sorted_results = sorted(results_list, key=lambda tup: (tup[2], tup[1])) + + return [res_tuple[0] for res_tuple in sorted_results] class Examiner: diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index df9b49246..a60d214dc 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,8 +2,9 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import -from lab_4_fill_words_by_ngrams.main import (NGramLanguageModel, TopPGenerator, - WordProcessor) +from lab_4_fill_words_by_ngrams.main import (GeneratorTypes, BeamSearchTextGenerator, + NGramLanguageModel, TopPGenerator, + QualityChecker, WordProcessor) def main() -> None: @@ -15,8 +16,15 @@ def main() -> None: word_processor = WordProcessor('') encoded_text = word_processor.encode(text) lang_model = NGramLanguageModel(encoded_text, 2) + lang_model.build() top_p_generator = TopPGenerator(lang_model, word_processor, 0.5) - result = top_p_generator.run(51, 'Vernon') + top_p_result = top_p_generator.run(51, 'Vernon') + generator_types = GeneratorTypes() + # generators = {num: generator_types.get_conversion_generator_type(num) for num in range(1, 3)} + generators = {generator_types.top_p: TopPGenerator(lang_model, word_processor, 0.5), + generator_types.beam_search: BeamSearchTextGenerator(lang_model, word_processor, 5)} + checker = QualityChecker(generators, lang_model, word_processor) + result = checker.run(100, 'The') assert result From b0d5366f685f674fbedda2dbdf39c652413f1dfd Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Tue, 12 Dec 2023 14:23:00 +0300 Subject: [PATCH 097/107] step 5 done --- lab_4_fill_words_by_ngrams/main.py | 59 +++++++++++++++++++-- lab_4_fill_words_by_ngrams/start.py | 1 - lab_4_fill_words_by_ngrams/target_score.txt | 2 +- 3 files changed, 57 insertions(+), 5 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index c14b14b13..4c35c2391 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -9,6 +9,7 @@ from string import punctuation from random import choice import math +import json class WordProcessor(TextProcessor): @@ -205,11 +206,11 @@ def get_conversion_generator_type(self, generator_type: int) -> str: # type: ig (str): Name of the generator. """ if generator_type == self.greedy: - return 'Greedy Generator' + return 'greedy generator' if generator_type == self.top_p: - return 'Top-P Generator' + return 'top-p generator' if generator_type == self.beam_search: - return 'Beam Search Generator' + return 'beam search generator' class GenerationResultDTO: @@ -404,6 +405,10 @@ def __init__(self, json_path: str) -> None: Args: json_path (str): Local path to assets file """ + if not isinstance(json_path, str) or not json_path: + raise ValueError + self._json_path = json_path + self._questions_and_answers = {} def _load_from_json(self) -> dict[tuple[str, int], str]: # type: ignore """ @@ -419,6 +424,16 @@ def _load_from_json(self) -> dict[tuple[str, int], str]: # type: ignore or if attribute _json_path has inappropriate extension, or if inappropriate type loaded data. """ + if not isinstance(self._json_path, str): + raise ValueError('Inappropriate type of attribute _json_path') + if not self._json_path: + raise ValueError('Attribute _json_path is empty') + if self._json_path[-4:] != 'json': + raise ValueError('Attribute _json_path has inappropriate extension') + with open(self._json_path, 'r', encoding="utf-8") as file: + self._questions_and_answers = json.load(file) + if not isinstance(self._questions_and_answers, list): + raise ValueError('Inappropriate type loaded data') def provide_questions(self) -> list[tuple[str, int]]: # type: ignore """ @@ -428,6 +443,8 @@ def provide_questions(self) -> list[tuple[str, int]]: # type: ignore list[tuple[str, int]]: List in the form of [(question, position of the word to be filled)] """ + questions = [(question, place) for (question, place), answer in self._questions_and_answers.items()] + return questions def assess_exam(self, answers: dict[str, str]) -> float: # type: ignore """ @@ -442,6 +459,18 @@ def assess_exam(self, answers: dict[str, str]) -> float: # type: ignore Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ + if not isinstance(answers, dict): + raise ValueError('Inappropriate type input argument') + if not answers: + raise ValueError('Input argument is empty') + num_questions = 0 + score = 0 + right_answers = {question: answer for (question, place), answer in self._questions_and_answers.items()} + for question in answers: + num_questions += 1 + if answers[question] == right_answers[question]: + score += 1 + return score / num_questions class GeneratorRuleStudent: @@ -464,6 +493,13 @@ def __init__( NGramLanguageModel instance to use for text generation word_processor (WordProcessor): WordProcessor instance to handle text processing """ + self._generator_type = generator_type + if generator_type == 0: + self._generator = GreedyTextGenerator(language_model, word_processor) + elif generator_type == 1: + self._generator = TopPGenerator(language_model, word_processor, 0.5) + elif self._generator_type == 2: + self._generator = BeamSearchTextGenerator(language_model, word_processor, 5) def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]: # type: ignore """ @@ -481,6 +517,21 @@ def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]: # type: ig or if input argument is empty, or if methods used return None. """ + if not isinstance(tasks, list): + raise ValueError('Inappropriate type input argument') + if not tasks: + raise ValueError('Input argument is empty') + answers = {} + for (question, place) in tasks: + context = question[:place] + answer = self._generator.run(1, context) + if answer is None: + raise ValueError('self._generator.run() returned None') + if answer[-1] == '.': + answer = answer[:-1] + result = answer + question[place:] + answers[question] = result + return answers def get_generator_type(self) -> str: # type: ignore """ @@ -489,3 +540,5 @@ def get_generator_type(self) -> str: # type: ignore Returns: str: Generator type """ + generator_types = GeneratorTypes() + return generator_types.get_conversion_generator_type(self._generator_type) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index a60d214dc..7a1d57442 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -20,7 +20,6 @@ def main() -> None: top_p_generator = TopPGenerator(lang_model, word_processor, 0.5) top_p_result = top_p_generator.run(51, 'Vernon') generator_types = GeneratorTypes() - # generators = {num: generator_types.get_conversion_generator_type(num) for num in range(1, 3)} generators = {generator_types.top_p: TopPGenerator(lang_model, word_processor, 0.5), generator_types.beam_search: BeamSearchTextGenerator(lang_model, word_processor, 5)} checker = QualityChecker(generators, lang_model, word_processor) diff --git a/lab_4_fill_words_by_ngrams/target_score.txt b/lab_4_fill_words_by_ngrams/target_score.txt index 1e8b31496..f599e28b8 100644 --- a/lab_4_fill_words_by_ngrams/target_score.txt +++ b/lab_4_fill_words_by_ngrams/target_score.txt @@ -1 +1 @@ -6 +10 From b1366314b6d5ed38129263534f60a090e7be3ce5 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Tue, 12 Dec 2023 22:47:07 +0300 Subject: [PATCH 098/107] fixing --- lab_4_fill_words_by_ngrams/main.py | 25 ++++++++----------------- lab_4_fill_words_by_ngrams/start.py | 6 +++--- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 4c35c2391..c542370c1 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -3,13 +3,14 @@ Top-p sampling generation and filling gaps with ngrams """ +import json +import math +from random import choice +from string import punctuation + # pylint:disable=too-few-public-methods, too-many-arguments from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) -from string import punctuation -from random import choice -import math -import json class WordProcessor(TextProcessor): @@ -41,9 +42,6 @@ def _tokenize(self, text: str) -> tuple[str, ...]: # type: ignore if element in '?!.': preprocessed_text += ' ' preprocessed_text += self.get_end_of_word_token() - elif element.isdigit() or (element.isspace() and len(preprocessed_text) > 1 - and preprocessed_text[-1].isspace()) or element in punctuation: - pass elif element.isalpha() or element.isspace(): preprocessed_text += element @@ -87,8 +85,7 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # result = '' for word in decoded_corpus: if word == self.get_end_of_word_token(): - result = result[:-1] - result += '.' + result = result[:-1] + '.' else: for letter in word: if not result or (len(result) > 2 and result[-2] == '.'): @@ -96,8 +93,7 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # else: result += letter result += ' ' - if result[-1] == ' ': - result = result[:-1] + result = result.strip() if result[-1] != '.': result += '.' return result @@ -326,17 +322,12 @@ def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore encoded_text = self._word_processor.encode(generated_text) if encoded_text is None: raise ValueError('self._word_processor.encode() returned None') - self._language_model.build() - # freq_dict = self._language_model._n_gram_frequencies l_sum = 0 num_ngrams = 0 - # ngrams = self._language_model._extract_n_grams(encoded_text) ngrams = self._language_model.generate_next_token(encoded_text) if ngrams is None: - # raise ValueError('self._language_model._extract_n_grams() returned None') raise ValueError('self._language_model.generate_next_token() returned None') for ngram in ngrams: - # l_sum += math.log(freq_dict[ngram]) l_sum += math.log(ngrams[ngram]) num_ngrams += 1 l_sum = -(l_sum / num_ngrams) @@ -428,7 +419,7 @@ def _load_from_json(self) -> dict[tuple[str, int], str]: # type: ignore raise ValueError('Inappropriate type of attribute _json_path') if not self._json_path: raise ValueError('Attribute _json_path is empty') - if self._json_path[-4:] != 'json': + if not self._json_path.endswith('json'): raise ValueError('Attribute _json_path has inappropriate extension') with open(self._json_path, 'r', encoding="utf-8") as file: self._questions_and_answers = json.load(file) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index 7a1d57442..4baea1ca0 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,9 +2,9 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import -from lab_4_fill_words_by_ngrams.main import (GeneratorTypes, BeamSearchTextGenerator, - NGramLanguageModel, TopPGenerator, - QualityChecker, WordProcessor) +from lab_4_fill_words_by_ngrams.main import (BeamSearchTextGenerator, GeneratorTypes, + NGramLanguageModel, QualityChecker, TopPGenerator, + WordProcessor) def main() -> None: From 8d5b3b27d867c7bacc6456f524772a53d2ff9743 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Tue, 12 Dec 2023 22:51:20 +0300 Subject: [PATCH 099/107] fixing --- lab_4_fill_words_by_ngrams/main.py | 12 +++++++----- lab_4_fill_words_by_ngrams/start.py | 3 ++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index c542370c1..ea69bddfc 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -6,7 +6,6 @@ import json import math from random import choice -from string import punctuation # pylint:disable=too-few-public-methods, too-many-arguments from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, @@ -207,6 +206,7 @@ def get_conversion_generator_type(self, generator_type: int) -> str: # type: ig return 'top-p generator' if generator_type == self.beam_search: return 'beam search generator' + return None class GenerationResultDTO: @@ -284,7 +284,7 @@ class QualityChecker: """ def __init__( - self, generators: dict, language_model: NGramLanguageModel, word_processor: WordProcessor + self, generators: dict, language_model: NGramLanguageModel, word_processor: WordProcessor ) -> None: """ Initialize an instance of QualityChecker. @@ -434,7 +434,8 @@ def provide_questions(self) -> list[tuple[str, int]]: # type: ignore list[tuple[str, int]]: List in the form of [(question, position of the word to be filled)] """ - questions = [(question, place) for (question, place), answer in self._questions_and_answers.items()] + questions = [(question, place) for (question, place), answer + in self._questions_and_answers.items()] return questions def assess_exam(self, answers: dict[str, str]) -> float: # type: ignore @@ -456,7 +457,8 @@ def assess_exam(self, answers: dict[str, str]) -> float: # type: ignore raise ValueError('Input argument is empty') num_questions = 0 score = 0 - right_answers = {question: answer for (question, place), answer in self._questions_and_answers.items()} + right_answers = {question: answer for (question, place), answer + in self._questions_and_answers.items()} for question in answers: num_questions += 1 if answers[question] == right_answers[question]: @@ -473,7 +475,7 @@ class GeneratorRuleStudent: _generator_type: int def __init__( - self, generator_type: int, language_model: NGramLanguageModel, word_processor: WordProcessor + self, generator_type: int, language_model: NGramLanguageModel, word_processor: WordProcessor ) -> None: """ Initialize an instance of GeneratorRuleStudent. diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index 4baea1ca0..9ce9ff5c6 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -21,7 +21,8 @@ def main() -> None: top_p_result = top_p_generator.run(51, 'Vernon') generator_types = GeneratorTypes() generators = {generator_types.top_p: TopPGenerator(lang_model, word_processor, 0.5), - generator_types.beam_search: BeamSearchTextGenerator(lang_model, word_processor, 5)} + generator_types.beam_search: BeamSearchTextGenerator(lang_model, + word_processor, 5)} checker = QualityChecker(generators, lang_model, word_processor) result = checker.run(100, 'The') assert result From 965f941d086755f02a43ff26ed7b9e9d2d47a0ce Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Tue, 12 Dec 2023 22:55:33 +0300 Subject: [PATCH 100/107] fixing --- lab_4_fill_words_by_ngrams/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index ea69bddfc..1ad018898 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -206,7 +206,7 @@ def get_conversion_generator_type(self, generator_type: int) -> str: # type: ig return 'top-p generator' if generator_type == self.beam_search: return 'beam search generator' - return None + return '' class GenerationResultDTO: @@ -322,8 +322,8 @@ def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore encoded_text = self._word_processor.encode(generated_text) if encoded_text is None: raise ValueError('self._word_processor.encode() returned None') - l_sum = 0 - num_ngrams = 0 + l_sum = 0.0 + num_ngrams = 0.0 ngrams = self._language_model.generate_next_token(encoded_text) if ngrams is None: raise ValueError('self._language_model.generate_next_token() returned None') @@ -517,7 +517,7 @@ def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]: # type: ig answers = {} for (question, place) in tasks: context = question[:place] - answer = self._generator.run(1, context) + answer = self._generator.run(seq_len=1, prompt=context) if answer is None: raise ValueError('self._generator.run() returned None') if answer[-1] == '.': From 5e034c71874d65623f731513f087a13f8b09dd8d Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Tue, 12 Dec 2023 22:58:23 +0300 Subject: [PATCH 101/107] fixing --- lab_4_fill_words_by_ngrams/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 1ad018898..f9b0b31e9 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -332,7 +332,9 @@ def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore num_ngrams += 1 l_sum = -(l_sum / num_ngrams) result = pow(math.e, l_sum) - return result + if type(result) == float: + return result + return 0.0 def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type: ignore """ From f07fcd470b982217ab3bd5a8c8af260ebdc94020 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Tue, 12 Dec 2023 23:01:23 +0300 Subject: [PATCH 102/107] fixing --- lab_4_fill_words_by_ngrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index f9b0b31e9..9d23bd4ae 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -332,7 +332,7 @@ def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore num_ngrams += 1 l_sum = -(l_sum / num_ngrams) result = pow(math.e, l_sum) - if type(result) == float: + if isinstance(result, float): return result return 0.0 From dc81d086117c7119694778dd919bccd6690f3517 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Mon, 18 Dec 2023 08:59:08 +0300 Subject: [PATCH 103/107] fixing --- lab_4_fill_words_by_ngrams/main.py | 115 +++++++++++++++++------------ 1 file changed, 68 insertions(+), 47 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 9d23bd4ae..9be7694ce 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -34,17 +34,24 @@ def _tokenize(self, text: str) -> tuple[str, ...]: # type: ignore """ if not isinstance(text, str) or not text: raise ValueError('Type input is inappropriate or input argument is empty.') - text = repr(text) - text = text.replace('\\n', ' ') - preprocessed_text = '' - for element in text.lower(): - if element in '?!.': - preprocessed_text += ' ' - preprocessed_text += self.get_end_of_word_token() - elif element.isalpha() or element.isspace(): - preprocessed_text += element - - return tuple(preprocessed_text.split(' ')) + + for digit in ('.', '!', '?'): + text = text.replace(digit, f" {self._end_of_word_token} ") + + tokenized_word = [] + for word in text.lower().split(): + if word == self._end_of_word_token or word.isalpha() or word.isspace(): + tokenized_word.append(word) + continue + + clean_word = [] + for alpha in list(word): + if alpha.isalpha(): + clean_word.append(alpha) + if clean_word: + tokenized_word.append("".join(clean_word)) + + return tuple(tokenized_word) def _put(self, element: str) -> None: """ @@ -84,15 +91,14 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # result = '' for word in decoded_corpus: if word == self.get_end_of_word_token(): - result = result[:-1] + '.' + result += '.' + elif not result: + result += word.capitalize() + elif result[-1] == '.': + result += ' ' + word.capitalize() else: - for letter in word: - if not result or (len(result) > 2 and result[-2] == '.'): - result += letter.upper() - else: - result += letter - result += ' ' - result = result.strip() + result += ' ' + word + if result[-1] != '.': result += '.' return result @@ -201,11 +207,11 @@ def get_conversion_generator_type(self, generator_type: int) -> str: # type: ig (str): Name of the generator. """ if generator_type == self.greedy: - return 'greedy generator' + return 'Greedy Generator' if generator_type == self.top_p: - return 'top-p generator' + return 'Top-P Generator' if generator_type == self.beam_search: - return 'beam search generator' + return 'Beam Search Generator' return '' @@ -320,21 +326,29 @@ def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore if not isinstance(generated_text, str): raise ValueError('Inappropriate type argument') encoded_text = self._word_processor.encode(generated_text) - if encoded_text is None: + if not encoded_text: raise ValueError('self._word_processor.encode() returned None') + ngram_size = self._language_model.get_n_gram_size() l_sum = 0.0 - num_ngrams = 0.0 - ngrams = self._language_model.generate_next_token(encoded_text) - if ngrams is None: - raise ValueError('self._language_model.generate_next_token() returned None') - for ngram in ngrams: - l_sum += math.log(ngrams[ngram]) - num_ngrams += 1 - l_sum = -(l_sum / num_ngrams) - result = pow(math.e, l_sum) - if isinstance(result, float): - return result - return 0.0 + + for index in range(ngram_size - 1, len(encoded_text)): + context = tuple(encoded_text[index - ngram_size + 1: index]) + token = encoded_text[index] + tokens = self._language_model.generate_next_token(context) + + if tokens is None: + raise ValueError('self._language_model.generate_next_token() returned None') + + probability = tokens.get(token) + if probability is None: + continue + + l_sum += math.log(probability) + if not l_sum: + raise ValueError("Probability sum is 0") + + result = math.exp(-l_sum / (len(encoded_text) - ngram_size)) + return result def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type: ignore """ @@ -367,7 +381,7 @@ def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type: results_list = [] for generator, num_type in generators_inv.items(): - text = generator.run(seq_len, prompt) + text = generator.run(seq_len=seq_len, prompt=prompt) if text is None: raise ValueError(f'{generator} methode run() returned None') perplexity = self._calculate_perplexity(text) @@ -423,11 +437,20 @@ def _load_from_json(self) -> dict[tuple[str, int], str]: # type: ignore raise ValueError('Attribute _json_path is empty') if not self._json_path.endswith('json'): raise ValueError('Attribute _json_path has inappropriate extension') + with open(self._json_path, 'r', encoding="utf-8") as file: - self._questions_and_answers = json.load(file) - if not isinstance(self._questions_and_answers, list): + questions = json.load(file) + + if not isinstance(questions, list): raise ValueError('Inappropriate type loaded data') + self._questions_and_answers = { + (dictionary['question'], dictionary['location']): dictionary['answer'] + for dictionary in questions + } + + return self._questions_and_answers + def provide_questions(self) -> list[tuple[str, int]]: # type: ignore """ Provide questions for an exam. @@ -436,8 +459,8 @@ def provide_questions(self) -> list[tuple[str, int]]: # type: ignore list[tuple[str, int]]: List in the form of [(question, position of the word to be filled)] """ - questions = [(question, place) for (question, place), answer - in self._questions_and_answers.items()] + self._load_from_json() + questions = list(self._questions_and_answers.keys()) return questions def assess_exam(self, answers: dict[str, str]) -> float: # type: ignore @@ -489,12 +512,10 @@ def __init__( word_processor (WordProcessor): WordProcessor instance to handle text processing """ self._generator_type = generator_type - if generator_type == 0: - self._generator = GreedyTextGenerator(language_model, word_processor) - elif generator_type == 1: - self._generator = TopPGenerator(language_model, word_processor, 0.5) - elif self._generator_type == 2: - self._generator = BeamSearchTextGenerator(language_model, word_processor, 5) + generators = (GreedyTextGenerator(language_model, word_processor), + TopPGenerator(language_model, word_processor, 0.5), + BeamSearchTextGenerator(language_model, word_processor, 5)) + self._generator = generators[generator_type] def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]: # type: ignore """ @@ -523,7 +544,7 @@ def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]: # type: ig if answer is None: raise ValueError('self._generator.run() returned None') if answer[-1] == '.': - answer = answer[:-1] + answer = answer[:-1] + ' ' result = answer + question[place:] answers[question] = result return answers From 38a30a0dc39ec3266562a46a2238829e1960f829 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Mon, 18 Dec 2023 13:58:28 +0300 Subject: [PATCH 104/107] fixing --- lab_4_fill_words_by_ngrams/main.py | 3 +-- lab_4_fill_words_by_ngrams/start.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 9be7694ce..eac8f1206 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -415,7 +415,7 @@ def __init__(self, json_path: str) -> None: if not isinstance(json_path, str) or not json_path: raise ValueError self._json_path = json_path - self._questions_and_answers = {} + self._questions_and_answers = self._load_from_json() def _load_from_json(self) -> dict[tuple[str, int], str]: # type: ignore """ @@ -459,7 +459,6 @@ def provide_questions(self) -> list[tuple[str, int]]: # type: ignore list[tuple[str, int]]: List in the form of [(question, position of the word to be filled)] """ - self._load_from_json() questions = list(self._questions_and_answers.keys()) return questions diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index 9ce9ff5c6..e5873f2e7 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,9 +2,9 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import -from lab_4_fill_words_by_ngrams.main import (BeamSearchTextGenerator, GeneratorTypes, - NGramLanguageModel, QualityChecker, TopPGenerator, - WordProcessor) +from lab_4_fill_words_by_ngrams.main import (BeamSearchTextGenerator, Examiner, GeneratorRuleStudent, + GeneratorTypes, NGramLanguageModel, QualityChecker, + TopPGenerator, WordProcessor) def main() -> None: @@ -24,7 +24,19 @@ def main() -> None: generator_types.beam_search: BeamSearchTextGenerator(lang_model, word_processor, 5)} checker = QualityChecker(generators, lang_model, word_processor) - result = checker.run(100, 'The') + # checker_result = checker.run(100, 'The') + examiner = Examiner('assets/questions_and_answers.json') + students = [GeneratorRuleStudent(generator_types.greedy, lang_model, word_processor), + GeneratorRuleStudent(generator_types.top_p, lang_model, word_processor), + GeneratorRuleStudent(generator_types.beam_search, lang_model, word_processor)] + questions = examiner.provide_questions() + answers = {student: student.take_exam(questions) for student in students} + assessment = {student: examiner.assess_exam(answer) for student, answer in answers.items()} + result = "" + for student, accuracy in assessment.items(): + result += f"Accuracy of student ({student.get_generator_type()}): {accuracy}\n" + print(result) + assert result From e5bfd3330d08274f23f0dbdec09d0e9170eea943 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Mon, 18 Dec 2023 14:00:19 +0300 Subject: [PATCH 105/107] fixing --- lab_4_fill_words_by_ngrams/start.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index e5873f2e7..83850435e 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,9 +2,10 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import -from lab_4_fill_words_by_ngrams.main import (BeamSearchTextGenerator, Examiner, GeneratorRuleStudent, - GeneratorTypes, NGramLanguageModel, QualityChecker, - TopPGenerator, WordProcessor) +from lab_4_fill_words_by_ngrams.main import (BeamSearchTextGenerator, Examiner, + GeneratorRuleStudent, GeneratorTypes, + NGramLanguageModel, QualityChecker, TopPGenerator, + WordProcessor) def main() -> None: From 4b624f8ae9a3a688a4f6153f4ff8d99475f41ca7 Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Mon, 18 Dec 2023 14:21:50 +0300 Subject: [PATCH 106/107] fixing --- lab_4_fill_words_by_ngrams/start.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index 83850435e..cf3eba47a 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -19,23 +19,24 @@ def main() -> None: lang_model = NGramLanguageModel(encoded_text, 2) lang_model.build() top_p_generator = TopPGenerator(lang_model, word_processor, 0.5) - top_p_result = top_p_generator.run(51, 'Vernon') + generator_types = GeneratorTypes() generators = {generator_types.top_p: TopPGenerator(lang_model, word_processor, 0.5), generator_types.beam_search: BeamSearchTextGenerator(lang_model, word_processor, 5)} checker = QualityChecker(generators, lang_model, word_processor) - # checker_result = checker.run(100, 'The') - examiner = Examiner('assets/questions_and_answers.json') + + examiner = Examiner("/Users/alesamaskovceva/Documents/python/2023-2-level-labs/lab_4_fill_words_by_ngrams" + "/assets/question_and_answers.json") + questions = examiner.provide_questions() students = [GeneratorRuleStudent(generator_types.greedy, lang_model, word_processor), GeneratorRuleStudent(generator_types.top_p, lang_model, word_processor), GeneratorRuleStudent(generator_types.beam_search, lang_model, word_processor)] - questions = examiner.provide_questions() answers = {student: student.take_exam(questions) for student in students} assessment = {student: examiner.assess_exam(answer) for student, answer in answers.items()} result = "" for student, accuracy in assessment.items(): - result += f"Accuracy of student ({student.get_generator_type()}): {accuracy}\n" + result += f"{student.get_generator_type()}: {accuracy}\n" print(result) assert result From a722d587e176816ee54033e02b69cc9af898126d Mon Sep 17 00:00:00 2001 From: alesiamashkovtseva Date: Mon, 18 Dec 2023 14:23:39 +0300 Subject: [PATCH 107/107] fixing --- lab_4_fill_words_by_ngrams/start.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index cf3eba47a..53ce7e9cd 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -26,7 +26,8 @@ def main() -> None: word_processor, 5)} checker = QualityChecker(generators, lang_model, word_processor) - examiner = Examiner("/Users/alesamaskovceva/Documents/python/2023-2-level-labs/lab_4_fill_words_by_ngrams" + examiner = Examiner("/Users/alesamaskovceva/Documents/python" + "/2023-2-level-labs/lab_4_fill_words_by_ngrams" "/assets/question_and_answers.json") questions = examiner.provide_questions() students = [GeneratorRuleStudent(generator_types.greedy, lang_model, word_processor),