From 2b71dbe60816a1b2062fa41238c0e52b2abcf3fc Mon Sep 17 00:00:00 2001 From: mmarina Date: Fri, 15 Sep 2023 10:45:01 +0300 Subject: [PATCH 01/81] add main --- lab_1_classify_by_unigrams/target_score.txt | 2 +- requirements.txt | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt index 573541ac9..f599e28b8 100644 --- a/lab_1_classify_by_unigrams/target_score.txt +++ b/lab_1_classify_by_unigrams/target_score.txt @@ -1 +1 @@ -0 +10 diff --git a/requirements.txt b/requirements.txt index 8b1378917..2c7efb1e5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,16 @@ - +ast-comments==1.0.1 +black==22.6.0 +coverage[toml]==6.4.4 +ghapi==0.1.19 +flake8==6.0.0 +flake8-isort==6.0.0 +mypy==1.1.1 +pymarkdownlnt==0.9.9 +pymdown-extensions==9.5 +pydantic==1.10.7 +pylint==2.15.10 +pyspelling==2.7.3 +pytest==6.2.5 +regex==2023.3.23 +typed-argument-parser==1.8.1 +tqdm==4.64.1 From be684a1883f1ef85b141bf9208009bf020619510 Mon Sep 17 00:00:00 2001 From: mmarina Date: Fri, 15 Sep 2023 12:54:45 +0300 Subject: [PATCH 02/81] add git --- new attempt.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 new attempt.py diff --git a/new attempt.py b/new attempt.py new file mode 100644 index 000000000..3db933357 --- /dev/null +++ b/new attempt.py @@ -0,0 +1 @@ +print(5) From b6368f8d0a3b82a494157f74c1b8ae8e67eeed35 Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 20 Sep 2023 12:25:59 +0300 Subject: [PATCH 03/81] git commit --- lab_1_classify_by_unigrams/main.py | 39 +++++++++++++++--------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 486b3d65c..195af3fd6 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -2,32 +2,33 @@ Lab 1 Language detection """ - - def tokenize(text: str) -> list[str] | None: - """ - Splits a text into tokens, converts the tokens into lowercase, - removes punctuation, digits and other symbols - :param text: a text - :return: a list of lower-cased tokens without punctuation - """ + text = text.lower() + cleaned_text = "" + for symbol in text: + for letter in symbol: + if letter.isalpha() and symbol != " ": + cleaned_text += letter + tokens = list(cleaned_text) + if not isinstance(text, str): + return None + return tokens def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: - """ - Calculates frequencies of given tokens - :param tokens: a list of tokens - :return: a dictionary with frequencies - """ + freqs = {} + element_count = len(tokens) + for token in tokens: + if token in freqs: + freqs[token] += 1 / element_count + else: + freqs[token] = 1 / element_count + return freqs def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None: - """ - Creates a language profile - :param language: a language - :param text: a text - :return: a dictionary with two keys – name, freq - """ + dict_language_profile = {"name": language, "freq": calculate_frequencies(tokenize(text))} + return dict_language_profile def calculate_mse(predicted: list, actual: list) -> float | None: From 323fc7c4e3a9b5d5b8ef569eca1d6f8fcae1608c Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 21 Sep 2023 19:44:39 +0300 Subject: [PATCH 04/81] added fixes --- lab_1_classify_by_unigrams/main.py | 23 ++++++++++++----------- new attempt.py | 1 - 2 files changed, 12 insertions(+), 12 deletions(-) delete mode 100644 new attempt.py diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 195af3fd6..20c57d60c 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -3,26 +3,27 @@ Language detection """ def tokenize(text: str) -> list[str] | None: - text = text.lower() - cleaned_text = "" - for symbol in text: - for letter in symbol: - if letter.isalpha() and symbol != " ": - cleaned_text += letter - tokens = list(cleaned_text) if not isinstance(text, str): return None - return tokens + else: + text = text.lower() + cleaned_text = "" + for symbol in text: + if symbol.isalpha() and symbol != " ": + cleaned_text += symbol + tokens = list(cleaned_text) + return tokens def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: freqs = {} - element_count = len(tokens) for token in tokens: if token in freqs: - freqs[token] += 1 / element_count + freqs[token] += 1 else: - freqs[token] = 1 / element_count + freqs[token] = 1 + for token, freq in freqs: + freqs[token] = freq / len(tokens) return freqs diff --git a/new attempt.py b/new attempt.py deleted file mode 100644 index 3db933357..000000000 --- a/new attempt.py +++ /dev/null @@ -1 +0,0 @@ -print(5) From 01ea1a1de204f7b12b1f95ad1a3bd993ddcd8213 Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 27 Sep 2023 21:43:44 +0300 Subject: [PATCH 05/81] add fixes --- lab_1_classify_by_unigrams/main.py | 43 +++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index e91bed965..a368c6819 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -22,7 +22,7 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: freqs[token] += 1 else: freqs[token] = 1 - for token, freq in freqs: + for token, freq in freqs.items(): freqs[token] = freq / len(tokens) return freqs @@ -39,6 +39,17 @@ def calculate_mse(predicted: list, actual: list) -> float | None: :param actual: a list of actual values :return: the score """ + count_actual = len(actual) + count_predicted = len(predicted) + summ_values = 0 + if isinstance(actual, list) and isinstance(predicted, list) and count_actual == count_predicted: + squared_difference = [(actual_value - predicted_value)**2 for actual_value, predicted_value in zip(actual,predicted)] + for value in squared_difference: + summ_values += value + mse = round(summ_values / count_actual, 3) + return mse + else: + return None def compare_profiles( @@ -51,6 +62,23 @@ def compare_profiles( :param profile_to_compare: a dictionary of a profile to compare the unknown profile to :return: the distance between the profiles """ + if isinstance(unknown_profile, dict) and isinstance(profile_to_compare, dict): + values_unknown_profile = unknown_profile['freq'] + values_profile_to_compare = profile_to_compare['freq'] + for letter in values_unknown_profile: + if letter not in values_profile_to_compare: + values_profile_to_compare[letter] = 0 + for letter in values_profile_to_compare: + if letter not in values_unknown_profile: + values_unknown_profile[letter] = 0 + sorted_unknown_profile = dict(sorted(values_unknown_profile.items())) + sorted_profile_to_compare = dict(sorted(values_profile_to_compare.items())) + list_unknown_profile = list(sorted_unknown_profile.values()) + list_profile_to_compare = list(sorted_profile_to_compare.values()) + profile_difference = calculate_mse(list_unknown_profile, list_profile_to_compare) + return profile_difference + else: + return None def detect_language( @@ -65,6 +93,19 @@ def detect_language( :param profile_2: a dictionary of a known profile :return: a language """ + if isinstance(unknown_profile, dict) and isinstance(profile_1, dict) and isinstance(profile_2, dict): + mse_profile_1 = compare_profiles(unknown_profile, profile_1) + mse_profile_2 = compare_profiles(unknown_profile, profile_2) + if mse_profile_1 < mse_profile_2: + return profile_1['name'] + elif mse_profile_2 < mse_profile_1: + return profile_2['name'] + else: + str_name_language = sorted(profile_1['name'] + profile_2['name']) + first_name = str_name_language[0] + return first_name + else: + return None def load_profile(path_to_file: str) -> dict | None: From 05842a0d87e025e380a01261b21cbada3b1ff660 Mon Sep 17 00:00:00 2001 From: mmarina Date: Fri, 29 Sep 2023 09:48:25 +0300 Subject: [PATCH 06/81] i start do lab --- lab_1_classify_by_unigrams/start.py | 9 ++++++--- lab_1_classify_by_unigrams/target_score.txt | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index db7a1a904..fcab139a4 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -1,19 +1,22 @@ """ Language detection starter """ - - +from lab_1_classify_by_unigrams.main import create_language_profile +from lab_1_classify_by_unigrams.main import detect_language def main() -> None: """ Launches an implementation """ with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: en_text = file_to_read_en.read() + en_profile = create_language_profile("en", en_text) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() + de_profile = create_language_profile("de", de_text) with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() - result = None + unknown_profile = create_language_profile("unk", unknown_text) + result = detect_language(unknown_profile, en_profile, de_profile) assert result, "Detection result is None" diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt index f599e28b8..45a4fb75d 100644 --- a/lab_1_classify_by_unigrams/target_score.txt +++ b/lab_1_classify_by_unigrams/target_score.txt @@ -1 +1 @@ -10 +8 From 5604898e0667a927720b14b7265ed312ab0c4388 Mon Sep 17 00:00:00 2001 From: mmarina Date: Fri, 29 Sep 2023 09:48:35 +0300 Subject: [PATCH 07/81] i start do lab --- requirements.txt | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2c7efb1e5..e69de29bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +0,0 @@ -ast-comments==1.0.1 -black==22.6.0 -coverage[toml]==6.4.4 -ghapi==0.1.19 -flake8==6.0.0 -flake8-isort==6.0.0 -mypy==1.1.1 -pymarkdownlnt==0.9.9 -pymdown-extensions==9.5 -pydantic==1.10.7 -pylint==2.15.10 -pyspelling==2.7.3 -pytest==6.2.5 -regex==2023.3.23 -typed-argument-parser==1.8.1 -tqdm==4.64.1 From 9af8720803e32981d996dee6252eecf597c581db Mon Sep 17 00:00:00 2001 From: mmarina Date: Mon, 2 Oct 2023 22:34:12 +0300 Subject: [PATCH 08/81] added fixes --- lab_1_classify_by_unigrams/main.py | 114 +++++++++++++++++------------ 1 file changed, 68 insertions(+), 46 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index a368c6819..e34346033 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -2,20 +2,35 @@ Lab 1 Language detection """ + + def tokenize(text: str) -> list[str] | None: + """ + Splits a text into tokens, converts the tokens into lowercase, + removes punctuation, digits and other symbols + :param text: a text + :return: a list of lower-cased tokens without punctuation + """ if not isinstance(text, str): return None - else: - text = text.lower() - cleaned_text = "" - for symbol in text: - if symbol.isalpha() and symbol != " ": - cleaned_text += symbol - tokens = list(cleaned_text) - return tokens + cleaned_text = [] + for symbol in text: + if symbol.isalpha(): + cleaned_text.append(symbol.lower()) + return cleaned_text def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: + """ + Calculates frequencies of given tokens + :param tokens: a list of tokens + :return: a dictionary with frequencies + """ + if not isinstance(tokens, list): + return None + for token in tokens: + if not isinstance(token, str): + return None freqs = {} for token in tokens: if token in freqs: @@ -28,6 +43,14 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None: + """ + Creates a language profile + :param language: a language + :param text: a text + :return: a dictionary with two keys – name, freq + """ + if not isinstance(language, str) or not isinstance(text, str): + return None dict_language_profile = {"name": language, "freq": calculate_frequencies(tokenize(text))} return dict_language_profile @@ -39,17 +62,13 @@ def calculate_mse(predicted: list, actual: list) -> float | None: :param actual: a list of actual values :return: the score """ - count_actual = len(actual) - count_predicted = len(predicted) - summ_values = 0 - if isinstance(actual, list) and isinstance(predicted, list) and count_actual == count_predicted: - squared_difference = [(actual_value - predicted_value)**2 for actual_value, predicted_value in zip(actual,predicted)] - for value in squared_difference: - summ_values += value - mse = round(summ_values / count_actual, 3) - return mse - else: + if not isinstance(actual, list) or not isinstance(predicted, list) or len(actual) != len(predicted): return None + summ_values = 0 + for i, predicted_value in enumerate(predicted): + summ_values += (actual[i] - predicted_value)**2 + mse = round(summ_values / len(actual), 4) + return mse def compare_profiles( @@ -62,23 +81,25 @@ def compare_profiles( :param profile_to_compare: a dictionary of a profile to compare the unknown profile to :return: the distance between the profiles """ - if isinstance(unknown_profile, dict) and isinstance(profile_to_compare, dict): - values_unknown_profile = unknown_profile['freq'] - values_profile_to_compare = profile_to_compare['freq'] - for letter in values_unknown_profile: - if letter not in values_profile_to_compare: - values_profile_to_compare[letter] = 0 - for letter in values_profile_to_compare: - if letter not in values_unknown_profile: - values_unknown_profile[letter] = 0 - sorted_unknown_profile = dict(sorted(values_unknown_profile.items())) - sorted_profile_to_compare = dict(sorted(values_profile_to_compare.items())) - list_unknown_profile = list(sorted_unknown_profile.values()) - list_profile_to_compare = list(sorted_profile_to_compare.values()) - profile_difference = calculate_mse(list_unknown_profile, list_profile_to_compare) - return profile_difference - else: + if (not isinstance(unknown_profile, dict) or + not isinstance(profile_to_compare, dict) or + 'name' not in unknown_profile or + 'name' not in profile_to_compare): return None + values_unknown_profile = unknown_profile['freq'] + values_profile_to_compare = profile_to_compare['freq'] + for letter in values_unknown_profile: + if letter not in values_profile_to_compare: + values_profile_to_compare[letter] = 0 + for letter in values_profile_to_compare: + if letter not in values_unknown_profile: + values_unknown_profile[letter] = 0 + sorted_unknown_profile = dict(sorted(values_unknown_profile.items())) + sorted_profile_to_compare = dict(sorted(values_profile_to_compare.items())) + list_unknown_profile = list(sorted_unknown_profile.values()) + list_profile_to_compare = list(sorted_profile_to_compare.values()) + profile_difference = calculate_mse(list_unknown_profile, list_profile_to_compare) + return profile_difference def detect_language( @@ -93,19 +114,20 @@ def detect_language( :param profile_2: a dictionary of a known profile :return: a language """ - if isinstance(unknown_profile, dict) and isinstance(profile_1, dict) and isinstance(profile_2, dict): - mse_profile_1 = compare_profiles(unknown_profile, profile_1) - mse_profile_2 = compare_profiles(unknown_profile, profile_2) - if mse_profile_1 < mse_profile_2: - return profile_1['name'] - elif mse_profile_2 < mse_profile_1: - return profile_2['name'] - else: - str_name_language = sorted(profile_1['name'] + profile_2['name']) - first_name = str_name_language[0] - return first_name - else: + if (not isinstance(unknown_profile, dict) or + not isinstance(profile_1, dict) or + not isinstance(profile_2, dict)): return None + mse_profile_1 = compare_profiles(unknown_profile, profile_1) + mse_profile_2 = compare_profiles(unknown_profile, profile_2) + if mse_profile_1 < mse_profile_2: + return profile_1['name'] + if mse_profile_2 < mse_profile_1: + return profile_2['name'] + else: + str_name_language = sorted(profile_1['name'] + profile_2['name']) + first_name = str_name_language[0] + return first_name def load_profile(path_to_file: str) -> dict | None: From 4c329e5378500ad1f53b4779ba00e7ddc3c68a55 Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 3 Oct 2023 09:16:03 +0300 Subject: [PATCH 09/81] added fixes --- lab_1_classify_by_unigrams/main.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index e34346033..068a46608 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -62,7 +62,9 @@ def calculate_mse(predicted: list, actual: list) -> float | None: :param actual: a list of actual values :return: the score """ - if not isinstance(actual, list) or not isinstance(predicted, list) or len(actual) != len(predicted): + if (not isinstance(actual, list) or + not isinstance(predicted, list) or + len(actual) != len(predicted)): return None summ_values = 0 for i, predicted_value in enumerate(predicted): @@ -120,14 +122,15 @@ def detect_language( return None mse_profile_1 = compare_profiles(unknown_profile, profile_1) mse_profile_2 = compare_profiles(unknown_profile, profile_2) - if mse_profile_1 < mse_profile_2: - return profile_1['name'] - if mse_profile_2 < mse_profile_1: - return profile_2['name'] - else: - str_name_language = sorted(profile_1['name'] + profile_2['name']) - first_name = str_name_language[0] - return first_name + if (isinstance(mse_profile_1, float) + and isinstance(mse_profile_2, float)): + if mse_profile_1 < mse_profile_2: + return profile_1['name'] + if mse_profile_2 < mse_profile_1: + return profile_2['name'] + str_name_language = sorted(profile_1['name'] + profile_2['name']) + first_name = str_name_language[0] + return first_name def load_profile(path_to_file: str) -> dict | None: From 78e5f4a82b4c40b865584762f79d27dabceae5b5 Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 3 Oct 2023 09:24:29 +0300 Subject: [PATCH 10/81] added fixes in start --- lab_1_classify_by_unigrams/start.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index fcab139a4..e80b67f86 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -1,24 +1,27 @@ """ Language detection starter """ -from lab_1_classify_by_unigrams.main import create_language_profile -from lab_1_classify_by_unigrams.main import detect_language +from lab_1_classify_by_unigrams.main import create_language_profile, detect_language + + def main() -> None: """ Launches an implementation """ with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: en_text = file_to_read_en.read() - en_profile = create_language_profile("en", en_text) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() - de_profile = create_language_profile("de", de_text) with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() - unknown_profile = create_language_profile("unk", unknown_text) - result = detect_language(unknown_profile, en_profile, de_profile) - assert result, "Detection result is None" + en_profile = create_language_profile('en', en_text) + de_profile = create_language_profile('de', de_text) + unknown_profile = create_language_profile('unknown', unknown_text) + if (isinstance(en_profile, dict) and isinstance(de_profile, dict) and + isinstance(unknown_profile, dict)): + result = detect_language(unknown_profile, en_profile, de_profile) + assert result, "Detection result is None" if __name__ == "__main__": - main() + main() \ No newline at end of file From b8f85b9738c47f1027de430b8e04a397d5029b16 Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 3 Oct 2023 10:08:25 +0300 Subject: [PATCH 11/81] added fixes --- lab_1_classify_by_unigrams/main.py | 45 +++++++++++------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 068a46608..eba79d11c 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -51,8 +51,7 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st """ if not isinstance(language, str) or not isinstance(text, str): return None - dict_language_profile = {"name": language, "freq": calculate_frequencies(tokenize(text))} - return dict_language_profile + return {"name": language, "freq": calculate_frequencies(tokenize(text))} def calculate_mse(predicted: list, actual: list) -> float | None: @@ -62,8 +61,7 @@ def calculate_mse(predicted: list, actual: list) -> float | None: :param actual: a list of actual values :return: the score """ - if (not isinstance(actual, list) or - not isinstance(predicted, list) or + if (not isinstance(actual, list) or not isinstance(predicted, list) or len(actual) != len(predicted)): return None summ_values = 0 @@ -83,25 +81,17 @@ def compare_profiles( :param profile_to_compare: a dictionary of a profile to compare the unknown profile to :return: the distance between the profiles """ - if (not isinstance(unknown_profile, dict) or - not isinstance(profile_to_compare, dict) or - 'name' not in unknown_profile or - 'name' not in profile_to_compare): + if (not isinstance(unknown_profile, dict) or not isinstance(profile_to_compare, dict) or + 'name' not in unknown_profile or 'name' not in profile_to_compare): return None - values_unknown_profile = unknown_profile['freq'] - values_profile_to_compare = profile_to_compare['freq'] - for letter in values_unknown_profile: - if letter not in values_profile_to_compare: - values_profile_to_compare[letter] = 0 - for letter in values_profile_to_compare: - if letter not in values_unknown_profile: - values_unknown_profile[letter] = 0 - sorted_unknown_profile = dict(sorted(values_unknown_profile.items())) - sorted_profile_to_compare = dict(sorted(values_profile_to_compare.items())) - list_unknown_profile = list(sorted_unknown_profile.values()) - list_profile_to_compare = list(sorted_profile_to_compare.values()) - profile_difference = calculate_mse(list_unknown_profile, list_profile_to_compare) - return profile_difference + tokens = set(profile_to_compare['freq'].keys()) + tokens.update(unknown_profile['freq'].keys()) + list_unknown_profile = [] + list_profile_to_compare = [] + for letter in tokens: + list_profile_to_compare.append(profile_to_compare['freq'].get(letter, 0)) + list_unknown_profile.append(unknown_profile['freq'].get(letter, 0)) + return calculate_mse(list_profile_to_compare, list_unknown_profile) def detect_language( @@ -116,8 +106,7 @@ def detect_language( :param profile_2: a dictionary of a known profile :return: a language """ - if (not isinstance(unknown_profile, dict) or - not isinstance(profile_1, dict) or + if (not isinstance(unknown_profile, dict) or not isinstance(profile_1, dict) or not isinstance(profile_2, dict)): return None mse_profile_1 = compare_profiles(unknown_profile, profile_1) @@ -125,12 +114,10 @@ def detect_language( if (isinstance(mse_profile_1, float) and isinstance(mse_profile_2, float)): if mse_profile_1 < mse_profile_2: - return profile_1['name'] + return str(profile_1['name']) if mse_profile_2 < mse_profile_1: - return profile_2['name'] - str_name_language = sorted(profile_1['name'] + profile_2['name']) - first_name = str_name_language[0] - return first_name + return str(profile_2['name']) + return sorted([str(profile_1['name']), str(profile_2['name'])])[0] def load_profile(path_to_file: str) -> dict | None: From b9f756748cf49cba08f2f56fbfaa475336648a08 Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 3 Oct 2023 10:10:10 +0300 Subject: [PATCH 12/81] added fixes in start --- lab_1_classify_by_unigrams/start.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index e80b67f86..9e9930ece 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -24,4 +24,4 @@ def main() -> None: if __name__ == "__main__": - main() \ No newline at end of file + main() From 85ff8e27fa443e87c3eddc32a0ab4486f3fe911b Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 3 Oct 2023 10:16:29 +0300 Subject: [PATCH 13/81] added fixes --- lab_1_classify_by_unigrams/main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index eba79d11c..2fe7aa8a7 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -51,7 +51,10 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st """ if not isinstance(language, str) or not isinstance(text, str): return None - return {"name": language, "freq": calculate_frequencies(tokenize(text))} + values_freq = calculate_frequencies(tokenize(text)) + if not isinstance(values_freq, dict): + return None + return {'name': language, 'freq': values_freq} def calculate_mse(predicted: list, actual: list) -> float | None: From c0b809ffbb6b3c691fd6606f3962fbcfbaca99e1 Mon Sep 17 00:00:00 2001 From: mmarina2004 <134407899+mmarina2004@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:49:01 +0300 Subject: [PATCH 14/81] Delete requirements.txt delete requirements --- requirements.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index e69de29bb..000000000 From 9586a872610238ec24059cc8fd55703a613850b6 Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 3 Oct 2023 14:09:45 +0300 Subject: [PATCH 15/81] recovery --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index e69de29bb..8b1378917 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1 @@ + From 72e3aff37f134850292fec72f55d4cefa989a6ad Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 4 Oct 2023 12:09:40 +0300 Subject: [PATCH 16/81] added fixes for 10 --- lab_1_classify_by_unigrams/main.py | 44 ++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 2fe7aa8a7..8a997343a 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -3,6 +3,8 @@ Language detection """ +import json + def tokenize(text: str) -> list[str] | None: """ @@ -129,6 +131,13 @@ def load_profile(path_to_file: str) -> dict | None: :param path_to_file: a path to the language profile :return: a dictionary with at least two keys – name, freq """ + if not isinstance(path_to_file, str): + return None + with open(path_to_file, "r", encoding="utf-8") as json_file: + language_profile = json.load(json_file) + if not isinstance(language_profile, dict): + return None + return language_profile def preprocess_profile(profile: dict) -> dict[str, str | dict] | None: @@ -138,6 +147,19 @@ def preprocess_profile(profile: dict) -> dict[str, str | dict] | None: :return: a dict with a lower-cased loaded profile with relative frequencies without unnecessary ngrams """ + if (not isinstance(profile, dict) or 'name' not in profile + or 'freq' not in profile or 'n_words' not in profile): + return None + n_words = profile.pop('n_words') + new_freq = {} + for key, value in profile['freq'].items(): + if key.isalpha() and len(key) == 1: + if key.lower() not in new_freq: + new_freq[key.lower()] = value / n_words[0] + else: + new_freq[key.lower()] += value / n_words[0] + processed_profile = {'name': profile['name'], 'freq': new_freq} + return processed_profile def collect_profiles(paths_to_profiles: list) -> list[dict[str, str | dict[str, float]]] | None: @@ -146,6 +168,17 @@ def collect_profiles(paths_to_profiles: list) -> list[dict[str, str | dict[str, :paths_to_profiles: a list of strings to the profiles :return: a list of loaded profiles """ + if not isinstance(paths_to_profiles, list): + return None + list_processed_profiles = [] + for paths in paths_to_profiles: + if isinstance(paths, str): + language_profile = load_profile(paths) + if isinstance(language_profile, dict): + processed_profile = preprocess_profile(language_profile) + if isinstance(processed_profile, dict): + list_processed_profiles.append(processed_profile) + return list_processed_profiles def detect_language_advanced(unknown_profile: dict[str, str | dict[str, float]], @@ -156,6 +189,14 @@ def detect_language_advanced(unknown_profile: dict[str, str | dict[str, float]], :param known_profiles: a list of known profiles :return: a sorted list of tuples containing a language and a distance """ + if not isinstance(unknown_profile, dict) or not isinstance(known_profiles, list): + return None + list_mse = [] + for profile in known_profiles: + if isinstance(profile, dict): + list_mse.append((profile['name'], compare_profiles(unknown_profile, profile))) + list_mse.sort(key=lambda a: (a[1], a[0])) + return list_mse def print_report(detections: list[tuple[str, float]]) -> None: @@ -163,3 +204,6 @@ def print_report(detections: list[tuple[str, float]]) -> None: Prints report for detection of language :param detections: a list with distances for each available language """ + if isinstance(detections, list): + for profile in detections: + print(f'{profile[0]}: MSE {profile[1]:.5f}') From e90202725bb146e2d19bca918779c80a609772ba Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 4 Oct 2023 12:16:23 +0300 Subject: [PATCH 17/81] mark 10 --- lab_1_classify_by_unigrams/target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt index 45a4fb75d..f599e28b8 100644 --- a/lab_1_classify_by_unigrams/target_score.txt +++ b/lab_1_classify_by_unigrams/target_score.txt @@ -1 +1 @@ -8 +10 From 81a86ba58ddc6b50a25049f3be44277515784fbe Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 4 Oct 2023 22:52:06 +0300 Subject: [PATCH 18/81] added fixes for 10 --- lab_1_classify_by_unigrams/main.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 8a997343a..d6c888daf 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -150,15 +150,13 @@ def preprocess_profile(profile: dict) -> dict[str, str | dict] | None: if (not isinstance(profile, dict) or 'name' not in profile or 'freq' not in profile or 'n_words' not in profile): return None - n_words = profile.pop('n_words') new_freq = {} for key, value in profile['freq'].items(): - if key.isalpha() and len(key) == 1: - if key.lower() not in new_freq: - new_freq[key.lower()] = value / n_words[0] - else: - new_freq[key.lower()] += value / n_words[0] - processed_profile = {'name': profile['name'], 'freq': new_freq} + if key.lower() in new_freq: + new_freq[key.lower()] += value / profile["n_words"][0] + elif len(key) == 1: + new_freq[key.lower()] = value / profile["n_words"][0] + processed_profile = {'name': profile.get("name"), 'freq': new_freq} return processed_profile From ae7ffdf577693e9f968302e46fb1085c48ff3f66 Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 5 Oct 2023 12:03:37 +0300 Subject: [PATCH 19/81] added fixes --- lab_1_classify_by_unigrams/main.py | 40 ++++++++++++++---------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index d6c888daf..4733ea691 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -8,26 +8,22 @@ def tokenize(text: str) -> list[str] | None: """ - Splits a text into tokens, converts the tokens into lowercase, - removes punctuation, digits and other symbols - :param text: a text - :return: a list of lower-cased tokens without punctuation - """ + Splits a text into tokens, converts the tokens into lowercase, + removes punctuation, digits and other symbols + :param text: a text + :return: a list of lower-cased tokens without punctuation + """ if not isinstance(text, str): return None - cleaned_text = [] - for symbol in text: - if symbol.isalpha(): - cleaned_text.append(symbol.lower()) - return cleaned_text + return [symbol.lower() for symbol in text if symbol.isalpha()] def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ - Calculates frequencies of given tokens - :param tokens: a list of tokens - :return: a dictionary with frequencies - """ + Calculates frequencies of given tokens + :param tokens: a list of tokens + :return: a dictionary with frequencies + """ if not isinstance(tokens, list): return None for token in tokens: @@ -46,11 +42,11 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None: """ - Creates a language profile - :param language: a language - :param text: a text - :return: a dictionary with two keys – name, freq - """ + Creates a language profile + :param language: a language + :param text: a text + :return: a dictionary with two keys – name, freq + """ if not isinstance(language, str) or not isinstance(text, str): return None values_freq = calculate_frequencies(tokenize(text)) @@ -70,8 +66,10 @@ def calculate_mse(predicted: list, actual: list) -> float | None: len(actual) != len(predicted)): return None summ_values = 0 - for i, predicted_value in enumerate(predicted): - summ_values += (actual[i] - predicted_value)**2 + squared_difference = ([(actual_value - predicted_value)**2 + for actual_value, predicted_value in zip(actual, predicted)]) + for value in squared_difference: + summ_values += value mse = round(summ_values / len(actual), 4) return mse From b7516ad8d5235f5aa4c771552d84957157d12ccb Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 5 Oct 2023 12:09:31 +0300 Subject: [PATCH 20/81] added fixes --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 4733ea691..e2bcc051a 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -154,7 +154,7 @@ def preprocess_profile(profile: dict) -> dict[str, str | dict] | None: new_freq[key.lower()] += value / profile["n_words"][0] elif len(key) == 1: new_freq[key.lower()] = value / profile["n_words"][0] - processed_profile = {'name': profile.get("name"), 'freq': new_freq} + processed_profile = {'name': profile["name"], 'freq': new_freq} return processed_profile From 0fdf5bd8545611abb0304af5bbb3abab6084de25 Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 5 Oct 2023 12:48:41 +0300 Subject: [PATCH 21/81] added fixed --- lab_1_classify_by_unigrams/main.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index e2bcc051a..3762b16bf 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -31,10 +31,9 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: return None freqs = {} for token in tokens: - if token in freqs: - freqs[token] += 1 - else: - freqs[token] = 1 + if token not in freqs: + freqs[token] = 0 + freqs[token] += 1 for token, freq in freqs.items(): freqs[token] = freq / len(tokens) return freqs @@ -168,12 +167,11 @@ def collect_profiles(paths_to_profiles: list) -> list[dict[str, str | dict[str, return None list_processed_profiles = [] for paths in paths_to_profiles: - if isinstance(paths, str): - language_profile = load_profile(paths) - if isinstance(language_profile, dict): - processed_profile = preprocess_profile(language_profile) - if isinstance(processed_profile, dict): - list_processed_profiles.append(processed_profile) + language_profile = load_profile(paths) + if isinstance(language_profile, dict): + processed_profile = preprocess_profile(language_profile) + if isinstance(processed_profile, dict): + list_processed_profiles.append(processed_profile) return list_processed_profiles From fd4ce28ad2fb8abdb9a62426b7aee7f3dd9e6396 Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 5 Oct 2023 13:08:06 +0300 Subject: [PATCH 22/81] added fixed --- lab_1_classify_by_unigrams/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 3762b16bf..0fadc37ca 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -31,9 +31,10 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: return None freqs = {} for token in tokens: - if token not in freqs: - freqs[token] = 0 - freqs[token] += 1 + if token in freqs: + freqs[token] += 1 + else: + freqs[token] = 1 for token, freq in freqs.items(): freqs[token] = freq / len(tokens) return freqs From 3d3664d22acce3fd206148b247eb3e66b387cc65 Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 5 Oct 2023 13:16:07 +0300 Subject: [PATCH 23/81] added fixes --- lab_1_classify_by_unigrams/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 0fadc37ca..2f8fa9fe2 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -171,8 +171,8 @@ def collect_profiles(paths_to_profiles: list) -> list[dict[str, str | dict[str, language_profile = load_profile(paths) if isinstance(language_profile, dict): processed_profile = preprocess_profile(language_profile) - if isinstance(processed_profile, dict): - list_processed_profiles.append(processed_profile) + if isinstance(processed_profile, dict): + list_processed_profiles.append(processed_profile) return list_processed_profiles From ab3519f67384b2c07b3bbe863da8fa90dc4922c2 Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 5 Oct 2023 23:27:17 +0300 Subject: [PATCH 24/81] added fixes --- lab_1_classify_by_unigrams/main.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 2f8fa9fe2..7cb26a06d 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -2,7 +2,6 @@ Lab 1 Language detection """ - import json @@ -70,7 +69,7 @@ def calculate_mse(predicted: list, actual: list) -> float | None: for actual_value, predicted_value in zip(actual, predicted)]) for value in squared_difference: summ_values += value - mse = round(summ_values / len(actual), 4) + mse = summ_values / len(actual) return mse @@ -114,13 +113,14 @@ def detect_language( return None mse_profile_1 = compare_profiles(unknown_profile, profile_1) mse_profile_2 = compare_profiles(unknown_profile, profile_2) - if (isinstance(mse_profile_1, float) - and isinstance(mse_profile_2, float)): - if mse_profile_1 < mse_profile_2: - return str(profile_1['name']) - if mse_profile_2 < mse_profile_1: - return str(profile_2['name']) - return sorted([str(profile_1['name']), str(profile_2['name'])])[0] + if (not isinstance(mse_profile_1, float) + or not isinstance(mse_profile_2, float)): + return None + if mse_profile_1 < mse_profile_2: + return str(profile_1['name']) + if mse_profile_2 < mse_profile_1: + return str(profile_2['name']) + return sorted([profile_1['name'], profile_2['name']])[0] def load_profile(path_to_file: str) -> dict | None: From 0ce585b9b45e699eb9a7118ec37b7cb504b4140d Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 5 Oct 2023 23:32:17 +0300 Subject: [PATCH 25/81] added fixes --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 7cb26a06d..1680de454 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -120,7 +120,7 @@ def detect_language( return str(profile_1['name']) if mse_profile_2 < mse_profile_1: return str(profile_2['name']) - return sorted([profile_1['name'], profile_2['name']])[0] + return sorted([str(profile_1['name']), str(profile_2['name'])])[0] def load_profile(path_to_file: str) -> dict | None: From 516127cbc7060414ba11103903cdcdcebd04c3a5 Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 5 Oct 2023 23:33:27 +0300 Subject: [PATCH 26/81] start --- lab_1_classify_by_unigrams/start.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 9e9930ece..aee582de4 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -1,26 +1,27 @@ """ Language detection starter """ -from lab_1_classify_by_unigrams.main import create_language_profile, detect_language +from lab_1_classify_by_unigrams.main import (collect_profiles, create_language_profile, + detect_language_advanced, print_report) def main() -> None: """ Launches an implementation """ - with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: - en_text = file_to_read_en.read() - with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: - de_text = file_to_read_de.read() with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() - en_profile = create_language_profile('en', en_text) - de_profile = create_language_profile('de', de_text) unknown_profile = create_language_profile('unknown', unknown_text) - if (isinstance(en_profile, dict) and isinstance(de_profile, dict) and - isinstance(unknown_profile, dict)): - result = detect_language(unknown_profile, en_profile, de_profile) - assert result, "Detection result is None" + language_profiles = ['assets/profiles/es.json', 'assets/profiles/de.json', + 'assets/profiles/en.json', 'assets/profiles/fr.json', + 'assets/profiles/it.json', 'assets/profiles/ru.json', + 'assets/profiles/tr.json'] + profiles = collect_profiles(language_profiles) + if isinstance(unknown_profile, dict) and isinstance(profiles, list): + result = detect_language_advanced(unknown_profile, profiles) + if isinstance(result, list): + print_report(result) + assert result, "Detection result is None" if __name__ == "__main__": From 9de3541d65dcc654e0d5305f6f8f6aa888be2eb6 Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Wed, 11 Oct 2023 11:02:56 +0300 Subject: [PATCH 27/81] checkout labs from the origin repository --- lab_1_classify_by_unigrams/start.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 3531c7385..4a17442d0 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -1,8 +1,6 @@ """ Language detection starter """ -from lab_1_classify_by_unigrams.main import (collect_profiles, create_language_profile, - detect_language_advanced, print_report) from lab_1_classify_by_unigrams.main import (collect_profiles, create_language_profile, detect_language_advanced, print_report) @@ -12,6 +10,10 @@ def main() -> None: """ Launches an implementation """ + with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: + en_text = file_to_read_en.read() + with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: + de_text = file_to_read_de.read() with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() From 813de00b8ed583d6faab8549c3f12c6ee19e9e0a Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 12 Oct 2023 20:56:22 +0300 Subject: [PATCH 28/81] lab2 --- lab_1_classify_by_unigrams/main.py | 204 ----------------------------- 1 file changed, 204 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 1680de454..e69de29bb 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -1,204 +0,0 @@ -""" -Lab 1 -Language detection -""" -import json - - -def tokenize(text: str) -> list[str] | None: - """ - Splits a text into tokens, converts the tokens into lowercase, - removes punctuation, digits and other symbols - :param text: a text - :return: a list of lower-cased tokens without punctuation - """ - if not isinstance(text, str): - return None - return [symbol.lower() for symbol in text if symbol.isalpha()] - - -def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: - """ - Calculates frequencies of given tokens - :param tokens: a list of tokens - :return: a dictionary with frequencies - """ - if not isinstance(tokens, list): - return None - for token in tokens: - if not isinstance(token, str): - return None - freqs = {} - for token in tokens: - if token in freqs: - freqs[token] += 1 - else: - freqs[token] = 1 - for token, freq in freqs.items(): - freqs[token] = freq / len(tokens) - return freqs - - -def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None: - """ - Creates a language profile - :param language: a language - :param text: a text - :return: a dictionary with two keys – name, freq - """ - if not isinstance(language, str) or not isinstance(text, str): - return None - values_freq = calculate_frequencies(tokenize(text)) - if not isinstance(values_freq, dict): - return None - return {'name': language, 'freq': values_freq} - - -def calculate_mse(predicted: list, actual: list) -> float | None: - """ - Calculates mean squared error between predicted and actual values - :param predicted: a list of predicted values - :param actual: a list of actual values - :return: the score - """ - if (not isinstance(actual, list) or not isinstance(predicted, list) or - len(actual) != len(predicted)): - return None - summ_values = 0 - squared_difference = ([(actual_value - predicted_value)**2 - for actual_value, predicted_value in zip(actual, predicted)]) - for value in squared_difference: - summ_values += value - mse = summ_values / len(actual) - return mse - - -def compare_profiles( - unknown_profile: dict[str, str | dict[str, float]], - profile_to_compare: dict[str, str | dict[str, float]] -) -> float | None: - """ - Compares profiles and calculates the distance using symbols - :param unknown_profile: a dictionary of an unknown profile - :param profile_to_compare: a dictionary of a profile to compare the unknown profile to - :return: the distance between the profiles - """ - if (not isinstance(unknown_profile, dict) or not isinstance(profile_to_compare, dict) or - 'name' not in unknown_profile or 'name' not in profile_to_compare): - return None - tokens = set(profile_to_compare['freq'].keys()) - tokens.update(unknown_profile['freq'].keys()) - list_unknown_profile = [] - list_profile_to_compare = [] - for letter in tokens: - list_profile_to_compare.append(profile_to_compare['freq'].get(letter, 0)) - list_unknown_profile.append(unknown_profile['freq'].get(letter, 0)) - return calculate_mse(list_profile_to_compare, list_unknown_profile) - - -def detect_language( - unknown_profile: dict[str, str | dict[str, float]], - profile_1: dict[str, str | dict[str, float]], - profile_2: dict[str, str | dict[str, float]], -) -> str | None: - """ - Detects the language of an unknown profile - :param unknown_profile: a dictionary of a profile to determine the language of - :param profile_1: a dictionary of a known profile - :param profile_2: a dictionary of a known profile - :return: a language - """ - if (not isinstance(unknown_profile, dict) or not isinstance(profile_1, dict) or - not isinstance(profile_2, dict)): - return None - mse_profile_1 = compare_profiles(unknown_profile, profile_1) - mse_profile_2 = compare_profiles(unknown_profile, profile_2) - if (not isinstance(mse_profile_1, float) - or not isinstance(mse_profile_2, float)): - return None - if mse_profile_1 < mse_profile_2: - return str(profile_1['name']) - if mse_profile_2 < mse_profile_1: - return str(profile_2['name']) - return sorted([str(profile_1['name']), str(profile_2['name'])])[0] - - -def load_profile(path_to_file: str) -> dict | None: - """ - Loads a language profile - :param path_to_file: a path to the language profile - :return: a dictionary with at least two keys – name, freq - """ - if not isinstance(path_to_file, str): - return None - with open(path_to_file, "r", encoding="utf-8") as json_file: - language_profile = json.load(json_file) - if not isinstance(language_profile, dict): - return None - return language_profile - - -def preprocess_profile(profile: dict) -> dict[str, str | dict] | None: - """ - Preprocesses profile for a loaded language - :param profile: a loaded profile - :return: a dict with a lower-cased loaded profile - with relative frequencies without unnecessary ngrams - """ - if (not isinstance(profile, dict) or 'name' not in profile - or 'freq' not in profile or 'n_words' not in profile): - return None - new_freq = {} - for key, value in profile['freq'].items(): - if key.lower() in new_freq: - new_freq[key.lower()] += value / profile["n_words"][0] - elif len(key) == 1: - new_freq[key.lower()] = value / profile["n_words"][0] - processed_profile = {'name': profile["name"], 'freq': new_freq} - return processed_profile - - -def collect_profiles(paths_to_profiles: list) -> list[dict[str, str | dict[str, float]]] | None: - """ - Collects profiles for a given path - :paths_to_profiles: a list of strings to the profiles - :return: a list of loaded profiles - """ - if not isinstance(paths_to_profiles, list): - return None - list_processed_profiles = [] - for paths in paths_to_profiles: - language_profile = load_profile(paths) - if isinstance(language_profile, dict): - processed_profile = preprocess_profile(language_profile) - if isinstance(processed_profile, dict): - list_processed_profiles.append(processed_profile) - return list_processed_profiles - - -def detect_language_advanced(unknown_profile: dict[str, str | dict[str, float]], - known_profiles: list) -> list | None: - """ - Detects the language of an unknown profile - :param unknown_profile: a dictionary of a profile to determine the language of - :param known_profiles: a list of known profiles - :return: a sorted list of tuples containing a language and a distance - """ - if not isinstance(unknown_profile, dict) or not isinstance(known_profiles, list): - return None - list_mse = [] - for profile in known_profiles: - if isinstance(profile, dict): - list_mse.append((profile['name'], compare_profiles(unknown_profile, profile))) - list_mse.sort(key=lambda a: (a[1], a[0])) - return list_mse - - -def print_report(detections: list[tuple[str, float]]) -> None: - """ - Prints report for detection of language - :param detections: a list with distances for each available language - """ - if isinstance(detections, list): - for profile in detections: - print(f'{profile[0]}: MSE {profile[1]:.5f}') From 3af2ea913e70eb7af396e01ffedd9493a2bda220 Mon Sep 17 00:00:00 2001 From: mmarina Date: Fri, 13 Oct 2023 09:31:29 +0300 Subject: [PATCH 29/81] i start do lab --- lab_1_classify_by_unigrams/main.py | 204 +++++++++++++++++++++++++++++ requirements_qa.txt | 10 +- seminars/practice_3_lists.py | 60 +++++++-- 3 files changed, 256 insertions(+), 18 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index e69de29bb..1680de454 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -0,0 +1,204 @@ +""" +Lab 1 +Language detection +""" +import json + + +def tokenize(text: str) -> list[str] | None: + """ + Splits a text into tokens, converts the tokens into lowercase, + removes punctuation, digits and other symbols + :param text: a text + :return: a list of lower-cased tokens without punctuation + """ + if not isinstance(text, str): + return None + return [symbol.lower() for symbol in text if symbol.isalpha()] + + +def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: + """ + Calculates frequencies of given tokens + :param tokens: a list of tokens + :return: a dictionary with frequencies + """ + if not isinstance(tokens, list): + return None + for token in tokens: + if not isinstance(token, str): + return None + freqs = {} + for token in tokens: + if token in freqs: + freqs[token] += 1 + else: + freqs[token] = 1 + for token, freq in freqs.items(): + freqs[token] = freq / len(tokens) + return freqs + + +def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None: + """ + Creates a language profile + :param language: a language + :param text: a text + :return: a dictionary with two keys – name, freq + """ + if not isinstance(language, str) or not isinstance(text, str): + return None + values_freq = calculate_frequencies(tokenize(text)) + if not isinstance(values_freq, dict): + return None + return {'name': language, 'freq': values_freq} + + +def calculate_mse(predicted: list, actual: list) -> float | None: + """ + Calculates mean squared error between predicted and actual values + :param predicted: a list of predicted values + :param actual: a list of actual values + :return: the score + """ + if (not isinstance(actual, list) or not isinstance(predicted, list) or + len(actual) != len(predicted)): + return None + summ_values = 0 + squared_difference = ([(actual_value - predicted_value)**2 + for actual_value, predicted_value in zip(actual, predicted)]) + for value in squared_difference: + summ_values += value + mse = summ_values / len(actual) + return mse + + +def compare_profiles( + unknown_profile: dict[str, str | dict[str, float]], + profile_to_compare: dict[str, str | dict[str, float]] +) -> float | None: + """ + Compares profiles and calculates the distance using symbols + :param unknown_profile: a dictionary of an unknown profile + :param profile_to_compare: a dictionary of a profile to compare the unknown profile to + :return: the distance between the profiles + """ + if (not isinstance(unknown_profile, dict) or not isinstance(profile_to_compare, dict) or + 'name' not in unknown_profile or 'name' not in profile_to_compare): + return None + tokens = set(profile_to_compare['freq'].keys()) + tokens.update(unknown_profile['freq'].keys()) + list_unknown_profile = [] + list_profile_to_compare = [] + for letter in tokens: + list_profile_to_compare.append(profile_to_compare['freq'].get(letter, 0)) + list_unknown_profile.append(unknown_profile['freq'].get(letter, 0)) + return calculate_mse(list_profile_to_compare, list_unknown_profile) + + +def detect_language( + unknown_profile: dict[str, str | dict[str, float]], + profile_1: dict[str, str | dict[str, float]], + profile_2: dict[str, str | dict[str, float]], +) -> str | None: + """ + Detects the language of an unknown profile + :param unknown_profile: a dictionary of a profile to determine the language of + :param profile_1: a dictionary of a known profile + :param profile_2: a dictionary of a known profile + :return: a language + """ + if (not isinstance(unknown_profile, dict) or not isinstance(profile_1, dict) or + not isinstance(profile_2, dict)): + return None + mse_profile_1 = compare_profiles(unknown_profile, profile_1) + mse_profile_2 = compare_profiles(unknown_profile, profile_2) + if (not isinstance(mse_profile_1, float) + or not isinstance(mse_profile_2, float)): + return None + if mse_profile_1 < mse_profile_2: + return str(profile_1['name']) + if mse_profile_2 < mse_profile_1: + return str(profile_2['name']) + return sorted([str(profile_1['name']), str(profile_2['name'])])[0] + + +def load_profile(path_to_file: str) -> dict | None: + """ + Loads a language profile + :param path_to_file: a path to the language profile + :return: a dictionary with at least two keys – name, freq + """ + if not isinstance(path_to_file, str): + return None + with open(path_to_file, "r", encoding="utf-8") as json_file: + language_profile = json.load(json_file) + if not isinstance(language_profile, dict): + return None + return language_profile + + +def preprocess_profile(profile: dict) -> dict[str, str | dict] | None: + """ + Preprocesses profile for a loaded language + :param profile: a loaded profile + :return: a dict with a lower-cased loaded profile + with relative frequencies without unnecessary ngrams + """ + if (not isinstance(profile, dict) or 'name' not in profile + or 'freq' not in profile or 'n_words' not in profile): + return None + new_freq = {} + for key, value in profile['freq'].items(): + if key.lower() in new_freq: + new_freq[key.lower()] += value / profile["n_words"][0] + elif len(key) == 1: + new_freq[key.lower()] = value / profile["n_words"][0] + processed_profile = {'name': profile["name"], 'freq': new_freq} + return processed_profile + + +def collect_profiles(paths_to_profiles: list) -> list[dict[str, str | dict[str, float]]] | None: + """ + Collects profiles for a given path + :paths_to_profiles: a list of strings to the profiles + :return: a list of loaded profiles + """ + if not isinstance(paths_to_profiles, list): + return None + list_processed_profiles = [] + for paths in paths_to_profiles: + language_profile = load_profile(paths) + if isinstance(language_profile, dict): + processed_profile = preprocess_profile(language_profile) + if isinstance(processed_profile, dict): + list_processed_profiles.append(processed_profile) + return list_processed_profiles + + +def detect_language_advanced(unknown_profile: dict[str, str | dict[str, float]], + known_profiles: list) -> list | None: + """ + Detects the language of an unknown profile + :param unknown_profile: a dictionary of a profile to determine the language of + :param known_profiles: a list of known profiles + :return: a sorted list of tuples containing a language and a distance + """ + if not isinstance(unknown_profile, dict) or not isinstance(known_profiles, list): + return None + list_mse = [] + for profile in known_profiles: + if isinstance(profile, dict): + list_mse.append((profile['name'], compare_profiles(unknown_profile, profile))) + list_mse.sort(key=lambda a: (a[1], a[0])) + return list_mse + + +def print_report(detections: list[tuple[str, float]]) -> None: + """ + Prints report for detection of language + :param detections: a list with distances for each available language + """ + if isinstance(detections, list): + for profile in detections: + print(f'{profile[0]}: MSE {profile[1]:.5f}') diff --git a/requirements_qa.txt b/requirements_qa.txt index 2c7efb1e5..b165f0d32 100644 --- a/requirements_qa.txt +++ b/requirements_qa.txt @@ -1,16 +1,16 @@ ast-comments==1.0.1 black==22.6.0 coverage[toml]==6.4.4 -ghapi==0.1.19 -flake8==6.0.0 flake8-isort==6.0.0 +flake8==6.0.0 +ghapi==0.1.19 mypy==1.1.1 -pymarkdownlnt==0.9.9 -pymdown-extensions==9.5 pydantic==1.10.7 pylint==2.15.10 +pymarkdownlnt==0.9.9 +pymdown-extensions==9.5 pyspelling==2.7.3 pytest==6.2.5 regex==2023.3.23 -typed-argument-parser==1.8.1 tqdm==4.64.1 +typed-argument-parser==1.8.1 \ No newline at end of file diff --git a/seminars/practice_3_lists.py b/seminars/practice_3_lists.py index 915cdb41f..600b9f265 100644 --- a/seminars/practice_3_lists.py +++ b/seminars/practice_3_lists.py @@ -58,11 +58,16 @@ def count_evens(nums: list) -> int: """ Return the number of even ints in the given array. """ - # student realization goes here + n = 0 + for element in nums: + if element % 2 == 0: + n += 1 + return n + # Function calls with expected result: -# count_evens([2, 1, 2, 3, 4]) → 3 +count_evens([2, 1, 2, 3, 4]) # count_evens([2, 2, 0]) → 3 # count_evens([1, 3, 5]) → 0 @@ -75,12 +80,16 @@ def sum13(nums: list) -> int: so it does not count and numbers that come after a 13 also do not count. """ - # student realization goes here + summ = 0 + for element in nums: + if element != 13: + summ += element + print(summ) # Function calls with expected result: # sum13([1, 2, 2, 1]) → 6 # sum13([1, 1]) → 2 -# sum13([1, 2, 2, 1, 13]) → 6 +sum13([1, 2, 2, 1, 13]) # sum13([1, 2, 2, 1, 13, 5, 6]) → 6 @@ -93,11 +102,11 @@ def sum67(nums: list) -> int: (every 6 will be followed by at least one 7). Return 0 for no numbers. """ - # student realization goes here + # Function calls with expected result: -# sum67([1, 2, 2]) → 5 -# sum67([1, 2, 2, 6, 99, 99, 7]) → 5 +sum67([1, 2, 2]) +print(sum67([1, 2, 2, 6, 99, 99, 7])) # sum67([1, 1, 6, 7, 2]) → 4 @@ -108,10 +117,13 @@ def create_phone_number(nums: list) -> str: Write a function that accepts an array of 10 integers (between 0 and 9), that returns a string of those numbers in the form of a phone number. """ - # student realization goes here + number = ''.join(str(a) for a in nums) + phone = f'({number[:3]} {number[3:6]}-{number[6:]})' + return phone + # Function calls with expected result: -# create_phone_number([1, 2, 3, 4, 5, 6, 7, 8, 9, 0]) +print(create_phone_number([1, 2, 3, 4, 5, 6, 7, 8, 9, 0])) # => returns "(123) 456-7890" @@ -129,10 +141,22 @@ def check_exam(correct_answers: list, student_answers: list) -> int: and +0 for each blank answer, represented as an empty string. If the score < 0, return 0. """ - # student realization goes here + score = 0 + for i, answer in enumerate(student_answers): + if answer == correct_answers[i]: + score += 4 + elif answer != correct_answers[i]: + score -= 1 + elif answer == ' ': + score += 0 + if score < 0: + return 0 + else: + return score + # Function calls with expected result: -# check_exam(["a", "a", "b", "b"], ["a", "c", "b", "d"]) → 6 +print(check_exam(["a", "a", "b", "b"], ["a", "c", "b", "d"])) # check_exam(["a", "a", "c", "b"], ["a", "a", "b", ""]) → 7 # check_exam(["a", "a", "b", "c"], ["a", "a", "b", "c"]) → 16 # check_exam(["b", "c", "b", "a"], ["", "a", "a", "c"]) → 0 @@ -146,14 +170,24 @@ def who_likes_it(names: list) -> str: People can "like" blog posts, pictures or other items. We want to create the text that should be displayed next to such an item. """ - # student realization goes here + if names == []: + return "no one likes this" + if len(names) == 1: + return f'{names[0]} likes this' + elif len(names) == 2: + return f'{names[0]} and {names[1]} like this' + elif len(names) == 3: + return f'{names[0]}, {names[1]} and {names[2]} like this' + else: + return f'{names[0]}, {names[1]} and {len(names) - 2} others like this' + # Function calls with expected result: # [] --> "no one likes this" # ["Peter"] --> "Peter likes this" # ["Jacob", "Alex"] --> "Jacob and Alex like this" # ["Max", "John", "Mark"] --> "Max, John and Mark like this" -# ["Alex", "Jacob", "Mark", "Max"] --> "Alex, Jacob and 2 others like this" +print(who_likes_it(["Alex", "Jacob", "Mark", "Max"])) # Task 7 From 5186fbc6525ea4ac668b6140e21f72865f08b76d Mon Sep 17 00:00:00 2001 From: mmarina Date: Fri, 20 Oct 2023 12:52:33 +0300 Subject: [PATCH 30/81] 1 func --- lab_2_tokenize_by_bpe/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 620a4d645..1b9558482 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -14,6 +14,8 @@ def prepare_word( :param end_of_word: a token that signifies the end of word :return: preprocessed word """ + if not isinstance(raw_word, str) or not isinstance(start_of_word, str or None) or not isinstance(end_of_word, str or None): + return None def collect_frequencies( From 3f6df3fd9130cd75122a30465b670cd2d5ad4b6c Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 25 Oct 2023 22:57:11 +0300 Subject: [PATCH 31/81] change for 6 --- lab_2_tokenize_by_bpe/main.py | 59 +++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 1b9558482..c18626eab 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -14,8 +14,14 @@ def prepare_word( :param end_of_word: a token that signifies the end of word :return: preprocessed word """ - if not isinstance(raw_word, str) or not isinstance(start_of_word, str or None) or not isinstance(end_of_word, str or None): + if not isinstance(raw_word, str) or not (isinstance( + start_of_word, str) or start_of_word is None) or not ( + isinstance(end_of_word, str) or end_of_word is None): return None + tokenized_word = [start_of_word if start_of_word is not None else []] + tokenized_word.extend(element for element in raw_word) + tokenized_word.extend(end_of_word if end_of_word is not None else []) + return tuple(tokenized_word) def collect_frequencies( @@ -28,6 +34,16 @@ def collect_frequencies( :param end_of_word: a token that signifies the end of word :return: dictionary in the form of """ + if not isinstance(text, str) or not isinstance(end_of_word, str) or not ( + isinstance(start_of_word, str) or start_of_word is None): + return None + frequencies_dict = {} + for word in text.split(): + tokenized_word = prepare_word(word, start_of_word if start_of_word is not None else [], end_of_word) + if tokenized_word is None: + return None + frequencies_dict[tokenized_word] = frequencies_dict.get(tokenized_word, 0) + 1 + return frequencies_dict def count_tokens_pairs( @@ -38,6 +54,14 @@ def count_tokens_pairs( :param word_frequencies: dictionary in the form of :return: dictionary in the form of """ + if not isinstance(word_frequencies, dict): + return None + pairs_of_tokens = {} + for tokens, count in word_frequencies.items(): + for i in range(len(tokens) - 1): + pair = (tokens[i], tokens[i + 1]) + pairs_of_tokens[pair] = pairs_of_tokens.get(pair, 0) + 1 + return pairs_of_tokens def merge_tokens( @@ -49,6 +73,19 @@ def merge_tokens( :param pair: a pair of tokens to be merged :return: dictionary in the form of """ + if not (isinstance(word_frequencies, dict) + and isinstance(pair, tuple)): + return None + merged_frequencies = {} + for preprocessed_word, count in word_frequencies.items(): + if ''.join(pair) in ''.join(preprocessed_word): + preprocessed_word = list(preprocessed_word) + index = preprocessed_word.index(pair[0]) + preprocessed_word[index] = pair[0] + pair[1] + preprocessed_word.pop(index + 1) + preprocessed_word = tuple(preprocessed_word) + merged_frequencies[preprocessed_word] = count + return merged_frequencies def train( @@ -60,8 +97,24 @@ def train( :param num_merges: required number of new tokens :return: dictionary in the form of """ - - + if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int): + return None + while num_merges > 0: + pairs_of_tokens = count_tokens_pairs(word_frequencies) + if pairs_of_tokens is None: + return None + if num_merges > len(pairs_of_tokens): + num_merges = len(pairs_of_tokens) + sorted_pairs = ([token_pair for token_pair, frequency in pairs_of_tokens.items() if frequency == + max(pairs_of_tokens.values())]) + sorted_pairs.sort(key=lambda x: (-len(x), x)) + word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0]) + if word_frequencies is None: + return None + num_merges -= 1 + return word_frequencies + +print(train({('a', 'b'): 3, ('b', 'cd'): 3, ('b', 'ca'): 3}, 2)) def get_vocabulary( word_frequencies: dict[tuple[str, ...], int], unknown_token: str ) -> dict[str, int] | None: From 182aa44d1960f76124d124d58a7b16c7b41849ac Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 25 Oct 2023 22:59:01 +0300 Subject: [PATCH 32/81] score --- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index 573541ac9..1e8b31496 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -0 +6 From 07eb01adba6ad3643f7355562feeb6bcdee160f0 Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 25 Oct 2023 23:10:22 +0300 Subject: [PATCH 33/81] added fixes --- lab_2_tokenize_by_bpe/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index c18626eab..f4229b201 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -58,8 +58,8 @@ def count_tokens_pairs( return None pairs_of_tokens = {} for tokens, count in word_frequencies.items(): - for i in range(len(tokens) - 1): - pair = (tokens[i], tokens[i + 1]) + for index in range(len(tokens) - 1): + pair = (tokens[index], tokens[index + 1]) pairs_of_tokens[pair] = pairs_of_tokens.get(pair, 0) + 1 return pairs_of_tokens @@ -114,7 +114,7 @@ def train( num_merges -= 1 return word_frequencies -print(train({('a', 'b'): 3, ('b', 'cd'): 3, ('b', 'ca'): 3}, 2)) + def get_vocabulary( word_frequencies: dict[tuple[str, ...], int], unknown_token: str ) -> dict[str, int] | None: From bb6b622e00c178e5819839f5a0a0fbf7edd04329 Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 26 Oct 2023 13:54:22 +0300 Subject: [PATCH 34/81] change for 8 --- lab_2_tokenize_by_bpe/main.py | 52 ++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index f4229b201..29b138efc 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -18,9 +18,12 @@ def prepare_word( start_of_word, str) or start_of_word is None) or not ( isinstance(end_of_word, str) or end_of_word is None): return None - tokenized_word = [start_of_word if start_of_word is not None else []] + tokenized_word = [] + if start_of_word is not None: + tokenized_word.append(start_of_word) tokenized_word.extend(element for element in raw_word) - tokenized_word.extend(end_of_word if end_of_word is not None else []) + if end_of_word is not None: + tokenized_word.append(end_of_word) return tuple(tokenized_word) @@ -39,10 +42,13 @@ def collect_frequencies( return None frequencies_dict = {} for word in text.split(): - tokenized_word = prepare_word(word, start_of_word if start_of_word is not None else [], end_of_word) - if tokenized_word is None: - return None - frequencies_dict[tokenized_word] = frequencies_dict.get(tokenized_word, 0) + 1 + if start_of_word is not None: + tokenized_word = prepare_word(word, start_of_word, end_of_word) + if start_of_word is None: + tokenized_word = prepare_word(word, None, end_of_word) + if tokenized_word is None: + return None + frequencies_dict[tokenized_word] = frequencies_dict.get(tokenized_word, 0) + 1 return frequencies_dict @@ -57,10 +63,10 @@ def count_tokens_pairs( if not isinstance(word_frequencies, dict): return None pairs_of_tokens = {} - for tokens, count in word_frequencies.items(): + for tokens in word_frequencies: for index in range(len(tokens) - 1): pair = (tokens[index], tokens[index + 1]) - pairs_of_tokens[pair] = pairs_of_tokens.get(pair, 0) + 1 + pairs_of_tokens[pair] = pairs_of_tokens.get(pair, 0) + word_frequencies[tokens] return pairs_of_tokens @@ -105,8 +111,8 @@ def train( return None if num_merges > len(pairs_of_tokens): num_merges = len(pairs_of_tokens) - sorted_pairs = ([token_pair for token_pair, frequency in pairs_of_tokens.items() if frequency == - max(pairs_of_tokens.values())]) + sorted_pairs = ([token_pair for token_pair, frequency in pairs_of_tokens.items() + if frequency == max(pairs_of_tokens.values())]) sorted_pairs.sort(key=lambda x: (-len(x), x)) word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0]) if word_frequencies is None: @@ -124,6 +130,20 @@ def get_vocabulary( :param unknown_token: a token to signify an unknown token :return: dictionary in the form of """ + if not isinstance(word_frequencies, dict) or not isinstance(unknown_token, str): + return None + tokens_list = set() + dict_token_identifier = {} + for tuples in word_frequencies: + for token in tuples: + tokens_list.add(token) + for element in token: + tokens_list.add(element) + tokens_list.add(unknown_token) + sorted_tokens = sorted(tokens_list, key=lambda x: (-len(x), x)) + for index, token in enumerate(sorted_tokens): + dict_token_identifier[token] = index + return dict_token_identifier def decode( @@ -136,6 +156,18 @@ def decode( :param end_of_word_token: an end-of-word token :return: decoded sequence """ + if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict) or not (isinstance( + end_of_word_token, str) or end_of_word_token is None): + return None + decoded_tokens = [] + for index in encoded_text: + for token, token_index in vocabulary.items(): + if token_index == index and end_of_word_token is not None: + decoded_tokens.append(' ' if token == end_of_word_token else token) + if vocabulary[token] == index and end_of_word_token is None: + decoded_tokens.append('' if token == end_of_word_token else token) + decoded_text = ''.join(decoded_tokens) + return decoded_text def tokenize_word( From 146a4e2e26ae2a8b61d418777f65123517a1e2f6 Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 26 Oct 2023 13:56:53 +0300 Subject: [PATCH 35/81] score 8 --- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index 1e8b31496..45a4fb75d 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -6 +8 From 4e5bc343e4d1f3a8fc96de923e7ae105b5ca4a7e Mon Sep 17 00:00:00 2001 From: mmarina Date: Sat, 28 Oct 2023 23:32:31 +0300 Subject: [PATCH 36/81] change for 10 --- lab_2_tokenize_by_bpe/main.py | 112 ++++++++++++++++++++++++++++++---- 1 file changed, 101 insertions(+), 11 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 29b138efc..18172301d 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -2,6 +2,7 @@ Lab 2 BPE and machine translation evaluation """ +import json def prepare_word( @@ -85,12 +86,16 @@ def merge_tokens( merged_frequencies = {} for preprocessed_word, count in word_frequencies.items(): if ''.join(pair) in ''.join(preprocessed_word): - preprocessed_word = list(preprocessed_word) - index = preprocessed_word.index(pair[0]) - preprocessed_word[index] = pair[0] + pair[1] - preprocessed_word.pop(index + 1) - preprocessed_word = tuple(preprocessed_word) - merged_frequencies[preprocessed_word] = count + list_word = list(preprocessed_word) + for index in range(len(list_word) - 1): + if (list_word[index], list_word[index + 1]) == pair: + list_word[index + 1] = pair[0] + pair[1] + list_word[index] = '' + if '' in list_word: + list_word.remove('') + merged_frequencies[tuple(list_word)] = count + else: + merged_frequencies[preprocessed_word] = count return merged_frequencies @@ -107,15 +112,15 @@ def train( return None while num_merges > 0: pairs_of_tokens = count_tokens_pairs(word_frequencies) - if pairs_of_tokens is None: + if not pairs_of_tokens: return None if num_merges > len(pairs_of_tokens): num_merges = len(pairs_of_tokens) - sorted_pairs = ([token_pair for token_pair, frequency in pairs_of_tokens.items() - if frequency == max(pairs_of_tokens.values())]) - sorted_pairs.sort(key=lambda x: (-len(x), x)) + pairs_max_values = ([token_pair for token_pair, frequency in pairs_of_tokens.items() if + frequency == max(pairs_of_tokens.values())]) + sorted_pairs = sorted([pair for pair in pairs_max_values], key=lambda pair: (-len(str(pair)), pair)) word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0]) - if word_frequencies is None: + if not word_frequencies: return None num_merges -= 1 return word_frequencies @@ -181,6 +186,26 @@ def tokenize_word( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ + if (not isinstance(word, tuple) or not all(isinstance(w, str) for w in word) + or not isinstance(vocabulary, dict) or not isinstance( + end_of_word, (str, type(None))) or not isinstance(unknown_token, str)): + return None + tokens_identifiers = [] + i = 0 + while i < len(word): + max_length_token = '' + for j in range(len(word), i, -1): + current_token = "".join(word[i:j]) + if current_token in vocabulary and len(current_token) > len(max_length_token): + max_length_token = current_token + if max_length_token: + tokens_identifiers.append(vocabulary[max_length_token]) + i += len(max_length_token) + else: + if unknown_token in vocabulary: + tokens_identifiers.append(vocabulary[unknown_token]) + i += 1 + return tokens_identifiers def load_vocabulary(vocab_path: str) -> dict[str, int] | None: @@ -189,6 +214,13 @@ def load_vocabulary(vocab_path: str) -> dict[str, int] | None: :param vocab_path: path to the saved vocabulary :return: dictionary in the form of """ + if not isinstance(vocab_path, str): + return None + with open(vocab_path, 'r', encoding='utf-8') as f: + vocabulary = json.load(f) + if not isinstance(vocabulary, dict): + return None + return vocabulary def encode( @@ -207,6 +239,20 @@ def encode( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ + if not isinstance(original_text, str) or not isinstance(vocabulary, dict) or not isinstance( + unknown_token, str): + return None + list_token_identifiers = [] + text = original_text.split() + for word in text: + prepared_word = prepare_word(word, start_of_word_token, end_of_word_token) + if not prepared_word: + return None + tokens_id = tokenize_word(prepared_word, vocabulary, end_of_word_token, unknown_token) + if not tokens_id: + return None + list_token_identifiers.extend(tokens_id) + return list_token_identifiers def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: @@ -216,6 +262,12 @@ def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: :param order: required number of elements in a single n-gram :return: sequence of n-grams """ + if not isinstance(text, str) or not isinstance(order, int): + return None + sequence_ngrams = [] + for index in range(len(text) + 1 - order): + sequence_ngrams.append(tuple(text[index:order+index])) + return sequence_ngrams def calculate_precision( @@ -227,6 +279,14 @@ def calculate_precision( :param reference: expected sequence of n-grams :return: value of Precision metric """ + if not isinstance(actual, list) or not isinstance(reference, list): + return None + if len(actual) == 0: + return 0.0 + unique_reference = set(reference) + identical_tokens = [token for token in unique_reference if token in actual] + precision = len(identical_tokens) / len(unique_reference) + return precision def geo_mean(precisions: list[float], max_order: int) -> float | None: @@ -236,6 +296,15 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None: :param max_order: maximum length of n-gram considered :return: value of geometric mean of Precision metric """ + if not isinstance(precisions, list) or not isinstance(max_order, int): + return None + if not precisions or max_order <= 0: + return None + all_precision = 1.0 + for precision in precisions: + all_precision *= precision + geometric_mean = all_precision**(1.0 / max_order) + return geometric_mean def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None: @@ -246,3 +315,24 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl :param max_order: max length of n-gram to consider for comparison :return: value of BLEU metric """ + if not isinstance(actual, str) or not isinstance(reference, str) or not isinstance(max_order, int): + return None + all_ngrams_actual = [] + all_ngrams_reference = [] + for order in range(max_order): + ngrams_actual = collect_ngrams(actual, order + 1) + ngrams_reference = collect_ngrams(reference, order + 1) + if not ngrams_actual or not ngrams_reference: + return None + all_ngrams_actual.append(ngrams_actual) + all_ngrams_reference.append(ngrams_reference) + precisions = [] + for ngrams_actual, ngrams_reference in zip(all_ngrams_actual, all_ngrams_reference): + presision = calculate_precision(ngrams_actual, ngrams_reference) + if not presision: + return None + precisions.append(presision) + blue_metric = geo_mean(precisions, max_order) + if blue_metric is None: + return None + return blue_metric * 100 From a9de02a2a4fd1938d942cb42cf0f575249ca5821 Mon Sep 17 00:00:00 2001 From: mmarina Date: Sat, 28 Oct 2023 23:34:38 +0300 Subject: [PATCH 37/81] score 10 --- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index 45a4fb75d..f599e28b8 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -8 +10 From f6ff2bb4ef757823ab2711acf2d90c20cbe97ce8 Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 1 Nov 2023 20:13:34 +0300 Subject: [PATCH 38/81] revert practice --- seminars/practice_3_lists.py | 62 ++++++++---------------------------- 1 file changed, 14 insertions(+), 48 deletions(-) diff --git a/seminars/practice_3_lists.py b/seminars/practice_3_lists.py index 600b9f265..7301764b1 100644 --- a/seminars/practice_3_lists.py +++ b/seminars/practice_3_lists.py @@ -58,16 +58,11 @@ def count_evens(nums: list) -> int: """ Return the number of even ints in the given array. """ - n = 0 - for element in nums: - if element % 2 == 0: - n += 1 - return n - + # student realization goes here # Function calls with expected result: -count_evens([2, 1, 2, 3, 4]) +# count_evens([2, 1, 2, 3, 4]) → 3 # count_evens([2, 2, 0]) → 3 # count_evens([1, 3, 5]) → 0 @@ -80,16 +75,12 @@ def sum13(nums: list) -> int: so it does not count and numbers that come after a 13 also do not count. """ - summ = 0 - for element in nums: - if element != 13: - summ += element - print(summ) + # student realization goes here # Function calls with expected result: # sum13([1, 2, 2, 1]) → 6 # sum13([1, 1]) → 2 -sum13([1, 2, 2, 1, 13]) +# sum13([1, 2, 2, 1, 13]) → 6 # sum13([1, 2, 2, 1, 13, 5, 6]) → 6 @@ -102,11 +93,11 @@ def sum67(nums: list) -> int: (every 6 will be followed by at least one 7). Return 0 for no numbers. """ - + # student realization goes here # Function calls with expected result: -sum67([1, 2, 2]) -print(sum67([1, 2, 2, 6, 99, 99, 7])) +# sum67([1, 2, 2]) → 5 +# sum67([1, 2, 2, 6, 99, 99, 7]) → 5 # sum67([1, 1, 6, 7, 2]) → 4 @@ -117,13 +108,10 @@ def create_phone_number(nums: list) -> str: Write a function that accepts an array of 10 integers (between 0 and 9), that returns a string of those numbers in the form of a phone number. """ - number = ''.join(str(a) for a in nums) - phone = f'({number[:3]} {number[3:6]}-{number[6:]})' - return phone - + # student realization goes here # Function calls with expected result: -print(create_phone_number([1, 2, 3, 4, 5, 6, 7, 8, 9, 0])) +# create_phone_number([1, 2, 3, 4, 5, 6, 7, 8, 9, 0]) # => returns "(123) 456-7890" @@ -141,22 +129,10 @@ def check_exam(correct_answers: list, student_answers: list) -> int: and +0 for each blank answer, represented as an empty string. If the score < 0, return 0. """ - score = 0 - for i, answer in enumerate(student_answers): - if answer == correct_answers[i]: - score += 4 - elif answer != correct_answers[i]: - score -= 1 - elif answer == ' ': - score += 0 - if score < 0: - return 0 - else: - return score - + # student realization goes here # Function calls with expected result: -print(check_exam(["a", "a", "b", "b"], ["a", "c", "b", "d"])) +# check_exam(["a", "a", "b", "b"], ["a", "c", "b", "d"]) → 6 # check_exam(["a", "a", "c", "b"], ["a", "a", "b", ""]) → 7 # check_exam(["a", "a", "b", "c"], ["a", "a", "b", "c"]) → 16 # check_exam(["b", "c", "b", "a"], ["", "a", "a", "c"]) → 0 @@ -170,24 +146,14 @@ def who_likes_it(names: list) -> str: People can "like" blog posts, pictures or other items. We want to create the text that should be displayed next to such an item. """ - if names == []: - return "no one likes this" - if len(names) == 1: - return f'{names[0]} likes this' - elif len(names) == 2: - return f'{names[0]} and {names[1]} like this' - elif len(names) == 3: - return f'{names[0]}, {names[1]} and {names[2]} like this' - else: - return f'{names[0]}, {names[1]} and {len(names) - 2} others like this' - + # student realization goes here # Function calls with expected result: # [] --> "no one likes this" # ["Peter"] --> "Peter likes this" # ["Jacob", "Alex"] --> "Jacob and Alex like this" # ["Max", "John", "Mark"] --> "Max, John and Mark like this" -print(who_likes_it(["Alex", "Jacob", "Mark", "Max"])) +# ["Alex", "Jacob", "Mark", "Max"] --> "Alex, Jacob and 2 others like this" # Task 7 @@ -222,4 +188,4 @@ def scramble(words: list) -> bool: # Function calls with expected result: # scramble(['rkqodlw', 'world']) ==> True # scramble(['cedewaraaossoqqyt', 'codewars']) ==> True -# scramble(['katas', 'steak']) ==> False +# scramble(['katas', 'steak']) ==> False \ No newline at end of file From 006ec4f89cc7ed3ec768bd623674b312bf8fb938 Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 1 Nov 2023 20:17:43 +0300 Subject: [PATCH 39/81] revert practice --- seminars/practice_3_lists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seminars/practice_3_lists.py b/seminars/practice_3_lists.py index 7301764b1..915cdb41f 100644 --- a/seminars/practice_3_lists.py +++ b/seminars/practice_3_lists.py @@ -188,4 +188,4 @@ def scramble(words: list) -> bool: # Function calls with expected result: # scramble(['rkqodlw', 'world']) ==> True # scramble(['cedewaraaossoqqyt', 'codewars']) ==> True -# scramble(['katas', 'steak']) ==> False \ No newline at end of file +# scramble(['katas', 'steak']) ==> False From f2d0f46f5af0f001cd11951ce9aeebe758e9c2a5 Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 1 Nov 2023 21:17:28 +0300 Subject: [PATCH 40/81] changes for checks --- lab_2_tokenize_by_bpe/main.py | 45 +++++++++++++++-------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 18172301d..72b67cbf7 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -20,10 +20,10 @@ def prepare_word( isinstance(end_of_word, str) or end_of_word is None): return None tokenized_word = [] - if start_of_word is not None: + if start_of_word: tokenized_word.append(start_of_word) - tokenized_word.extend(element for element in raw_word) - if end_of_word is not None: + tokenized_word.extend(raw_word) + if end_of_word: tokenized_word.append(end_of_word) return tuple(tokenized_word) @@ -93,9 +93,8 @@ def merge_tokens( list_word[index] = '' if '' in list_word: list_word.remove('') - merged_frequencies[tuple(list_word)] = count - else: - merged_frequencies[preprocessed_word] = count + preprocessed_word = tuple(list_word) + merged_frequencies[preprocessed_word] = count return merged_frequencies @@ -118,7 +117,8 @@ def train( num_merges = len(pairs_of_tokens) pairs_max_values = ([token_pair for token_pair, frequency in pairs_of_tokens.items() if frequency == max(pairs_of_tokens.values())]) - sorted_pairs = sorted([pair for pair in pairs_max_values], key=lambda pair: (-len(str(pair)), pair)) + sorted_pairs = (sorted(pairs_max_values, + key=lambda pair: (-len(str(pair)), pair))) word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0]) if not word_frequencies: return None @@ -190,22 +190,15 @@ def tokenize_word( or not isinstance(vocabulary, dict) or not isinstance( end_of_word, (str, type(None))) or not isinstance(unknown_token, str)): return None - tokens_identifiers = [] - i = 0 - while i < len(word): - max_length_token = '' - for j in range(len(word), i, -1): - current_token = "".join(word[i:j]) - if current_token in vocabulary and len(current_token) > len(max_length_token): - max_length_token = current_token - if max_length_token: - tokens_identifiers.append(vocabulary[max_length_token]) - i += len(max_length_token) - else: - if unknown_token in vocabulary: - tokens_identifiers.append(vocabulary[unknown_token]) - i += 1 - return tokens_identifiers + word_str = ''.join(word) + sorted_tokens = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x)) + for token in sorted_tokens: + if token in ''.join(word): + word_str = word_str.replace(token, str(vocabulary[token]) + ' ') + for symbol in ''.join(word): + if symbol not in sorted_tokens: + word_str = word_str.replace(symbol, str(vocabulary[unknown_token]) + ' ') + return [int(identifier) for identifier in word_str.split()] def load_vocabulary(vocab_path: str) -> dict[str, int] | None: @@ -303,8 +296,7 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None: all_precision = 1.0 for precision in precisions: all_precision *= precision - geometric_mean = all_precision**(1.0 / max_order) - return geometric_mean + return float(all_precision**(1.0 / max_order)) def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None: @@ -315,7 +307,8 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl :param max_order: max length of n-gram to consider for comparison :return: value of BLEU metric """ - if not isinstance(actual, str) or not isinstance(reference, str) or not isinstance(max_order, int): + if (not isinstance(actual, str) or not isinstance(reference, str) + or not isinstance(max_order, int)): return None all_ngrams_actual = [] all_ngrams_reference = [] From 64399ca341154e55e9bab46d0a38d981c576df97 Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 2 Nov 2023 16:46:45 +0300 Subject: [PATCH 41/81] start --- lab_2_tokenize_by_bpe/start.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 798e957e0..0765fb019 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -1,7 +1,9 @@ """ BPE Tokenizer starter """ +import json from pathlib import Path +from lab_2_tokenize_by_bpe.main import (calculate_bleu, decode, encode) def main() -> None: @@ -11,9 +13,25 @@ def main() -> None: assets_path = Path(__file__).parent / 'assets' with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() + with open(assets_path / 'vocab.json', 'r', encoding='utf-8') as file: + vocabulary = json.load(file) + with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file: + ru_raw = file.read() + with open(assets_path / 'for_translation_ru_encoded.txt', 'r', encoding='utf-8') as file: + ru_encoded = file.read() - result = None - assert result, "Encoding is not working" + encode_pred = encode(ru_raw, vocabulary, '\u2581', None, '') + correct_tokens = [token for token in encode_pred if token in map(int, ru_encoded.split())] + print(f"Файл закодирован правильно на {(len(correct_tokens) / len(encode_pred)*100)}%") + + with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file: + encoded_en = file.read() + with open(assets_path / 'for_translation_en_raw.txt', 'r', encoding='utf-8') as file: + en_raw = file.read() + + decoded_text = decode([int(num) for num in encoded_en.split()], vocabulary, None) + decoded_text = decoded_text.replace('\u2581', ' ') + print(f'BLUE = {calculate_bleu(decoded_text, en_raw)}') if __name__ == "__main__": From 26645768e9c1834f8f1619321fec402d401afa15 Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 2 Nov 2023 17:06:17 +0300 Subject: [PATCH 42/81] start --- lab_2_tokenize_by_bpe/start.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 0765fb019..8da5b2473 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -3,7 +3,8 @@ """ import json from pathlib import Path -from lab_2_tokenize_by_bpe.main import (calculate_bleu, decode, encode) + +from lab_2_tokenize_by_bpe.main import calculate_bleu, decode, encode def main() -> None: @@ -22,7 +23,8 @@ def main() -> None: encode_pred = encode(ru_raw, vocabulary, '\u2581', None, '') correct_tokens = [token for token in encode_pred if token in map(int, ru_encoded.split())] - print(f"Файл закодирован правильно на {(len(correct_tokens) / len(encode_pred)*100)}%") + if correct_tokens: + print(f"Файл закодирован правильно на {(len(list(correct_tokens)) / len(encode_pred)*100)}%") with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file: encoded_en = file.read() From de9a0c69d8c78f97f0ac70385e04bc4a2d731f1f Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 2 Nov 2023 17:14:32 +0300 Subject: [PATCH 43/81] start --- lab_2_tokenize_by_bpe/start.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 8da5b2473..1cf85801d 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -23,8 +23,9 @@ def main() -> None: encode_pred = encode(ru_raw, vocabulary, '\u2581', None, '') correct_tokens = [token for token in encode_pred if token in map(int, ru_encoded.split())] - if correct_tokens: - print(f"Файл закодирован правильно на {(len(list(correct_tokens)) / len(encode_pred)*100)}%") + if correct_tokens and encode_pred: + print((f"Файл закодирован правильно на " + f"{(len(list(correct_tokens)) / len(list(encode_pred))*100)}%")) with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file: encoded_en = file.read() @@ -33,7 +34,9 @@ def main() -> None: decoded_text = decode([int(num) for num in encoded_en.split()], vocabulary, None) decoded_text = decoded_text.replace('\u2581', ' ') - print(f'BLUE = {calculate_bleu(decoded_text, en_raw)}') + result = calculate_bleu(decoded_text, en_raw) + print(f'BLUE = {result}') + assert result, "Encoding is not working" if __name__ == "__main__": From 2b5df06b7bafba7fe577306ba23703dc9cab94db Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 2 Nov 2023 17:33:04 +0300 Subject: [PATCH 44/81] add fixes --- lab_2_tokenize_by_bpe/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 72b67cbf7..99764e3b5 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -89,7 +89,7 @@ def merge_tokens( list_word = list(preprocessed_word) for index in range(len(list_word) - 1): if (list_word[index], list_word[index + 1]) == pair: - list_word[index + 1] = pair[0] + pair[1] + list_word[index + 1] = ''.join(pair) list_word[index] = '' if '' in list_word: list_word.remove('') @@ -143,7 +143,7 @@ def get_vocabulary( for token in tuples: tokens_list.add(token) for element in token: - tokens_list.add(element) + tokens_list.update(element) tokens_list.add(unknown_token) sorted_tokens = sorted(tokens_list, key=lambda x: (-len(x), x)) for index, token in enumerate(sorted_tokens): From fdada4585b0170b0ac050532bd88b66210316fa1 Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Fri, 3 Nov 2023 17:15:53 +0300 Subject: [PATCH 45/81] checkout labs from the origin repository --- lab_2_tokenize_by_bpe/main.py | 308 +++++++++++++-------------------- lab_2_tokenize_by_bpe/start.py | 39 ++--- 2 files changed, 140 insertions(+), 207 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 19a72913f..99764e3b5 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -3,7 +3,6 @@ BPE and machine translation evaluation """ import json -import math def prepare_word( @@ -20,12 +19,13 @@ def prepare_word( start_of_word, str) or start_of_word is None) or not ( isinstance(end_of_word, str) or end_of_word is None): return None - list_of_tokens = list(raw_word) - if end_of_word: - list_of_tokens.append(end_of_word) + tokenized_word = [] if start_of_word: - list_of_tokens.insert(0, start_of_word) - return tuple(list_of_tokens) + tokenized_word.append(start_of_word) + tokenized_word.extend(raw_word) + if end_of_word: + tokenized_word.append(end_of_word) + return tuple(tokenized_word) def collect_frequencies( @@ -41,17 +41,16 @@ def collect_frequencies( if not isinstance(text, str) or not isinstance(end_of_word, str) or not ( isinstance(start_of_word, str) or start_of_word is None): return None - - dict_frequencies = {} - - splitted_text = text.split() - for i in set(splitted_text): - word = prepare_word(i, start_of_word, end_of_word) - if not word: - return None - dict_frequencies[word] = splitted_text.count(i) - - return dict_frequencies + frequencies_dict = {} + for word in text.split(): + if start_of_word is not None: + tokenized_word = prepare_word(word, start_of_word, end_of_word) + if start_of_word is None: + tokenized_word = prepare_word(word, None, end_of_word) + if tokenized_word is None: + return None + frequencies_dict[tokenized_word] = frequencies_dict.get(tokenized_word, 0) + 1 + return frequencies_dict def count_tokens_pairs( @@ -64,17 +63,12 @@ def count_tokens_pairs( """ if not isinstance(word_frequencies, dict): return None - - dict_with_pairs = {} - - for word in word_frequencies: - for index in range(len(word) - 1): - pair = (word[index], word[index + 1]) - if pair not in dict_with_pairs: - dict_with_pairs[pair] = 0 - dict_with_pairs[pair] += word_frequencies[word] - - return dict_with_pairs + pairs_of_tokens = {} + for tokens in word_frequencies: + for index in range(len(tokens) - 1): + pair = (tokens[index], tokens[index + 1]) + pairs_of_tokens[pair] = pairs_of_tokens.get(pair, 0) + word_frequencies[tokens] + return pairs_of_tokens def merge_tokens( @@ -86,24 +80,22 @@ def merge_tokens( :param pair: a pair of tokens to be merged :return: dictionary in the form of """ - if not isinstance(word_frequencies, dict) or not isinstance(pair, tuple): + if not (isinstance(word_frequencies, dict) + and isinstance(pair, tuple)): return None - dict_merged_tokens = {} - for i in word_frequencies: - list_word = list(i) - - for index in range(len(list_word) - 1): - if (i[index], i[index + 1]) == pair: - list_word[index + 1] = pair[0] + pair[1] - list_word[index] = '' - - if '' in list_word: - list_word.remove('') - dict_merged_tokens.update({tuple(list_word): word_frequencies[i]}) - else: - dict_merged_tokens.update({i: word_frequencies[i]}) - - return dict_merged_tokens + merged_frequencies = {} + for preprocessed_word, count in word_frequencies.items(): + if ''.join(pair) in ''.join(preprocessed_word): + list_word = list(preprocessed_word) + for index in range(len(list_word) - 1): + if (list_word[index], list_word[index + 1]) == pair: + list_word[index + 1] = ''.join(pair) + list_word[index] = '' + if '' in list_word: + list_word.remove('') + preprocessed_word = tuple(list_word) + merged_frequencies[preprocessed_word] = count + return merged_frequencies def train( @@ -117,31 +109,20 @@ def train( """ if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int): return None - dict_with_pairs = count_tokens_pairs(word_frequencies) - - if not dict_with_pairs: - return None - merges = min(num_merges, len(dict_with_pairs)) - - for i in range(merges): - - max_values = max(dict_with_pairs.values()) - pairs_max_values = [i for i in dict_with_pairs if dict_with_pairs[i] == max_values] - - max_len = max(len(str(pair)) for pair in pairs_max_values) - pairs_max_len = [i for i in pairs_max_values if len(str(i)) == max_len] - - sorted_pairs = sorted(pairs_max_len) + while num_merges > 0: + pairs_of_tokens = count_tokens_pairs(word_frequencies) + if not pairs_of_tokens: + return None + if num_merges > len(pairs_of_tokens): + num_merges = len(pairs_of_tokens) + pairs_max_values = ([token_pair for token_pair, frequency in pairs_of_tokens.items() if + frequency == max(pairs_of_tokens.values())]) + sorted_pairs = (sorted(pairs_max_values, + key=lambda pair: (-len(str(pair)), pair))) word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0]) - if not word_frequencies: return None - - dict_with_pairs = count_tokens_pairs(word_frequencies) - - if not dict_with_pairs: - return None - + num_merges -= 1 return word_frequencies @@ -156,24 +137,18 @@ def get_vocabulary( """ if not isinstance(word_frequencies, dict) or not isinstance(unknown_token, str): return None - - dict_ident = {} - unique_tokens = set() - - for tuple_tokens in word_frequencies.keys(): - for word in tuple_tokens: - unique_tokens.update(tuple_tokens, word) - - unique_tokens.add(unknown_token) - lex_sorted = sorted(unique_tokens) - len_sorted = sorted(lex_sorted, key=len, reverse=True) - index = 0 - - for token in len_sorted: - dict_ident[token] = index - index += 1 - - return dict_ident + tokens_list = set() + dict_token_identifier = {} + for tuples in word_frequencies: + for token in tuples: + tokens_list.add(token) + for element in token: + tokens_list.update(element) + tokens_list.add(unknown_token) + sorted_tokens = sorted(tokens_list, key=lambda x: (-len(x), x)) + for index, token in enumerate(sorted_tokens): + dict_token_identifier[token] = index + return dict_token_identifier def decode( @@ -189,17 +164,15 @@ def decode( if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict) or not (isinstance( end_of_word_token, str) or end_of_word_token is None): return None - decoded = '' - for identifier in encoded_text: - token_list = [key for key in vocabulary if vocabulary[key] == identifier] - - for token in token_list: - decoded += token - - if end_of_word_token: - decoded = decoded.replace(end_of_word_token, ' ') - - return decoded + decoded_tokens = [] + for index in encoded_text: + for token, token_index in vocabulary.items(): + if token_index == index and end_of_word_token is not None: + decoded_tokens.append(' ' if token == end_of_word_token else token) + if vocabulary[token] == index and end_of_word_token is None: + decoded_tokens.append('' if token == end_of_word_token else token) + decoded_text = ''.join(decoded_tokens) + return decoded_text def tokenize_word( @@ -213,27 +186,19 @@ def tokenize_word( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ - if not isinstance(word, tuple) or not isinstance(vocabulary, dict) or not (isinstance( - end_of_word, str) or end_of_word is None) or not isinstance(unknown_token, str): + if (not isinstance(word, tuple) or not all(isinstance(w, str) for w in word) + or not isinstance(vocabulary, dict) or not isinstance( + end_of_word, (str, type(None))) or not isinstance(unknown_token, str)): return None - - word_copy = ''.join(word) - sorted_vocabulary = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x)) - result = [] - - for key in sorted_vocabulary: - while key in word_copy: - index = word_copy.count(' ', 0, word_copy.find(key)) - result.insert(index, vocabulary[key]) - word_copy = word_copy.replace(key, ' ', 1) - - for unk in word_copy: - if unk != ' ': - index = word_copy.find(unk) - word_copy = word_copy.replace(unk, ' ') - result.insert(index, vocabulary[unknown_token]) - - return result + word_str = ''.join(word) + sorted_tokens = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x)) + for token in sorted_tokens: + if token in ''.join(word): + word_str = word_str.replace(token, str(vocabulary[token]) + ' ') + for symbol in ''.join(word): + if symbol not in sorted_tokens: + word_str = word_str.replace(symbol, str(vocabulary[unknown_token]) + ' ') + return [int(identifier) for identifier in word_str.split()] def load_vocabulary(vocab_path: str) -> dict[str, int] | None: @@ -244,14 +209,11 @@ def load_vocabulary(vocab_path: str) -> dict[str, int] | None: """ if not isinstance(vocab_path, str): return None - with open(vocab_path, 'r', encoding='utf-8') as f: - vocab = json.load(f) - - if not isinstance(vocab, dict): + vocabulary = json.load(f) + if not isinstance(vocabulary, dict): return None - - return vocab + return vocabulary def encode( @@ -270,26 +232,20 @@ def encode( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ - if not isinstance(original_text, str) or not isinstance( - vocabulary, dict) or not (isinstance( - start_of_word_token, str) or start_of_word_token is None) or not (isinstance( - end_of_word_token, str) or end_of_word_token is None) or not isinstance( + if not isinstance(original_text, str) or not isinstance(vocabulary, dict) or not isinstance( unknown_token, str): return None - - encoded = [] - split_text = original_text.split() - - for word in split_text: - prepared = prepare_word(word, start_of_word_token, end_of_word_token) - if not prepared: + list_token_identifiers = [] + text = original_text.split() + for word in text: + prepared_word = prepare_word(word, start_of_word_token, end_of_word_token) + if not prepared_word: return None - result = tokenize_word(prepared, vocabulary, end_of_word_token, unknown_token) - if not result: + tokens_id = tokenize_word(prepared_word, vocabulary, end_of_word_token, unknown_token) + if not tokens_id: return None - encoded.extend(result) - - return encoded + list_token_identifiers.extend(tokens_id) + return list_token_identifiers def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: @@ -301,12 +257,10 @@ def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: """ if not isinstance(text, str) or not isinstance(order, int): return None - - n_grams = [] + sequence_ngrams = [] for index in range(len(text) + 1 - order): - n_grams.append(tuple(text[index: index + order])) - - return n_grams + sequence_ngrams.append(tuple(text[index:order+index])) + return sequence_ngrams def calculate_precision( @@ -320,15 +274,12 @@ def calculate_precision( """ if not isinstance(actual, list) or not isinstance(reference, list): return None - - unique_ngrams = set(reference) - matches = 0 - - for n_gram in unique_ngrams: - if n_gram in actual: - matches += 1 - - return matches / len(unique_ngrams) + if len(actual) == 0: + return 0.0 + unique_reference = set(reference) + identical_tokens = [token for token in unique_reference if token in actual] + precision = len(identical_tokens) / len(unique_reference) + return precision def geo_mean(precisions: list[float], max_order: int) -> float | None: @@ -340,15 +291,12 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None: """ if not isinstance(precisions, list) or not isinstance(max_order, int): return None - - summation = float(0) - - for order in range(max_order): - if precisions[order] < 0: - return 0 - summation += math.log(precisions[order]) - - return math.exp(1 / max_order * summation) + if not precisions or max_order <= 0: + return None + all_precision = 1.0 + for precision in precisions: + all_precision *= precision + return float(all_precision**(1.0 / max_order)) def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None: @@ -359,31 +307,25 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl :param max_order: max length of n-gram to consider for comparison :return: value of BLEU metric """ - if not isinstance(actual, str) or not isinstance( - reference, str) or max_order != 3: + if (not isinstance(actual, str) or not isinstance(reference, str) + or not isinstance(max_order, int)): return None - - actual_ngrams = [] - reference_ngrams = [] - + all_ngrams_actual = [] + all_ngrams_reference = [] for order in range(max_order): - actual_ngram = collect_ngrams(actual, order + 1) - reference_ngram = collect_ngrams(reference, order + 1) - if actual_ngram is None or reference_ngram is None: + ngrams_actual = collect_ngrams(actual, order + 1) + ngrams_reference = collect_ngrams(reference, order + 1) + if not ngrams_actual or not ngrams_reference: return None - actual_ngrams.append(actual_ngram) - reference_ngrams.append(reference_ngram) - + all_ngrams_actual.append(ngrams_actual) + all_ngrams_reference.append(ngrams_reference) precisions = [] - - for i, j in zip(actual_ngrams, reference_ngrams): - precision = calculate_precision(i, j) - if precision is None: + for ngrams_actual, ngrams_reference in zip(all_ngrams_actual, all_ngrams_reference): + presision = calculate_precision(ngrams_actual, ngrams_reference) + if not presision: return None - precisions.append(precision) - - average = geo_mean(precisions, max_order) - if average is None: + precisions.append(presision) + blue_metric = geo_mean(precisions, max_order) + if blue_metric is None: return None - - return average * 100 + return blue_metric * 100 diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index d71b1c9c4..1cf85801d 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -4,8 +4,7 @@ import json from pathlib import Path -from lab_2_tokenize_by_bpe.main import (calculate_bleu, collect_frequencies, decode, encode, - get_vocabulary, train) +from lab_2_tokenize_by_bpe.main import calculate_bleu, decode, encode def main() -> None: @@ -15,37 +14,29 @@ def main() -> None: assets_path = Path(__file__).parent / 'assets' with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - with open(assets_path / 'secrets/secret_2.txt', 'r', encoding='utf-8') as text_file: - encoded_secret = text_file.read() - dict_frequencies = collect_frequencies(text, None, '') - merged_tokens = train(dict_frequencies, 100) - if merged_tokens: - vocabulary = get_vocabulary(merged_tokens, '') - secret = [int(num) for num in encoded_secret.split()] - result = decode(secret, vocabulary, '') - print(result) - assert result, "Encoding is not working" - - with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file: - predicted = file.read() with open(assets_path / 'vocab.json', 'r', encoding='utf-8') as file: vocabulary = json.load(file) + with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file: + ru_raw = file.read() with open(assets_path / 'for_translation_ru_encoded.txt', 'r', encoding='utf-8') as file: - actual = file.read() + ru_encoded = file.read() - if [int(token) for token in actual.split()] == encode( - predicted, vocabulary, '\u2581', None, ''): - print("Encoding is successful!") + encode_pred = encode(ru_raw, vocabulary, '\u2581', None, '') + correct_tokens = [token for token in encode_pred if token in map(int, ru_encoded.split())] + if correct_tokens and encode_pred: + print((f"Файл закодирован правильно на " + f"{(len(list(correct_tokens)) / len(list(encode_pred))*100)}%")) with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file: encoded_en = file.read() with open(assets_path / 'for_translation_en_raw.txt', 'r', encoding='utf-8') as file: - decoded_en = file.read() - - decoded = decode([int(num) for num in encoded_en.split()], vocabulary, None) - decoded = decoded.replace('\u2581', ' ') + en_raw = file.read() - print(calculate_bleu(decoded, decoded_en)) + decoded_text = decode([int(num) for num in encoded_en.split()], vocabulary, None) + decoded_text = decoded_text.replace('\u2581', ' ') + result = calculate_bleu(decoded_text, en_raw) + print(f'BLUE = {result}') + assert result, "Encoding is not working" if __name__ == "__main__": From e680ce38993f92dbff17be0cc6c0e25419d074a9 Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Fri, 3 Nov 2023 17:24:32 +0300 Subject: [PATCH 46/81] checkout labs from the origin repository --- lab_2_tokenize_by_bpe/main.py | 308 ++++++++++++++++++++------------- lab_2_tokenize_by_bpe/start.py | 39 +++-- 2 files changed, 207 insertions(+), 140 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 99764e3b5..19a72913f 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -3,6 +3,7 @@ BPE and machine translation evaluation """ import json +import math def prepare_word( @@ -19,13 +20,12 @@ def prepare_word( start_of_word, str) or start_of_word is None) or not ( isinstance(end_of_word, str) or end_of_word is None): return None - tokenized_word = [] - if start_of_word: - tokenized_word.append(start_of_word) - tokenized_word.extend(raw_word) + list_of_tokens = list(raw_word) if end_of_word: - tokenized_word.append(end_of_word) - return tuple(tokenized_word) + list_of_tokens.append(end_of_word) + if start_of_word: + list_of_tokens.insert(0, start_of_word) + return tuple(list_of_tokens) def collect_frequencies( @@ -41,16 +41,17 @@ def collect_frequencies( if not isinstance(text, str) or not isinstance(end_of_word, str) or not ( isinstance(start_of_word, str) or start_of_word is None): return None - frequencies_dict = {} - for word in text.split(): - if start_of_word is not None: - tokenized_word = prepare_word(word, start_of_word, end_of_word) - if start_of_word is None: - tokenized_word = prepare_word(word, None, end_of_word) - if tokenized_word is None: - return None - frequencies_dict[tokenized_word] = frequencies_dict.get(tokenized_word, 0) + 1 - return frequencies_dict + + dict_frequencies = {} + + splitted_text = text.split() + for i in set(splitted_text): + word = prepare_word(i, start_of_word, end_of_word) + if not word: + return None + dict_frequencies[word] = splitted_text.count(i) + + return dict_frequencies def count_tokens_pairs( @@ -63,12 +64,17 @@ def count_tokens_pairs( """ if not isinstance(word_frequencies, dict): return None - pairs_of_tokens = {} - for tokens in word_frequencies: - for index in range(len(tokens) - 1): - pair = (tokens[index], tokens[index + 1]) - pairs_of_tokens[pair] = pairs_of_tokens.get(pair, 0) + word_frequencies[tokens] - return pairs_of_tokens + + dict_with_pairs = {} + + for word in word_frequencies: + for index in range(len(word) - 1): + pair = (word[index], word[index + 1]) + if pair not in dict_with_pairs: + dict_with_pairs[pair] = 0 + dict_with_pairs[pair] += word_frequencies[word] + + return dict_with_pairs def merge_tokens( @@ -80,22 +86,24 @@ def merge_tokens( :param pair: a pair of tokens to be merged :return: dictionary in the form of """ - if not (isinstance(word_frequencies, dict) - and isinstance(pair, tuple)): + if not isinstance(word_frequencies, dict) or not isinstance(pair, tuple): return None - merged_frequencies = {} - for preprocessed_word, count in word_frequencies.items(): - if ''.join(pair) in ''.join(preprocessed_word): - list_word = list(preprocessed_word) - for index in range(len(list_word) - 1): - if (list_word[index], list_word[index + 1]) == pair: - list_word[index + 1] = ''.join(pair) - list_word[index] = '' - if '' in list_word: - list_word.remove('') - preprocessed_word = tuple(list_word) - merged_frequencies[preprocessed_word] = count - return merged_frequencies + dict_merged_tokens = {} + for i in word_frequencies: + list_word = list(i) + + for index in range(len(list_word) - 1): + if (i[index], i[index + 1]) == pair: + list_word[index + 1] = pair[0] + pair[1] + list_word[index] = '' + + if '' in list_word: + list_word.remove('') + dict_merged_tokens.update({tuple(list_word): word_frequencies[i]}) + else: + dict_merged_tokens.update({i: word_frequencies[i]}) + + return dict_merged_tokens def train( @@ -109,20 +117,31 @@ def train( """ if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int): return None - while num_merges > 0: - pairs_of_tokens = count_tokens_pairs(word_frequencies) - if not pairs_of_tokens: - return None - if num_merges > len(pairs_of_tokens): - num_merges = len(pairs_of_tokens) - pairs_max_values = ([token_pair for token_pair, frequency in pairs_of_tokens.items() if - frequency == max(pairs_of_tokens.values())]) - sorted_pairs = (sorted(pairs_max_values, - key=lambda pair: (-len(str(pair)), pair))) + dict_with_pairs = count_tokens_pairs(word_frequencies) + + if not dict_with_pairs: + return None + merges = min(num_merges, len(dict_with_pairs)) + + for i in range(merges): + + max_values = max(dict_with_pairs.values()) + pairs_max_values = [i for i in dict_with_pairs if dict_with_pairs[i] == max_values] + + max_len = max(len(str(pair)) for pair in pairs_max_values) + pairs_max_len = [i for i in pairs_max_values if len(str(i)) == max_len] + + sorted_pairs = sorted(pairs_max_len) word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0]) + if not word_frequencies: return None - num_merges -= 1 + + dict_with_pairs = count_tokens_pairs(word_frequencies) + + if not dict_with_pairs: + return None + return word_frequencies @@ -137,18 +156,24 @@ def get_vocabulary( """ if not isinstance(word_frequencies, dict) or not isinstance(unknown_token, str): return None - tokens_list = set() - dict_token_identifier = {} - for tuples in word_frequencies: - for token in tuples: - tokens_list.add(token) - for element in token: - tokens_list.update(element) - tokens_list.add(unknown_token) - sorted_tokens = sorted(tokens_list, key=lambda x: (-len(x), x)) - for index, token in enumerate(sorted_tokens): - dict_token_identifier[token] = index - return dict_token_identifier + + dict_ident = {} + unique_tokens = set() + + for tuple_tokens in word_frequencies.keys(): + for word in tuple_tokens: + unique_tokens.update(tuple_tokens, word) + + unique_tokens.add(unknown_token) + lex_sorted = sorted(unique_tokens) + len_sorted = sorted(lex_sorted, key=len, reverse=True) + index = 0 + + for token in len_sorted: + dict_ident[token] = index + index += 1 + + return dict_ident def decode( @@ -164,15 +189,17 @@ def decode( if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict) or not (isinstance( end_of_word_token, str) or end_of_word_token is None): return None - decoded_tokens = [] - for index in encoded_text: - for token, token_index in vocabulary.items(): - if token_index == index and end_of_word_token is not None: - decoded_tokens.append(' ' if token == end_of_word_token else token) - if vocabulary[token] == index and end_of_word_token is None: - decoded_tokens.append('' if token == end_of_word_token else token) - decoded_text = ''.join(decoded_tokens) - return decoded_text + decoded = '' + for identifier in encoded_text: + token_list = [key for key in vocabulary if vocabulary[key] == identifier] + + for token in token_list: + decoded += token + + if end_of_word_token: + decoded = decoded.replace(end_of_word_token, ' ') + + return decoded def tokenize_word( @@ -186,19 +213,27 @@ def tokenize_word( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ - if (not isinstance(word, tuple) or not all(isinstance(w, str) for w in word) - or not isinstance(vocabulary, dict) or not isinstance( - end_of_word, (str, type(None))) or not isinstance(unknown_token, str)): + if not isinstance(word, tuple) or not isinstance(vocabulary, dict) or not (isinstance( + end_of_word, str) or end_of_word is None) or not isinstance(unknown_token, str): return None - word_str = ''.join(word) - sorted_tokens = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x)) - for token in sorted_tokens: - if token in ''.join(word): - word_str = word_str.replace(token, str(vocabulary[token]) + ' ') - for symbol in ''.join(word): - if symbol not in sorted_tokens: - word_str = word_str.replace(symbol, str(vocabulary[unknown_token]) + ' ') - return [int(identifier) for identifier in word_str.split()] + + word_copy = ''.join(word) + sorted_vocabulary = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x)) + result = [] + + for key in sorted_vocabulary: + while key in word_copy: + index = word_copy.count(' ', 0, word_copy.find(key)) + result.insert(index, vocabulary[key]) + word_copy = word_copy.replace(key, ' ', 1) + + for unk in word_copy: + if unk != ' ': + index = word_copy.find(unk) + word_copy = word_copy.replace(unk, ' ') + result.insert(index, vocabulary[unknown_token]) + + return result def load_vocabulary(vocab_path: str) -> dict[str, int] | None: @@ -209,11 +244,14 @@ def load_vocabulary(vocab_path: str) -> dict[str, int] | None: """ if not isinstance(vocab_path, str): return None + with open(vocab_path, 'r', encoding='utf-8') as f: - vocabulary = json.load(f) - if not isinstance(vocabulary, dict): + vocab = json.load(f) + + if not isinstance(vocab, dict): return None - return vocabulary + + return vocab def encode( @@ -232,20 +270,26 @@ def encode( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ - if not isinstance(original_text, str) or not isinstance(vocabulary, dict) or not isinstance( + if not isinstance(original_text, str) or not isinstance( + vocabulary, dict) or not (isinstance( + start_of_word_token, str) or start_of_word_token is None) or not (isinstance( + end_of_word_token, str) or end_of_word_token is None) or not isinstance( unknown_token, str): return None - list_token_identifiers = [] - text = original_text.split() - for word in text: - prepared_word = prepare_word(word, start_of_word_token, end_of_word_token) - if not prepared_word: + + encoded = [] + split_text = original_text.split() + + for word in split_text: + prepared = prepare_word(word, start_of_word_token, end_of_word_token) + if not prepared: return None - tokens_id = tokenize_word(prepared_word, vocabulary, end_of_word_token, unknown_token) - if not tokens_id: + result = tokenize_word(prepared, vocabulary, end_of_word_token, unknown_token) + if not result: return None - list_token_identifiers.extend(tokens_id) - return list_token_identifiers + encoded.extend(result) + + return encoded def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: @@ -257,10 +301,12 @@ def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: """ if not isinstance(text, str) or not isinstance(order, int): return None - sequence_ngrams = [] + + n_grams = [] for index in range(len(text) + 1 - order): - sequence_ngrams.append(tuple(text[index:order+index])) - return sequence_ngrams + n_grams.append(tuple(text[index: index + order])) + + return n_grams def calculate_precision( @@ -274,12 +320,15 @@ def calculate_precision( """ if not isinstance(actual, list) or not isinstance(reference, list): return None - if len(actual) == 0: - return 0.0 - unique_reference = set(reference) - identical_tokens = [token for token in unique_reference if token in actual] - precision = len(identical_tokens) / len(unique_reference) - return precision + + unique_ngrams = set(reference) + matches = 0 + + for n_gram in unique_ngrams: + if n_gram in actual: + matches += 1 + + return matches / len(unique_ngrams) def geo_mean(precisions: list[float], max_order: int) -> float | None: @@ -291,12 +340,15 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None: """ if not isinstance(precisions, list) or not isinstance(max_order, int): return None - if not precisions or max_order <= 0: - return None - all_precision = 1.0 - for precision in precisions: - all_precision *= precision - return float(all_precision**(1.0 / max_order)) + + summation = float(0) + + for order in range(max_order): + if precisions[order] < 0: + return 0 + summation += math.log(precisions[order]) + + return math.exp(1 / max_order * summation) def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None: @@ -307,25 +359,31 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl :param max_order: max length of n-gram to consider for comparison :return: value of BLEU metric """ - if (not isinstance(actual, str) or not isinstance(reference, str) - or not isinstance(max_order, int)): + if not isinstance(actual, str) or not isinstance( + reference, str) or max_order != 3: return None - all_ngrams_actual = [] - all_ngrams_reference = [] + + actual_ngrams = [] + reference_ngrams = [] + for order in range(max_order): - ngrams_actual = collect_ngrams(actual, order + 1) - ngrams_reference = collect_ngrams(reference, order + 1) - if not ngrams_actual or not ngrams_reference: + actual_ngram = collect_ngrams(actual, order + 1) + reference_ngram = collect_ngrams(reference, order + 1) + if actual_ngram is None or reference_ngram is None: return None - all_ngrams_actual.append(ngrams_actual) - all_ngrams_reference.append(ngrams_reference) + actual_ngrams.append(actual_ngram) + reference_ngrams.append(reference_ngram) + precisions = [] - for ngrams_actual, ngrams_reference in zip(all_ngrams_actual, all_ngrams_reference): - presision = calculate_precision(ngrams_actual, ngrams_reference) - if not presision: + + for i, j in zip(actual_ngrams, reference_ngrams): + precision = calculate_precision(i, j) + if precision is None: return None - precisions.append(presision) - blue_metric = geo_mean(precisions, max_order) - if blue_metric is None: + precisions.append(precision) + + average = geo_mean(precisions, max_order) + if average is None: return None - return blue_metric * 100 + + return average * 100 diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 1cf85801d..d71b1c9c4 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -4,7 +4,8 @@ import json from pathlib import Path -from lab_2_tokenize_by_bpe.main import calculate_bleu, decode, encode +from lab_2_tokenize_by_bpe.main import (calculate_bleu, collect_frequencies, decode, encode, + get_vocabulary, train) def main() -> None: @@ -14,29 +15,37 @@ def main() -> None: assets_path = Path(__file__).parent / 'assets' with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() + with open(assets_path / 'secrets/secret_2.txt', 'r', encoding='utf-8') as text_file: + encoded_secret = text_file.read() + dict_frequencies = collect_frequencies(text, None, '') + merged_tokens = train(dict_frequencies, 100) + if merged_tokens: + vocabulary = get_vocabulary(merged_tokens, '') + secret = [int(num) for num in encoded_secret.split()] + result = decode(secret, vocabulary, '') + print(result) + assert result, "Encoding is not working" + + with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file: + predicted = file.read() with open(assets_path / 'vocab.json', 'r', encoding='utf-8') as file: vocabulary = json.load(file) - with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file: - ru_raw = file.read() with open(assets_path / 'for_translation_ru_encoded.txt', 'r', encoding='utf-8') as file: - ru_encoded = file.read() + actual = file.read() - encode_pred = encode(ru_raw, vocabulary, '\u2581', None, '') - correct_tokens = [token for token in encode_pred if token in map(int, ru_encoded.split())] - if correct_tokens and encode_pred: - print((f"Файл закодирован правильно на " - f"{(len(list(correct_tokens)) / len(list(encode_pred))*100)}%")) + if [int(token) for token in actual.split()] == encode( + predicted, vocabulary, '\u2581', None, ''): + print("Encoding is successful!") with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file: encoded_en = file.read() with open(assets_path / 'for_translation_en_raw.txt', 'r', encoding='utf-8') as file: - en_raw = file.read() + decoded_en = file.read() + + decoded = decode([int(num) for num in encoded_en.split()], vocabulary, None) + decoded = decoded.replace('\u2581', ' ') - decoded_text = decode([int(num) for num in encoded_en.split()], vocabulary, None) - decoded_text = decoded_text.replace('\u2581', ' ') - result = calculate_bleu(decoded_text, en_raw) - print(f'BLUE = {result}') - assert result, "Encoding is not working" + print(calculate_bleu(decoded, decoded_en)) if __name__ == "__main__": From 5217fa8a5e35f26458911f6185fc8ea581eee0ed Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 7 Nov 2023 13:36:11 +0300 Subject: [PATCH 47/81] changes for 4 --- lab_3_generate_by_ngrams/main.py | 86 ++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index dcf4e8af9..591f7b815 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -23,6 +23,8 @@ def __init__(self, end_of_word_token: str) -> None: Args: end_of_word_token (str): A token denoting word boundary """ + self._end_of_word_token = end_of_word_token + self._storage = {'_': 0} def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: """ @@ -41,6 +43,19 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ + if not isinstance(text, str) or text == '': + return None + + list_text = text.split() + str_text = self._end_of_word_token.join(list_text) + new_str = '' + for token in str_text: + if token.isalpha() or token == '_': + new_str += token + new_str += '_' + if '__' in new_str: + copy_str = new_str.replace('__', '_') + return tuple([token.lower() for token in copy_str]) def get_id(self, element: str) -> Optional[int]: """ @@ -55,6 +70,10 @@ def get_id(self, element: str) -> Optional[int]: In case of corrupt input arguments or arguments not included in storage, None is returned """ + if not isinstance(element, str) or element not in self._storage: + return None + + return self._storage[element] def get_end_of_word_token(self) -> str: """ @@ -63,6 +82,7 @@ def get_end_of_word_token(self) -> str: Returns: str: EoW token """ + return self._end_of_word_token def get_token(self, element_id: int) -> Optional[str]: """ @@ -76,6 +96,12 @@ def get_token(self, element_id: int) -> Optional[str]: In case of corrupt input arguments or arguments not included in storage, None is returned """ + if not isinstance(element_id, int) or element_id not in self._storage.values(): + return None + + for key in self._storage: + if self._storage[key] == element_id: + return key def encode(self, text: str) -> Optional[tuple[int, ...]]: """ @@ -93,6 +119,21 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ + if not isinstance(text, str) or len(text) == 0: + return None + + encoded_text = [] + tokenized_text = self._tokenize(text) + if not tokenized_text: + return None + + for token in tokenized_text: + self._put(token) + element_id = self.get_id(token) + if not isinstance(element_id, int): + return None + encoded_text.append(element_id) + return tuple(encoded_text) def _put(self, element: str) -> None: """ @@ -104,6 +145,11 @@ def _put(self, element: str) -> None: In case of corrupt input arguments or invalid argument length, an element is not added to storage """ + if not isinstance(element, str) or len(element) != 1: + return None + if element not in self._storage: + self._storage[element] = len(self._storage) + def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: """ @@ -121,6 +167,16 @@ def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ + if not isinstance(encoded_corpus, tuple) or not encoded_corpus: + return None + + decoded_tokens = self._decode(encoded_corpus) + if not decoded_tokens: + return None + decoded_text = self._postprocess_decoded_text(decoded_tokens) + if not decoded_text: + return None + return decoded_text def fill_from_ngrams(self, content: dict) -> None: """ @@ -129,6 +185,12 @@ def fill_from_ngrams(self, content: dict) -> None: Args: content (dict): ngrams from external JSON """ + if not isinstance(content, dict) or not content: + return None + for key in content['freq']: + for element in key: + if element.isalpha(): + self._put(element) def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: """ @@ -143,6 +205,18 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ + if not isinstance(corpus, tuple) or not corpus: + return None + + list_corpus = [] + for element_id in corpus: + if not isinstance(element_id, int): + return None + token = self.get_token(element_id) + if not token: + return None + list_corpus.append(token) + return tuple(list_corpus) def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional[str]: """ @@ -159,6 +233,18 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional In case of corrupt input arguments, None is returned """ + if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0: + return None + + decoded_text = decoded_corpus[0].upper() + for token in decoded_corpus[1:-1]: + if token == self._end_of_word_token: + decoded_text += ' ' + else: + decoded_text += token + if decoded_corpus[-1] != self._end_of_word_token: + decoded_text += decoded_corpus[-1] + return decoded_text + '.' class NGramLanguageModel: From 8426d68918b3edec6b26ca38cd2e7ef101e3bfd1 Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 7 Nov 2023 13:37:10 +0300 Subject: [PATCH 48/81] start --- lab_3_generate_by_ngrams/start.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index b9bcbd999..c04e32b39 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -1,6 +1,7 @@ """ Generation by NGrams starter """ +from lab_3_generate_by_ngrams.main import TextProcessor def main() -> None: @@ -11,8 +12,11 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - result = None - assert result + corpus = TextProcessor('_') + encoded_text = corpus.encode(text) + decoded_text = corpus.decode(encoded_text) + print(encoded_text) + print(decoded_text) if __name__ == "__main__": From d396d1ec27dee0cd919413ed0e1c386c3c0b9f61 Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 7 Nov 2023 13:40:20 +0300 Subject: [PATCH 49/81] score --- lab_3_generate_by_ngrams/target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt index 573541ac9..b8626c4cf 100644 --- a/lab_3_generate_by_ngrams/target_score.txt +++ b/lab_3_generate_by_ngrams/target_score.txt @@ -1 +1 @@ -0 +4 From f6ed07b1b71a6314816cdff367f45387bb4036b1 Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 7 Nov 2023 13:45:25 +0300 Subject: [PATCH 50/81] start --- lab_3_generate_by_ngrams/start.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index c04e32b39..7eee1289d 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -14,9 +14,10 @@ def main() -> None: text = text_file.read() corpus = TextProcessor('_') encoded_text = corpus.encode(text) - decoded_text = corpus.decode(encoded_text) + result = corpus.decode(encoded_text) print(encoded_text) - print(decoded_text) + print(result) + assert result if __name__ == "__main__": From ed8da3ef22b9ffa4d834a170eea2490ac4a8b703 Mon Sep 17 00:00:00 2001 From: mmarina Date: Fri, 10 Nov 2023 09:29:19 +0300 Subject: [PATCH 51/81] added fixes --- lab_3_generate_by_ngrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 591f7b815..1148c1de4 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -150,7 +150,6 @@ def _put(self, element: str) -> None: if element not in self._storage: self._storage[element] = len(self._storage) - def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: """ Decode and postprocess encoded corpus by converting integer identifiers to string. @@ -266,6 +265,7 @@ def __init__(self, encoded_corpus: tuple | None, n_gram_size: int) -> None: n_gram_size (int): A size of n-grams to use for language modelling """ + def get_n_gram_size(self) -> int: """ Retrieve value stored in self._n_gram_size attribute. From baab5b64a81a975ccc6f49f69696f2f2dacf1f3b Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 15 Nov 2023 13:24:00 +0300 Subject: [PATCH 52/81] added fixes --- lab_3_generate_by_ngrams/main.py | 68 +++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 1148c1de4..5ba7f8431 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -24,7 +24,7 @@ def __init__(self, end_of_word_token: str) -> None: end_of_word_token (str): A token denoting word boundary """ self._end_of_word_token = end_of_word_token - self._storage = {'_': 0} + self._storage = {end_of_word_token: 0} def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: """ @@ -43,7 +43,7 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ - if not isinstance(text, str) or text == '': + if not isinstance(text, str) or not text: return None list_text = text.split() @@ -264,6 +264,9 @@ def __init__(self, encoded_corpus: tuple | None, n_gram_size: int) -> None: encoded_corpus (tuple): Encoded text n_gram_size (int): A size of n-grams to use for language modelling """ + self._n_gram_size = n_gram_size + self._n_gram_frequencies = {} + self._encoded_corpus = encoded_corpus def get_n_gram_size(self) -> int: @@ -273,6 +276,7 @@ def get_n_gram_size(self) -> int: Returns: int: Size of stored n_grams """ + return self._n_gram_size def set_n_grams(self, frequencies: dict) -> None: """ @@ -281,6 +285,10 @@ def set_n_grams(self, frequencies: dict) -> None: Args: frequencies (dict): Computed in advance frequencies for n-grams """ + if not isinstance(frequencies, dict) or len(frequencies) == 0: + return None + self._n_gram_frequencies = frequencies + return None def build(self) -> int: """ @@ -294,6 +302,21 @@ def build(self) -> int: In case of corrupt input arguments or methods used return None, 1 is returned """ + if not isinstance(self._encoded_corpus, tuple) or len(self._encoded_corpus) == 0: + return 1 + + n_grams = self._extract_n_grams(self._encoded_corpus) + if not n_grams: + return 1 + + for n_gram in set(n_grams): + if not isinstance(n_gram, tuple): + return 1 + absolute_frequency = n_grams.count(n_gram) + with_same_beginning = len([id for id in n_grams if + id[:-1] == n_gram[:-1]]) + self._n_gram_frequencies[n_gram] = absolute_frequency / with_same_beginning + return 0 def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: """ @@ -307,6 +330,15 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: In case of corrupt input arguments, None is returned """ + if (not isinstance(sequence, tuple) or len(sequence) == 0 + or len(sequence) >= self._n_gram_size - 1): + return None + + tokens = {} + for ngram, freq in self._n_gram_frequencies.items(): + if sequence[-self._n_gram_size + 1:] == ngram[:self._n_gram_size - 1]: + tokens[self._n_gram_size - 1] = freq + return tokens def _extract_n_grams( self, encoded_corpus: tuple[int, ...] @@ -322,6 +354,15 @@ def _extract_n_grams( In case of corrupt input arguments, None is returned """ + if not isinstance(encoded_corpus, tuple) or len(encoded_corpus) == 0: + return None + + n_grams = [] + list_encoded_corpus = list(encoded_corpus) + for index in range(len(encoded_corpus) + 1 - self._n_gram_size): + n_grams.append(tuple(list_encoded_corpus[index: index + self._n_gram_size])) + + return tuple(n_grams) class GreedyTextGenerator: @@ -341,6 +382,8 @@ def __init__(self, language_model: NGramLanguageModel, text_processor: TextProce language_model (NGramLanguageModel): A language model to use for text generation text_processor (TextProcessor): A TextProcessor instance to handle text processing """ + self._model = language_model + self._text_processor = text_processor def run(self, seq_len: int, prompt: str) -> Optional[str]: """ @@ -356,6 +399,27 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ + if not isinstance(seq_len, int) or not isinstance(prompt, str) or len(prompt) == 0: + return None + + n_gram_size = self._model.get_n_gram_size() + encoded = self._text_processor.encode(prompt) + if not encoded or not n_gram_size: + return None + + while seq_len > 0: + candidates = self._model.generate_next_token(encoded[-n_gram_size + 1:]) + if not candidates: + break + best_candidate = [letter for letter, freq in candidates.items() if freq == max(candidates.values())] + max_freq_letters = sorted(best_candidate) + encoded += (max_freq_letters[0]) + seq_len -= 1 + decoded_prompt = self._text_processor.decode(encoded) + if decoded_prompt is None: + return None + return decoded_prompt + class BeamSearcher: From b5a6ad703af8980de55f3e7f739150798e9e7da1 Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 15 Nov 2023 13:26:19 +0300 Subject: [PATCH 53/81] score --- lab_3_generate_by_ngrams/target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt index b8626c4cf..1e8b31496 100644 --- a/lab_3_generate_by_ngrams/target_score.txt +++ b/lab_3_generate_by_ngrams/target_score.txt @@ -1 +1 @@ -4 +6 From 449f2ace1283a1b0eb3c463b20fb5cf8efbe810d Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 15 Nov 2023 16:44:22 +0300 Subject: [PATCH 54/81] added fixes --- lab_3_generate_by_ngrams/main.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 5ba7f8431..99ba842da 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -43,19 +43,21 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ - if not isinstance(text, str) or not text: + if not isinstance(text, str): return None - list_text = text.split() - str_text = self._end_of_word_token.join(list_text) - new_str = '' - for token in str_text: - if token.isalpha() or token == '_': - new_str += token - new_str += '_' - if '__' in new_str: - copy_str = new_str.replace('__', '_') - return tuple([token.lower() for token in copy_str]) + tokens = [] + list_text = text.lower().split() + for element in list_text: + word = [token for token in element if token.isalpha()] + if word: + tokens.extend(word) + tokens.append(self._end_of_word_token) + if not tokens: + return None + if text[-1].isalnum(): + tokens.pop() + return tuple(tokens) def get_id(self, element: str) -> Optional[int]: """ From 99153b2759e6bdc9f21ecfa0ba1dbf85c99c556e Mon Sep 17 00:00:00 2001 From: mmarina Date: Fri, 17 Nov 2023 09:28:25 +0300 Subject: [PATCH 55/81] added fixes --- lab_3_generate_by_ngrams/main.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 99ba842da..b43df6f59 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -290,7 +290,6 @@ def set_n_grams(self, frequencies: dict) -> None: if not isinstance(frequencies, dict) or len(frequencies) == 0: return None self._n_gram_frequencies = frequencies - return None def build(self) -> int: """ @@ -337,8 +336,8 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: return None tokens = {} - for ngram, freq in self._n_gram_frequencies.items(): - if sequence[-self._n_gram_size + 1:] == ngram[:self._n_gram_size - 1]: + for n_gram, freq in self._n_gram_frequencies.items(): + if sequence[-self._n_gram_size + 1:] == n_gram[:self._n_gram_size - 1]: tokens[self._n_gram_size - 1] = freq return tokens From 30c7852af0ae85180146a398709523e0711d64c0 Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 21 Nov 2023 09:26:03 +0300 Subject: [PATCH 56/81] added fixes --- lab_3_generate_by_ngrams/main.py | 55 ++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index b43df6f59..1585dc499 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -4,6 +4,7 @@ Beam-search and natural language generation evaluation """ # pylint:disable=too-few-public-methods +import math from typing import Optional @@ -101,8 +102,8 @@ def get_token(self, element_id: int) -> Optional[str]: if not isinstance(element_id, int) or element_id not in self._storage.values(): return None - for key in self._storage: - if self._storage[key] == element_id: + for key, value in self._storage.items(): + if value == element_id: return key def encode(self, text: str) -> Optional[tuple[int, ...]]: @@ -151,6 +152,7 @@ def _put(self, element: str) -> None: return None if element not in self._storage: self._storage[element] = len(self._storage) + return None def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: """ @@ -331,14 +333,15 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: In case of corrupt input arguments, None is returned """ - if (not isinstance(sequence, tuple) or len(sequence) == 0 - or len(sequence) >= self._n_gram_size - 1): + if (not isinstance(sequence, tuple) or not sequence + or len(sequence) < self._n_gram_size - 1): return None + context = sequence[-self._n_gram_size + 1:] tokens = {} for n_gram, freq in self._n_gram_frequencies.items(): - if sequence[-self._n_gram_size + 1:] == n_gram[:self._n_gram_size - 1]: - tokens[self._n_gram_size - 1] = freq + if n_gram[:len(context)] == context: + tokens[n_gram[len(context)]] = freq return tokens def _extract_n_grams( @@ -440,6 +443,8 @@ def __init__(self, beam_width: int, language_model: NGramLanguageModel) -> None: beam_width (int): Number of candidates to consider at each step language_model (NGramLanguageModel): A language model to use for next token prediction """ + self._beam_width = beam_width + self._model = language_model def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, float]]]: """ @@ -460,6 +465,17 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, In case of corrupt input arguments or methods used return None. """ + if not isinstance(sequence, tuple) or len(sequence) == 0: + return None + + tokens_dict = self._model.generate_next_token(sequence) + if tokens_dict is None: + return None + if not tokens_dict: + return [] + + return sorted([(token, float(probability)) for token, probability in tokens_dict.items()], + key=lambda x: x[1], reverse=True)[:self._beam_width] def continue_sequence( self, @@ -482,6 +498,28 @@ def continue_sequence( In case of corrupt input arguments or unexpected behaviour of methods used return None. """ + if (not isinstance(sequence, tuple) or not isinstance(next_tokens, list) or + not isinstance(sequence_candidates, dict) or not sequence): + return None + if (not next_tokens or not sequence_candidates or + sequence not in sequence_candidates or + len(next_tokens) > self._beam_width): + return None + + new_sequence_candidates = {} + for key, value in sequence_candidates.items(): + if key != sequence: + probability = value + for token, token_probability in next_tokens: + new_sequence = key + (token,) + new_probability = probability + (-1) * math.log(token_probability) + new_sequence_candidates[new_sequence] = new_probability + + if len(new_sequence_candidates) > self._beam_width: + new_sequence_candidates = dict( + sorted(new_sequence_candidates.items(), key=lambda x: x[1], reverse=True)[:self._beam_width]) + + return new_sequence_candidates def prune_sequence_candidates( self, sequence_candidates: dict[tuple[int, ...], float] @@ -497,6 +535,11 @@ def prune_sequence_candidates( In case of corrupt input arguments return None. """ + if not isinstance(sequence_candidates, dict) or len(sequence_candidates) == 0: + return None + + sorted_candidates = dict(sorted(sequence_candidates.items(), key=lambda x: (x[1], x[0]), reverse=True)) + return dict(list(sorted_candidates.items())[:self._beam_width]) class BeamSearchTextGenerator: From 60617709ec3c1f6b155f0ac2c5a3304f54d96de3 Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 22 Nov 2023 22:17:12 +0300 Subject: [PATCH 57/81] changes for 8 --- lab_3_generate_by_ngrams/main.py | 53 +++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 1585dc499..e9a8655cf 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -412,17 +412,14 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: return None while seq_len > 0: - candidates = self._model.generate_next_token(encoded[-n_gram_size + 1:]) + candidates = self._model.generate_next_token(encoded) if not candidates: break best_candidate = [letter for letter, freq in candidates.items() if freq == max(candidates.values())] max_freq_letters = sorted(best_candidate) - encoded += (max_freq_letters[0]) + encoded += (max_freq_letters[0],) seq_len -= 1 - decoded_prompt = self._text_processor.decode(encoded) - if decoded_prompt is None: - return None - return decoded_prompt + return self._text_processor.decode(encoded) @@ -475,7 +472,7 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, return [] return sorted([(token, float(probability)) for token, probability in tokens_dict.items()], - key=lambda x: x[1], reverse=True)[:self._beam_width] + key=lambda x: x[1], reverse=True)[:self._beam_width] def continue_sequence( self, @@ -567,6 +564,10 @@ def __init__( text_processor (TextProcessor): A TextProcessor instance to handle text processing beam_width (int): Beam width parameter for generation """ + self._language_model = language_model + self._text_processor = text_processor + self._beam_width = beam_width + self.beam_searcher = BeamSearcher(beam_width, language_model) def run(self, prompt: str, seq_len: int) -> Optional[str]: """ @@ -582,6 +583,37 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ + if (not isinstance(prompt, str) or not isinstance(seq_len, int) + or not prompt or not seq_len): + return None + + encoded_prompt = self._text_processor.encode(prompt) + if encoded_prompt is None: + return None + + candidates = {encoded_prompt: 0.0} + for i in range(seq_len): + new_sequence_candidates = dict(candidates) + for sequence in candidates: + next_tokens = self._get_next_token(sequence) + if not next_tokens: + return None + + continued_candidates = (self.beam_searcher.continue_sequence( + sequence, next_tokens, new_sequence_candidates)) + if not continued_candidates: + break + + best_sequence_candidates = self.beam_searcher.prune_sequence_candidates( + new_sequence_candidates) + + if not best_sequence_candidates: + return None + sequence_candidates = best_sequence_candidates + + decoded = self._text_processor.decode(min(candidates, + key=lambda x: sequence_candidates[x])) + return decoded def _get_next_token( self, sequence_to_continue: tuple[int, ...] @@ -598,6 +630,13 @@ def _get_next_token( In case of corrupt input arguments return None. """ + if not isinstance(sequence_to_continue, tuple) or len(sequence_to_continue) == 0: + return None + + next_tokens = self.beam_searcher.get_next_token(sequence_to_continue) + if next_tokens is None: + return None + return next_tokens class NGramLanguageModelReader: From 45bcbe2c96f1499f6c1116743f098601077c7436 Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 22 Nov 2023 22:19:03 +0300 Subject: [PATCH 58/81] mark 8 --- lab_3_generate_by_ngrams/target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt index 1e8b31496..45a4fb75d 100644 --- a/lab_3_generate_by_ngrams/target_score.txt +++ b/lab_3_generate_by_ngrams/target_score.txt @@ -1 +1 @@ -6 +8 From 36860717fdbf24a95d09a27f053a750bf28b50df Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 22 Nov 2023 23:05:21 +0300 Subject: [PATCH 59/81] some changes --- lab_3_generate_by_ngrams/main.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index e9a8655cf..a27f35553 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -105,6 +105,7 @@ def get_token(self, element_id: int) -> Optional[str]: for key, value in self._storage.items(): if value == element_id: return key + return None def encode(self, text: str) -> Optional[tuple[int, ...]]: """ @@ -194,6 +195,7 @@ def fill_from_ngrams(self, content: dict) -> None: for element in key: if element.isalpha(): self._put(element) + return None def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: """ @@ -272,7 +274,6 @@ def __init__(self, encoded_corpus: tuple | None, n_gram_size: int) -> None: self._n_gram_frequencies = {} self._encoded_corpus = encoded_corpus - def get_n_gram_size(self) -> int: """ Retrieve value stored in self._n_gram_size attribute. @@ -292,6 +293,7 @@ def set_n_grams(self, frequencies: dict) -> None: if not isinstance(frequencies, dict) or len(frequencies) == 0: return None self._n_gram_frequencies = frequencies + return None def build(self) -> int: """ @@ -415,14 +417,14 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: candidates = self._model.generate_next_token(encoded) if not candidates: break - best_candidate = [letter for letter, freq in candidates.items() if freq == max(candidates.values())] + best_candidate = ([letter for letter, freq in candidates.items() + if freq == max(candidates.values())]) max_freq_letters = sorted(best_candidate) encoded += (max_freq_letters[0],) seq_len -= 1 return self._text_processor.decode(encoded) - class BeamSearcher: """ Beam Search algorithm for diverse text generation. @@ -513,9 +515,8 @@ def continue_sequence( new_sequence_candidates[new_sequence] = new_probability if len(new_sequence_candidates) > self._beam_width: - new_sequence_candidates = dict( - sorted(new_sequence_candidates.items(), key=lambda x: x[1], reverse=True)[:self._beam_width]) - + new_sequence_candidates = dict(sorted(new_sequence_candidates.items(), + key=lambda x: x[1], reverse=True)[:self._beam_width]) return new_sequence_candidates def prune_sequence_candidates( @@ -535,7 +536,8 @@ def prune_sequence_candidates( if not isinstance(sequence_candidates, dict) or len(sequence_candidates) == 0: return None - sorted_candidates = dict(sorted(sequence_candidates.items(), key=lambda x: (x[1], x[0]), reverse=True)) + sorted_candidates = dict(sorted(sequence_candidates.items(), + key=lambda x: (x[1], x[0]), reverse=True)) return dict(list(sorted_candidates.items())[:self._beam_width]) From a9a62ee156343361d29b777274ede34f392c931d Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 22 Nov 2023 23:15:08 +0300 Subject: [PATCH 60/81] added fixes --- lab_3_generate_by_ngrams/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index a27f35553..0536c6c89 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -641,6 +641,7 @@ def _get_next_token( return next_tokens + class NGramLanguageModelReader: """ Factory for loading language models ngrams from external JSON. From 03d9c2f86ab5e39650ccd3f42677db9a253fa79f Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 22 Nov 2023 23:26:35 +0300 Subject: [PATCH 61/81] added fixes --- lab_3_generate_by_ngrams/main.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 0536c6c89..767f77fd1 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -102,10 +102,8 @@ def get_token(self, element_id: int) -> Optional[str]: if not isinstance(element_id, int) or element_id not in self._storage.values(): return None - for key, value in self._storage.items(): - if value == element_id: - return key - return None + token = list(filter(lambda x: x[1] == element_id, self._storage.items())) + return token[0][0] def encode(self, text: str) -> Optional[tuple[int, ...]]: """ @@ -177,10 +175,11 @@ def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: decoded_tokens = self._decode(encoded_corpus) if not decoded_tokens: return None - decoded_text = self._postprocess_decoded_text(decoded_tokens) - if not decoded_text: + + if not self._postprocess_decoded_text(decoded_tokens): return None - return decoded_text + + return self._postprocess_decoded_text(decoded_tokens) def fill_from_ngrams(self, content: dict) -> None: """ @@ -515,8 +514,8 @@ def continue_sequence( new_sequence_candidates[new_sequence] = new_probability if len(new_sequence_candidates) > self._beam_width: - new_sequence_candidates = dict(sorted(new_sequence_candidates.items(), - key=lambda x: x[1], reverse=True)[:self._beam_width]) + new_sequence_candidates = dict(sorted( + new_sequence_candidates.items(), key=lambda x: x[1], reverse=True)[:self._beam_width]) return new_sequence_candidates def prune_sequence_candidates( From fd83d60fc7980338e72deacc1eabe3e9a99efffb Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 23 Nov 2023 19:29:01 +0300 Subject: [PATCH 62/81] added fixes --- lab_3_generate_by_ngrams/main.py | 56 +++++++++++++------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 767f77fd1..155e7d552 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -366,7 +366,6 @@ def _extract_n_grams( list_encoded_corpus = list(encoded_corpus) for index in range(len(encoded_corpus) + 1 - self._n_gram_size): n_grams.append(tuple(list_encoded_corpus[index: index + self._n_gram_size])) - return tuple(n_grams) @@ -496,27 +495,18 @@ def continue_sequence( In case of corrupt input arguments or unexpected behaviour of methods used return None. """ - if (not isinstance(sequence, tuple) or not isinstance(next_tokens, list) or - not isinstance(sequence_candidates, dict) or not sequence): - return None - if (not next_tokens or not sequence_candidates or - sequence not in sequence_candidates or - len(next_tokens) > self._beam_width): + if not (isinstance(sequence, tuple) and isinstance(next_tokens, list) + and isinstance(sequence_candidates, dict) and sequence + and next_tokens and sequence_candidates and len(next_tokens) <= self._beam_width + and sequence in sequence_candidates): return None - new_sequence_candidates = {} - for key, value in sequence_candidates.items(): - if key != sequence: - probability = value - for token, token_probability in next_tokens: - new_sequence = key + (token,) - new_probability = probability + (-1) * math.log(token_probability) - new_sequence_candidates[new_sequence] = new_probability - - if len(new_sequence_candidates) > self._beam_width: - new_sequence_candidates = dict(sorted( - new_sequence_candidates.items(), key=lambda x: x[1], reverse=True)[:self._beam_width]) - return new_sequence_candidates + for token_tuple in next_tokens: + new_sequence = sequence + (token_tuple[0],) + new_freq = sequence_candidates[sequence] - math.log(token_tuple[1]) + sequence_candidates[new_sequence] = new_freq + sequence_candidates.pop(sequence) + return sequence_candidates def prune_sequence_candidates( self, sequence_candidates: dict[tuple[int, ...], float] @@ -532,12 +522,11 @@ def prune_sequence_candidates( In case of corrupt input arguments return None. """ - if not isinstance(sequence_candidates, dict) or len(sequence_candidates) == 0: + if not isinstance(sequence_candidates, dict) or not sequence_candidates: return None - sorted_candidates = dict(sorted(sequence_candidates.items(), - key=lambda x: (x[1], x[0]), reverse=True)) - return dict(list(sorted_candidates.items())[:self._beam_width]) + sorted_candidates = sorted(sequence_candidates.items(), key=lambda x: (x[1], x[0])) + return dict(sorted_candidates[:self._beam_width]) class BeamSearchTextGenerator: @@ -601,20 +590,19 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: return None continued_candidates = (self.beam_searcher.continue_sequence( - sequence, next_tokens, new_sequence_candidates)) + sequence, next_tokens, new_sequence_candidates)) if not continued_candidates: break - best_sequence_candidates = self.beam_searcher.prune_sequence_candidates( - new_sequence_candidates) + best_sequence = self.beam_searcher.prune_sequence_candidates( + new_sequence_candidates) - if not best_sequence_candidates: - return None - sequence_candidates = best_sequence_candidates - - decoded = self._text_processor.decode(min(candidates, - key=lambda x: sequence_candidates[x])) - return decoded + if best_sequence is None: + return None + candidates = best_sequence + best_candidate = sorted([candidate for candidate, probability in candidates.items() if + probability == min(candidates.values())])[0] + return self._text_processor.decode(best_candidate) def _get_next_token( self, sequence_to_continue: tuple[int, ...] From e85ad1d0bd15b7b1d2172904b554e3cdde48b2e4 Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 23 Nov 2023 20:17:31 +0300 Subject: [PATCH 63/81] added fixes --- lab_3_generate_by_ngrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 155e7d552..39c986fe8 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -169,7 +169,7 @@ def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ - if not isinstance(encoded_corpus, tuple) or not encoded_corpus: + if not isinstance(encoded_corpus, tuple): return None decoded_tokens = self._decode(encoded_corpus) From 8592c3bb0cc6c0dbdb3aceef7916edbfe1bed818 Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 23 Nov 2023 20:29:22 +0300 Subject: [PATCH 64/81] start --- lab_3_generate_by_ngrams/start.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index 7eee1289d..187c185cb 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -1,7 +1,8 @@ """ Generation by NGrams starter """ -from lab_3_generate_by_ngrams.main import TextProcessor +from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, + NGramLanguageModel, TextProcessor) def main() -> None: @@ -12,12 +13,25 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - corpus = TextProcessor('_') - encoded_text = corpus.encode(text) - result = corpus.decode(encoded_text) - print(encoded_text) - print(result) - assert result + processor = TextProcessor('_') + encoded = processor.encode(text) + + if encoded: + result = processor.decode(encoded) + + print(result) + + model_for_build = NGramLanguageModel(encoded[:10], 2) + print(model_for_build.build()) + + model = NGramLanguageModel(encoded, 7) + greedy_text_generator = GreedyTextGenerator(model, processor) + print(greedy_text_generator.run(51, 'Vernon')) + + beam_search_generator = BeamSearchTextGenerator(model, processor, 7) + print(beam_search_generator.run('Vernon', 56)) + + assert result if __name__ == "__main__": From 9067d6d79170f8d484e920809040922fed55aa1d Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 23 Nov 2023 22:44:45 +0300 Subject: [PATCH 65/81] some changes --- lab_3_generate_by_ngrams/main.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 39c986fe8..419764d27 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -50,7 +50,7 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: tokens = [] list_text = text.lower().split() for element in list_text: - word = [token for token in element if token.isalpha()] + word = list(filter(str.isalpha, element)) if word: tokens.extend(word) tokens.append(self._end_of_word_token) @@ -147,9 +147,7 @@ def _put(self, element: str) -> None: In case of corrupt input arguments or invalid argument length, an element is not added to storage """ - if not isinstance(element, str) or len(element) != 1: - return None - if element not in self._storage: + if isinstance(element, str) and len(element) == 1 and element not in self._storage: self._storage[element] = len(self._storage) return None @@ -190,10 +188,10 @@ def fill_from_ngrams(self, content: dict) -> None: """ if not isinstance(content, dict) or not content: return None - for key in content['freq']: - for element in key: - if element.isalpha(): - self._put(element) + for n_gram in content['freq']: + for token in n_gram: + if token.isalpha(): + self._put(token) return None def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: @@ -243,12 +241,12 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional decoded_text = decoded_corpus[0].upper() for token in decoded_corpus[1:-1]: if token == self._end_of_word_token: - decoded_text += ' ' + decoded_text = f'{decoded_text} ' else: decoded_text += token if decoded_corpus[-1] != self._end_of_word_token: decoded_text += decoded_corpus[-1] - return decoded_text + '.' + return f'{decoded_text}.' class NGramLanguageModel: @@ -365,7 +363,8 @@ def _extract_n_grams( n_grams = [] list_encoded_corpus = list(encoded_corpus) for index in range(len(encoded_corpus) + 1 - self._n_gram_size): - n_grams.append(tuple(list_encoded_corpus[index: index + self._n_gram_size])) + n_gram = tuple(list_encoded_corpus[index: index + self._n_gram_size]) + n_grams.append(n_gram) return tuple(n_grams) @@ -415,8 +414,8 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: candidates = self._model.generate_next_token(encoded) if not candidates: break - best_candidate = ([letter for letter, freq in candidates.items() - if freq == max(candidates.values())]) + max_freq = max(candidates.values()) + best_candidate = list(filter(lambda x: candidates[x] == max_freq, candidates)) max_freq_letters = sorted(best_candidate) encoded += (max_freq_letters[0],) seq_len -= 1 From 3262c60e019420e5bba8165a363c4be621d2d7aa Mon Sep 17 00:00:00 2001 From: mmarina Date: Thu, 23 Nov 2023 23:04:51 +0300 Subject: [PATCH 66/81] changes --- lab_3_generate_by_ngrams/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 419764d27..834b9379e 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -149,7 +149,6 @@ def _put(self, element: str) -> None: """ if isinstance(element, str) and len(element) == 1 and element not in self._storage: self._storage[element] = len(self._storage) - return None def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: """ @@ -415,7 +414,8 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: if not candidates: break max_freq = max(candidates.values()) - best_candidate = list(filter(lambda x: candidates[x] == max_freq, candidates)) + best_candidate = ([letter for letter, freq in candidates.items() + if freq == max_freq]) max_freq_letters = sorted(best_candidate) encoded += (max_freq_letters[0],) seq_len -= 1 From 034b0f87367cddedbe2696c382a8cb37e13af381 Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 29 Nov 2023 10:28:41 +0300 Subject: [PATCH 67/81] second step --- lab_4_fill_words_by_ngrams/main.py | 31 ++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index b739ae182..8e061a918 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -28,6 +28,18 @@ def _tokenize(self, text: str) -> tuple[str, ...]: # type: ignore Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ + if not isinstance(text, str) or len(text) == 0: + raise ValueError + text_words = text.lower().split() + + tokens = [] + for word in text_words: + if word[-1] in '!?.': + tokens.extend([word[:len(word) - 1], self._end_of_word_token]) + elif word.isalpha(): + tokens.append(word) + + return tuple(tokens) def _put(self, element: str) -> None: """ @@ -39,6 +51,13 @@ def _put(self, element: str) -> None: Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ + if not isinstance(element, str) or len(element) == 0: + raise ValueError + + if element not in self._storage: + self._storage[element] = len(self._storage) + + return None def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # type: ignore """ @@ -56,6 +75,18 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ + if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0: + raise ValueError + + words_list = list(decoded_corpus) + sentences = (' '.join(words_list)).split('') + decoded_text = '' + for i, sentence in enumerate(sentences): + sentence = sentence.strip().capitalize() + decoded_text += f'{sentence}. ' + if decoded_corpus[-1] == '': + return decoded_text[:len(decoded_text) - 2].strip() + return decoded_text.strip() class TopPGenerator: From f3cd6f8af42e9227806840327221ea2d65ab23dd Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 6 Dec 2023 21:34:24 +0300 Subject: [PATCH 68/81] changes for 6 --- lab_4_fill_words_by_ngrams/main.py | 38 ++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 8e061a918..8fdbd38d1 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -4,6 +4,8 @@ Top-p sampling generation and filling gaps with ngrams """ # pylint:disable=too-few-public-methods, too-many-arguments +import random + from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) @@ -79,12 +81,12 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # raise ValueError words_list = list(decoded_corpus) - sentences = (' '.join(words_list)).split('') + sentences = (' '.join(words_list)).split(self._end_of_word_token) decoded_text = '' for i, sentence in enumerate(sentences): sentence = sentence.strip().capitalize() decoded_text += f'{sentence}. ' - if decoded_corpus[-1] == '': + if decoded_corpus[-1] == self._end_of_word_token: return decoded_text[:len(decoded_text) - 2].strip() return decoded_text.strip() @@ -111,6 +113,9 @@ def __init__( word_processor (WordProcessor): WordProcessor instance to handle text processing p_value (float): Collective probability mass threshold """ + self._model = language_model + self._word_processor = word_processor + self._p_value = p_value def run(self, seq_len: int, prompt: str) -> str: # type: ignore """ @@ -129,6 +134,35 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore or if sequence has inappropriate length, or if methods used return None. """ + if not (isinstance(seq_len, int) and seq_len > 0 + and isinstance(prompt, str) and prompt): + raise ValueError + encoded = self._word_processor.encode(prompt) + if not encoded: + raise ValueError + + for i in range(seq_len): + next_tokens = self._model.generate_next_token(encoded) + if next_tokens is None: + raise ValueError + if not next_tokens: + break + + sorted_dict = dict(sorted(list(next_tokens.items()), key=lambda x: (x[1], x[0]), reverse=True)) + probability = 0 + possible_tokens = () + for word, value in sorted_dict.items(): + probability += value + possible_tokens += (word,) + if probability >= self._p_value: + break + encoded += (random.choice(possible_tokens),) + + decoded = self._word_processor.decode(encoded) + if not decoded: + raise ValueError + + return decoded class GeneratorTypes: From 3e3f063a170558e22463ee977e31dc366a1e5240 Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 6 Dec 2023 21:36:04 +0300 Subject: [PATCH 69/81] score 6 --- lab_4_fill_words_by_ngrams/target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_4_fill_words_by_ngrams/target_score.txt b/lab_4_fill_words_by_ngrams/target_score.txt index 573541ac9..1e8b31496 100644 --- a/lab_4_fill_words_by_ngrams/target_score.txt +++ b/lab_4_fill_words_by_ngrams/target_score.txt @@ -1 +1 @@ -0 +6 From bc5e5536340c3fab9e4b47a4d8628db9b24a9794 Mon Sep 17 00:00:00 2001 From: mmarina Date: Wed, 6 Dec 2023 21:41:32 +0300 Subject: [PATCH 70/81] start --- lab_4_fill_words_by_ngrams/start.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index c41386377..cc77770a4 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,6 +2,7 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import +from lab_4_fill_words_by_ngrams.main import NGramLanguageModel, TopPGenerator, WordProcessor def main() -> None: @@ -10,7 +11,13 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - result = None + word_processor = WordProcessor('') + encoded_text = word_processor.encode(text) + model = NGramLanguageModel(encoded_text, 2) + model.build() + top_p = TopPGenerator(model, word_processor, 0.5) + result = top_p.run(51, 'Vernon') + print(result) assert result From 5098f1a9613c70d00d0e26a3b2f9b094f1b7f55a Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 12 Dec 2023 10:28:36 +0300 Subject: [PATCH 71/81] changes for 8 --- lab_4_fill_words_by_ngrams/main.py | 88 +++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 8fdbd38d1..552308fa6 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -5,6 +5,8 @@ """ # pylint:disable=too-few-public-methods, too-many-arguments import random +import math +import json from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) @@ -30,16 +32,17 @@ def _tokenize(self, text: str) -> tuple[str, ...]: # type: ignore Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ - if not isinstance(text, str) or len(text) == 0: + if not isinstance(text, str) or not text: raise ValueError - text_words = text.lower().split() tokens = [] - for word in text_words: + for word in text.lower().split(): if word[-1] in '!?.': tokens.extend([word[:len(word) - 1], self._end_of_word_token]) - elif word.isalpha(): - tokens.append(word) + else: + cleaned_word = [letter for letter in word if letter.isalpha()] + if cleaned_word: + tokens.append(''.join(cleaned_word)) return tuple(tokens) @@ -134,8 +137,8 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore or if sequence has inappropriate length, or if methods used return None. """ - if not (isinstance(seq_len, int) and seq_len > 0 - and isinstance(prompt, str) and prompt): + if (not isinstance(seq_len, int) or not isinstance(prompt, str) + or seq_len <= 0): raise ValueError encoded = self._word_processor.encode(prompt) if not encoded: @@ -179,6 +182,9 @@ def __init__(self) -> None: """ Initialize an instance of GeneratorTypes. """ + self.greedy = 0 + self.top_p = 1 + self.beam_search = 2 def get_conversion_generator_type(self, generator_type: int) -> str: # type: ignore """ @@ -190,6 +196,8 @@ def get_conversion_generator_type(self, generator_type: int) -> str: # type: ig Returns: (str): Name of the generator. """ + generators = ['Greedy Generator', 'Top-P Generator', 'Beam Search Generator'] + return generators[generator_type] class GenerationResultDTO: @@ -212,6 +220,9 @@ def __init__(self, text: str, perplexity: float, generation_type: int): generation_type (int): Numeric type of the generator for which perplexity was calculated """ + self.__text = text + self.__perplexity = perplexity + self.__type = generation_type def get_perplexity(self) -> float: # type: ignore """ @@ -220,6 +231,7 @@ def get_perplexity(self) -> float: # type: ignore Returns: (float): Perplexity value """ + return self.__perplexity def get_text(self) -> str: # type: ignore """ @@ -228,6 +240,7 @@ def get_text(self) -> str: # type: ignore Returns: (str): Text for which the perplexity was count """ + return self.__text def get_type(self) -> int: # type: ignore """ @@ -236,6 +249,7 @@ def get_type(self) -> int: # type: ignore Returns: (int): Numeric type of the generator """ + return self.__type def __str__(self) -> str: # type: ignore """ @@ -244,6 +258,9 @@ def __str__(self) -> str: # type: ignore Returns: (str): String with report """ + return (f'Perplexity score: {self.__perplexity}\n' + f'{GeneratorTypes().get_conversion_generator_type(self.__type)}\n' + f'Text: {self.__text}\n') class QualityChecker: @@ -268,6 +285,9 @@ def __init__( NGramLanguageModel instance to use for text generation word_processor (WordProcessor): WordProcessor instance to handle text processing """ + self._generators = generators + self._language_model = language_model + self._word_processor = word_processor def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore """ @@ -285,6 +305,27 @@ def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore or if methods used return None, or if nothing was generated. """ + if not isinstance(generated_text, str) or not generated_text: + raise ValueError + + encoded = self._word_processor.encode(generated_text) + if not encoded: + raise ValueError + + ngram_size = self._language_model.get_n_gram_size() + log_prob_sum = 0.0 + for index in range(ngram_size - 1, len(encoded)): + context = tuple(encoded[index - ngram_size + 1: index]) + next_tokens = self._language_model.generate_next_token(context) + if not next_tokens: + raise ValueError + + prob = next_tokens.get(encoded[index]) + if prob: + log_prob_sum += math.log(prob) + if not log_prob_sum: + raise ValueError + return math.exp(-log_prob_sum / (len(encoded) - ngram_size)) def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type: ignore """ @@ -304,6 +345,20 @@ def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type: or if sequence has inappropriate length, or if methods used return None. """ + if not isinstance(seq_len, int) or seq_len < 0 or not isinstance(prompt, str) or not prompt: + raise ValueError("Incorrect input") + results = [] + for num_type, generator in self._generators.items(): + text = generator.run(prompt=prompt, seq_len=seq_len) + if not text: + raise ValueError + + perplexity = self._calculate_perplexity(text) + if not perplexity: + raise ValueError + + results.append(GenerationResultDTO(text, perplexity, num_type)) + return sorted(results, key=lambda item: (perplexity, num_type)) class Examiner: @@ -323,6 +378,8 @@ def __init__(self, json_path: str) -> None: Args: json_path (str): Local path to assets file """ + self._json_path = json_path + self._questions_and_answers = self._load_from_json() def _load_from_json(self) -> dict[tuple[str, int], str]: # type: ignore """ @@ -338,6 +395,15 @@ def _load_from_json(self) -> dict[tuple[str, int], str]: # type: ignore or if attribute _json_path has inappropriate extension, or if inappropriate type loaded data. """ + if (not isinstance(self._json_path, str) or not self._json_path + or self._json_path[-5:] != ".json"): + raise ValueError + + with open(self._json_path, 'r', encoding='utf-8') as file: + question_and_answers = json.load(file) + if not isinstance(question_and_answers, list): + raise ValueError + return {(i['question'], i['location']): i['answer'] for i in question_and_answers} def provide_questions(self) -> list[tuple[str, int]]: # type: ignore """ @@ -347,6 +413,7 @@ def provide_questions(self) -> list[tuple[str, int]]: # type: ignore list[tuple[str, int]]: List in the form of [(question, position of the word to be filled)] """ + return list(self._questions_and_answers.keys()) def assess_exam(self, answers: dict[str, str]) -> float: # type: ignore """ @@ -361,6 +428,13 @@ def assess_exam(self, answers: dict[str, str]) -> float: # type: ignore Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ + if not isinstance(answers, dict) or not answers: + raise ValueError + + right_answers = ([key for key in self._questions_and_answers.keys() + if answers[key[0]] == self._questions_and_answers[key]]) + + return len(right_answers) / len(list(self._questions_and_answers.values())) class GeneratorRuleStudent: From b6cff90f63c64d325e0a9713737f2995d5eea007 Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 12 Dec 2023 10:31:17 +0300 Subject: [PATCH 72/81] mark 8 --- lab_4_fill_words_by_ngrams/target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_4_fill_words_by_ngrams/target_score.txt b/lab_4_fill_words_by_ngrams/target_score.txt index 1e8b31496..45a4fb75d 100644 --- a/lab_4_fill_words_by_ngrams/target_score.txt +++ b/lab_4_fill_words_by_ngrams/target_score.txt @@ -1 +1 @@ -6 +8 From 5fc6613e21f71deeb413c44d50cfb6dd41677979 Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 12 Dec 2023 12:43:16 +0300 Subject: [PATCH 73/81] corrections --- lab_4_fill_words_by_ngrams/main.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 552308fa6..812d4b8f3 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -4,9 +4,9 @@ Top-p sampling generation and filling gaps with ngrams """ # pylint:disable=too-few-public-methods, too-many-arguments -import random -import math import json +import math +import random from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) @@ -33,7 +33,7 @@ def _tokenize(self, text: str) -> tuple[str, ...]: # type: ignore ValueError: In case of inappropriate type input argument or if input argument is empty. """ if not isinstance(text, str) or not text: - raise ValueError + raise ValueError('WordProcessor._tokenize: Incorrect input') tokens = [] for word in text.lower().split(): @@ -57,13 +57,11 @@ def _put(self, element: str) -> None: ValueError: In case of inappropriate type input argument or if input argument is empty. """ if not isinstance(element, str) or len(element) == 0: - raise ValueError + raise ValueError('WordProcessor._put: Incorrect input') if element not in self._storage: self._storage[element] = len(self._storage) - return None - def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # type: ignore """ Convert decoded sentence into the string sequence. @@ -81,7 +79,7 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # ValueError: In case of inappropriate type input argument or if input argument is empty. """ if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0: - raise ValueError + raise ValueError('WordProcessor._postprocess_decoded_text: Incorrect input') words_list = list(decoded_corpus) sentences = (' '.join(words_list)).split(self._end_of_word_token) @@ -139,7 +137,7 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore """ if (not isinstance(seq_len, int) or not isinstance(prompt, str) or seq_len <= 0): - raise ValueError + raise ValueError("TopPGenerator.run: Incorrect input") encoded = self._word_processor.encode(prompt) if not encoded: raise ValueError @@ -151,7 +149,8 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore if not next_tokens: break - sorted_dict = dict(sorted(list(next_tokens.items()), key=lambda x: (x[1], x[0]), reverse=True)) + sorted_dict = dict(sorted(list(next_tokens.items()), + key=lambda x: (x[1], x[0]), reverse=True)) probability = 0 possible_tokens = () for word, value in sorted_dict.items(): @@ -306,7 +305,7 @@ def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore or if nothing was generated. """ if not isinstance(generated_text, str) or not generated_text: - raise ValueError + raise ValueError("QualityChecker._calculate_perplexity: Incorrect input") encoded = self._word_processor.encode(generated_text) if not encoded: @@ -346,7 +345,8 @@ def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type: or if methods used return None. """ if not isinstance(seq_len, int) or seq_len < 0 or not isinstance(prompt, str) or not prompt: - raise ValueError("Incorrect input") + raise ValueError("QualityChecker.run: Incorrect input") + results = [] for num_type, generator in self._generators.items(): text = generator.run(prompt=prompt, seq_len=seq_len) @@ -397,7 +397,7 @@ def _load_from_json(self) -> dict[tuple[str, int], str]: # type: ignore """ if (not isinstance(self._json_path, str) or not self._json_path or self._json_path[-5:] != ".json"): - raise ValueError + raise ValueError("Examiner._load_from_json: Incorrect input") with open(self._json_path, 'r', encoding='utf-8') as file: question_and_answers = json.load(file) @@ -429,7 +429,7 @@ def assess_exam(self, answers: dict[str, str]) -> float: # type: ignore ValueError: In case of inappropriate type input argument or if input argument is empty. """ if not isinstance(answers, dict) or not answers: - raise ValueError + raise ValueError("Examiner._load_from_json: Incorrect input") right_answers = ([key for key in self._questions_and_answers.keys() if answers[key[0]] == self._questions_and_answers[key]]) From ce220580b078118bcadf09321cd124322b15eaef Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 12 Dec 2023 14:15:53 +0300 Subject: [PATCH 74/81] corrections --- lab_4_fill_words_by_ngrams/main.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 812d4b8f3..1a205279d 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -140,7 +140,7 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore raise ValueError("TopPGenerator.run: Incorrect input") encoded = self._word_processor.encode(prompt) if not encoded: - raise ValueError + raise ValueError("TopPGenerator.run: Encoded is None") for i in range(seq_len): next_tokens = self._model.generate_next_token(encoded) @@ -152,7 +152,7 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore sorted_dict = dict(sorted(list(next_tokens.items()), key=lambda x: (x[1], x[0]), reverse=True)) probability = 0 - possible_tokens = () + possible_tokens = tuple() for word, value in sorted_dict.items(): probability += value possible_tokens += (word,) @@ -309,7 +309,7 @@ def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore encoded = self._word_processor.encode(generated_text) if not encoded: - raise ValueError + raise ValueError("QualityChecker._calculate_perplexity: Encoded is None") ngram_size = self._language_model.get_n_gram_size() log_prob_sum = 0.0 @@ -317,13 +317,13 @@ def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore context = tuple(encoded[index - ngram_size + 1: index]) next_tokens = self._language_model.generate_next_token(context) if not next_tokens: - raise ValueError + raise ValueError("QualityChecker._calculate_perplexity: Next_tokens is None") prob = next_tokens.get(encoded[index]) if prob: log_prob_sum += math.log(prob) if not log_prob_sum: - raise ValueError + raise ValueError("QualityChecker._calculate_perplexity: Log_prob_sum is None") return math.exp(-log_prob_sum / (len(encoded) - ngram_size)) def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type: ignore @@ -351,11 +351,11 @@ def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type: for num_type, generator in self._generators.items(): text = generator.run(prompt=prompt, seq_len=seq_len) if not text: - raise ValueError + raise ValueError("QualityChecker.run: Text is None") perplexity = self._calculate_perplexity(text) if not perplexity: - raise ValueError + raise ValueError("QualityChecker.run: Perplexity is None") results.append(GenerationResultDTO(text, perplexity, num_type)) return sorted(results, key=lambda item: (perplexity, num_type)) @@ -402,7 +402,7 @@ def _load_from_json(self) -> dict[tuple[str, int], str]: # type: ignore with open(self._json_path, 'r', encoding='utf-8') as file: question_and_answers = json.load(file) if not isinstance(question_and_answers, list): - raise ValueError + raise ValueError("Examiner._load_from_json: Question_and_answers is None") return {(i['question'], i['location']): i['answer'] for i in question_and_answers} def provide_questions(self) -> list[tuple[str, int]]: # type: ignore From f96b2ad9233ec4e4aba86c2b318a888bff31aa9f Mon Sep 17 00:00:00 2001 From: mmarina Date: Tue, 12 Dec 2023 14:28:30 +0300 Subject: [PATCH 75/81] corrections --- lab_4_fill_words_by_ngrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 1a205279d..3f50c33c5 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -149,7 +149,7 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore if not next_tokens: break - sorted_dict = dict(sorted(list(next_tokens.items()), + sorted_dict = dict(sorted(next_tokens.items(), key=lambda x: (x[1], x[0]), reverse=True)) probability = 0 possible_tokens = tuple() From 67eb3173c5fbbaba7edd7eccd7db0a0d32e48025 Mon Sep 17 00:00:00 2001 From: mmarina Date: Sat, 16 Dec 2023 23:37:04 +0300 Subject: [PATCH 76/81] changes for 10 --- lab_4_fill_words_by_ngrams/main.py | 62 +++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 3f50c33c5..daf4db8e8 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -33,11 +33,12 @@ def _tokenize(self, text: str) -> tuple[str, ...]: # type: ignore ValueError: In case of inappropriate type input argument or if input argument is empty. """ if not isinstance(text, str) or not text: - raise ValueError('WordProcessor._tokenize: Incorrect input') + raise ValueError('Incorrect input') tokens = [] + punctuation = '!?.' for word in text.lower().split(): - if word[-1] in '!?.': + if word[-1] in punctuation: tokens.extend([word[:len(word) - 1], self._end_of_word_token]) else: cleaned_word = [letter for letter in word if letter.isalpha()] @@ -57,7 +58,7 @@ def _put(self, element: str) -> None: ValueError: In case of inappropriate type input argument or if input argument is empty. """ if not isinstance(element, str) or len(element) == 0: - raise ValueError('WordProcessor._put: Incorrect input') + raise ValueError('Incorrect input') if element not in self._storage: self._storage[element] = len(self._storage) @@ -79,7 +80,7 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # ValueError: In case of inappropriate type input argument or if input argument is empty. """ if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0: - raise ValueError('WordProcessor._postprocess_decoded_text: Incorrect input') + raise ValueError('Incorrect input') words_list = list(decoded_corpus) sentences = (' '.join(words_list)).split(self._end_of_word_token) @@ -137,10 +138,10 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore """ if (not isinstance(seq_len, int) or not isinstance(prompt, str) or seq_len <= 0): - raise ValueError("TopPGenerator.run: Incorrect input") + raise ValueError("Incorrect input") encoded = self._word_processor.encode(prompt) if not encoded: - raise ValueError("TopPGenerator.run: Encoded is None") + raise ValueError("Encoded is None") for i in range(seq_len): next_tokens = self._model.generate_next_token(encoded) @@ -195,8 +196,12 @@ def get_conversion_generator_type(self, generator_type: int) -> str: # type: ig Returns: (str): Name of the generator. """ - generators = ['Greedy Generator', 'Top-P Generator', 'Beam Search Generator'] - return generators[generator_type] + types = { + self.greedy: 'Greedy Generator', + self.top_p: 'Top-P Generator', + self.beam_search: 'Beam Search Generator' + } + return types[generator_type] class GenerationResultDTO: @@ -305,11 +310,11 @@ def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore or if nothing was generated. """ if not isinstance(generated_text, str) or not generated_text: - raise ValueError("QualityChecker._calculate_perplexity: Incorrect input") + raise ValueError('Incorrect input') encoded = self._word_processor.encode(generated_text) if not encoded: - raise ValueError("QualityChecker._calculate_perplexity: Encoded is None") + raise ValueError('Encoded is None') ngram_size = self._language_model.get_n_gram_size() log_prob_sum = 0.0 @@ -317,13 +322,13 @@ def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore context = tuple(encoded[index - ngram_size + 1: index]) next_tokens = self._language_model.generate_next_token(context) if not next_tokens: - raise ValueError("QualityChecker._calculate_perplexity: Next_tokens is None") + raise ValueError('Next_tokens is None') prob = next_tokens.get(encoded[index]) if prob: log_prob_sum += math.log(prob) if not log_prob_sum: - raise ValueError("QualityChecker._calculate_perplexity: Log_prob_sum is None") + raise ValueError('Log_prob_sum is None') return math.exp(-log_prob_sum / (len(encoded) - ngram_size)) def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type: ignore @@ -345,17 +350,17 @@ def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type: or if methods used return None. """ if not isinstance(seq_len, int) or seq_len < 0 or not isinstance(prompt, str) or not prompt: - raise ValueError("QualityChecker.run: Incorrect input") + raise ValueError('Incorrect input') results = [] for num_type, generator in self._generators.items(): text = generator.run(prompt=prompt, seq_len=seq_len) if not text: - raise ValueError("QualityChecker.run: Text is None") + raise ValueError('Text is None') perplexity = self._calculate_perplexity(text) if not perplexity: - raise ValueError("QualityChecker.run: Perplexity is None") + raise ValueError('Perplexity is None') results.append(GenerationResultDTO(text, perplexity, num_type)) return sorted(results, key=lambda item: (perplexity, num_type)) @@ -397,12 +402,12 @@ def _load_from_json(self) -> dict[tuple[str, int], str]: # type: ignore """ if (not isinstance(self._json_path, str) or not self._json_path or self._json_path[-5:] != ".json"): - raise ValueError("Examiner._load_from_json: Incorrect input") + raise ValueError('Incorrect input') with open(self._json_path, 'r', encoding='utf-8') as file: question_and_answers = json.load(file) if not isinstance(question_and_answers, list): - raise ValueError("Examiner._load_from_json: Question_and_answers is None") + raise ValueError('Question_and_answers is None') return {(i['question'], i['location']): i['answer'] for i in question_and_answers} def provide_questions(self) -> list[tuple[str, int]]: # type: ignore @@ -429,7 +434,7 @@ def assess_exam(self, answers: dict[str, str]) -> float: # type: ignore ValueError: In case of inappropriate type input argument or if input argument is empty. """ if not isinstance(answers, dict) or not answers: - raise ValueError("Examiner._load_from_json: Incorrect input") + raise ValueError('Incorrect input') right_answers = ([key for key in self._questions_and_answers.keys() if answers[key[0]] == self._questions_and_answers[key]]) @@ -457,6 +462,11 @@ def __init__( NGramLanguageModel instance to use for text generation word_processor (WordProcessor): WordProcessor instance to handle text processing """ + self._generator_type = generator_type + generators = (GreedyTextGenerator(language_model, word_processor), + TopPGenerator(language_model, word_processor, 0.5), + BeamSearchTextGenerator(language_model, word_processor, 5)) + self._generator = generators[self._generator_type] def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]: # type: ignore """ @@ -474,6 +484,20 @@ def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]: # type: ig or if input argument is empty, or if methods used return None. """ + if not isinstance(tasks, list) or not tasks: + raise ValueError('Incorrect input') + + answers = {} + for (question, position) in tasks: + next_sequence = self._generator.run(seq_len=1, prompt=question[:position]) + if not next_sequence: + raise ValueError + + if next_sequence[-1] == '.': + next_sequence = next_sequence[:-1] + ' ' + answers.update({question: next_sequence + question[position:]}) + + return answers def get_generator_type(self) -> str: # type: ignore """ @@ -482,3 +506,5 @@ def get_generator_type(self) -> str: # type: ignore Returns: str: Generator type """ + generator = GeneratorTypes() + return generator.get_conversion_generator_type(self._generator_type) From 8399714958ee3149284b4eb5860722d4d2ca7c25 Mon Sep 17 00:00:00 2001 From: mmarina Date: Sat, 16 Dec 2023 23:39:10 +0300 Subject: [PATCH 77/81] mark 10 --- lab_4_fill_words_by_ngrams/target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_4_fill_words_by_ngrams/target_score.txt b/lab_4_fill_words_by_ngrams/target_score.txt index 45a4fb75d..f599e28b8 100644 --- a/lab_4_fill_words_by_ngrams/target_score.txt +++ b/lab_4_fill_words_by_ngrams/target_score.txt @@ -1 +1 @@ -8 +10 From 165bf2677a625c0c05aa98b290dcf6b4401f0528 Mon Sep 17 00:00:00 2001 From: mmarina Date: Sun, 17 Dec 2023 11:23:31 +0300 Subject: [PATCH 78/81] start --- lab_4_fill_words_by_ngrams/start.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index cc77770a4..eb45aac37 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,7 +2,7 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import -from lab_4_fill_words_by_ngrams.main import NGramLanguageModel, TopPGenerator, WordProcessor +import lab_4_fill_words_by_ngrams.main as main_py def main() -> None: @@ -11,13 +11,30 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - word_processor = WordProcessor('') + word_processor = main_py.WordProcessor('') encoded_text = word_processor.encode(text) - model = NGramLanguageModel(encoded_text, 2) + model = main_py.NGramLanguageModel(encoded_text, 2) model.build() - top_p = TopPGenerator(model, word_processor, 0.5) - result = top_p.run(51, 'Vernon') - print(result) + top_p = main_py.TopPGenerator(model, word_processor, 0.5) + top_p_result = top_p.run(51, 'Vernon') + print(top_p_result) + generator_types = main_py.GeneratorTypes() + generators = {generator_types.top_p: main_py.TopPGenerator(model, word_processor, 0.5), + generator_types.beam_search: + main_py.BeamSearchTextGenerator(model, word_processor, 5)} + quality_check = main_py.QualityChecker(generators, model, word_processor) + quality_result = quality_check.run(100, 'The') + print(quality_result) + examiner = main_py.Examiner('./assets/question_and_answers.json') + questions = examiner.provide_questions() + students = [main_py.GeneratorRuleStudent(i, model, word_processor) for i in range(3)] + for student in students: + answers = student.take_exam(questions) + result = examiner.assess_exam(answers) + generator_type = student.get_generator_type() + print('Type of generator:', generator_type) + print('Answers:', ''.join(answers.values())) + print('Accuracy:', result) assert result From b713d63d536e22f7cf820c89962769c3ac03e3c0 Mon Sep 17 00:00:00 2001 From: mmarina Date: Sun, 17 Dec 2023 12:08:26 +0300 Subject: [PATCH 79/81] corrections --- lab_4_fill_words_by_ngrams/main.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index daf4db8e8..f98a347aa 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -185,6 +185,11 @@ def __init__(self) -> None: self.greedy = 0 self.top_p = 1 self.beam_search = 2 + self._types = { + self.greedy: 'Greedy Generator', + self.top_p: 'Top-P Generator', + self.beam_search: 'Beam Search Generator' + } def get_conversion_generator_type(self, generator_type: int) -> str: # type: ignore """ @@ -196,12 +201,7 @@ def get_conversion_generator_type(self, generator_type: int) -> str: # type: ig Returns: (str): Name of the generator. """ - types = { - self.greedy: 'Greedy Generator', - self.top_p: 'Top-P Generator', - self.beam_search: 'Beam Search Generator' - } - return types[generator_type] + return self._types[generator_type] class GenerationResultDTO: @@ -491,7 +491,7 @@ def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]: # type: ig for (question, position) in tasks: next_sequence = self._generator.run(seq_len=1, prompt=question[:position]) if not next_sequence: - raise ValueError + raise ValueError('Next sequence is None') if next_sequence[-1] == '.': next_sequence = next_sequence[:-1] + ' ' From 0e7f98f7e5ddb264160cbb2f079b8e203ff1901f Mon Sep 17 00:00:00 2001 From: mmarina Date: Sun, 17 Dec 2023 12:10:12 +0300 Subject: [PATCH 80/81] corrections --- lab_4_fill_words_by_ngrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index f98a347aa..07763cca9 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -163,7 +163,7 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore decoded = self._word_processor.decode(encoded) if not decoded: - raise ValueError + raise ValueError('Decoded is None') return decoded From a48fe2620316e8ba0175fe2670cccbce8d4f0590 Mon Sep 17 00:00:00 2001 From: mmarina Date: Mon, 18 Dec 2023 15:06:38 +0300 Subject: [PATCH 81/81] corrections --- lab_4_fill_words_by_ngrams/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 07763cca9..daa2d8d86 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -57,7 +57,7 @@ def _put(self, element: str) -> None: Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ - if not isinstance(element, str) or len(element) == 0: + if not isinstance(element, str) or not element: raise ValueError('Incorrect input') if element not in self._storage: @@ -79,7 +79,7 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ - if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0: + if not isinstance(decoded_corpus, tuple) or not decoded_corpus: raise ValueError('Incorrect input') words_list = list(decoded_corpus) @@ -146,7 +146,7 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore for i in range(seq_len): next_tokens = self._model.generate_next_token(encoded) if next_tokens is None: - raise ValueError + raise ValueError('Next tokens are None') if not next_tokens: break