From 2b71dbe60816a1b2062fa41238c0e52b2abcf3fc Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Fri, 15 Sep 2023 10:45:01 +0300
Subject: [PATCH 01/81] add main

---
 lab_1_classify_by_unigrams/target_score.txt |  2 +-
 requirements.txt                            | 17 ++++++++++++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt
index 573541ac9..f599e28b8 100644
--- a/lab_1_classify_by_unigrams/target_score.txt
+++ b/lab_1_classify_by_unigrams/target_score.txt
@@ -1 +1 @@
-0
+10
diff --git a/requirements.txt b/requirements.txt
index 8b1378917..2c7efb1e5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,16 @@
-
+ast-comments==1.0.1
+black==22.6.0
+coverage[toml]==6.4.4
+ghapi==0.1.19
+flake8==6.0.0
+flake8-isort==6.0.0
+mypy==1.1.1
+pymarkdownlnt==0.9.9
+pymdown-extensions==9.5
+pydantic==1.10.7
+pylint==2.15.10
+pyspelling==2.7.3
+pytest==6.2.5
+regex==2023.3.23
+typed-argument-parser==1.8.1
+tqdm==4.64.1

From be684a1883f1ef85b141bf9208009bf020619510 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Fri, 15 Sep 2023 12:54:45 +0300
Subject: [PATCH 02/81] add git

---
 new attempt.py | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 new attempt.py

diff --git a/new attempt.py b/new attempt.py
new file mode 100644
index 000000000..3db933357
--- /dev/null
+++ b/new attempt.py	
@@ -0,0 +1 @@
+print(5)

From b6368f8d0a3b82a494157f74c1b8ae8e67eeed35 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 20 Sep 2023 12:25:59 +0300
Subject: [PATCH 03/81] git commit

---
 lab_1_classify_by_unigrams/main.py | 39 +++++++++++++++---------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index 486b3d65c..195af3fd6 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -2,32 +2,33 @@
 Lab 1
 Language detection
 """
-
-
 def tokenize(text: str) -> list[str] | None:
-    """
-    Splits a text into tokens, converts the tokens into lowercase,
-    removes punctuation, digits and other symbols
-    :param text: a text
-    :return: a list of lower-cased tokens without punctuation
-    """
+    text = text.lower()
+    cleaned_text = ""
+    for symbol in text:
+        for letter in symbol:
+            if letter.isalpha() and symbol != " ":
+                cleaned_text += letter
+    tokens = list(cleaned_text)
+    if not isinstance(text, str):
+        return None
+    return tokens
 
 
 def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None:
-    """
-    Calculates frequencies of given tokens
-    :param tokens: a list of tokens
-    :return: a dictionary with frequencies
-    """
+    freqs = {}
+    element_count = len(tokens)
+    for token in tokens:
+        if token in freqs:
+            freqs[token] += 1 / element_count
+        else:
+            freqs[token] = 1 / element_count
+    return freqs
 
 
 def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None:
-    """
-    Creates a language profile
-    :param language: a language
-    :param text: a text
-    :return: a dictionary with two keys – name, freq
-    """
+    dict_language_profile = {"name": language, "freq": calculate_frequencies(tokenize(text))}
+    return dict_language_profile
 
 
 def calculate_mse(predicted: list, actual: list) -> float | None:

From 323fc7c4e3a9b5d5b8ef569eca1d6f8fcae1608c Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 21 Sep 2023 19:44:39 +0300
Subject: [PATCH 04/81] added fixes

---
 lab_1_classify_by_unigrams/main.py | 23 ++++++++++++-----------
 new attempt.py                     |  1 -
 2 files changed, 12 insertions(+), 12 deletions(-)
 delete mode 100644 new attempt.py

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index 195af3fd6..20c57d60c 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -3,26 +3,27 @@
 Language detection
 """
 def tokenize(text: str) -> list[str] | None:
-    text = text.lower()
-    cleaned_text = ""
-    for symbol in text:
-        for letter in symbol:
-            if letter.isalpha() and symbol != " ":
-                cleaned_text += letter
-    tokens = list(cleaned_text)
     if not isinstance(text, str):
         return None
-    return tokens
+    else:
+        text = text.lower()
+        cleaned_text = ""
+        for symbol in text:
+            if symbol.isalpha() and symbol != " ":
+                cleaned_text += symbol
+        tokens = list(cleaned_text)
+        return tokens
 
 
 def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None:
     freqs = {}
-    element_count = len(tokens)
     for token in tokens:
         if token in freqs:
-            freqs[token] += 1 / element_count
+            freqs[token] += 1
         else:
-            freqs[token] = 1 / element_count
+            freqs[token] = 1
+    for token, freq in freqs:
+        freqs[token] = freq / len(tokens)
     return freqs
 
 
diff --git a/new attempt.py b/new attempt.py
deleted file mode 100644
index 3db933357..000000000
--- a/new attempt.py	
+++ /dev/null
@@ -1 +0,0 @@
-print(5)

From 01ea1a1de204f7b12b1f95ad1a3bd993ddcd8213 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 27 Sep 2023 21:43:44 +0300
Subject: [PATCH 05/81] add fixes

---
 lab_1_classify_by_unigrams/main.py | 43 +++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index e91bed965..a368c6819 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -22,7 +22,7 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None:
             freqs[token] += 1
         else:
             freqs[token] = 1
-    for token, freq in freqs:
+    for token, freq in freqs.items():
         freqs[token] = freq / len(tokens)
     return freqs
 
@@ -39,6 +39,17 @@ def calculate_mse(predicted: list, actual: list) -> float | None:
     :param actual: a list of actual values
     :return: the score
     """
+    count_actual = len(actual)
+    count_predicted = len(predicted)
+    summ_values = 0
+    if isinstance(actual, list) and isinstance(predicted, list) and count_actual == count_predicted:
+        squared_difference = [(actual_value - predicted_value)**2 for actual_value, predicted_value in zip(actual,predicted)]
+        for value in squared_difference:
+            summ_values += value
+        mse = round(summ_values / count_actual, 3)
+        return mse
+    else:
+        return None
 
 
 def compare_profiles(
@@ -51,6 +62,23 @@ def compare_profiles(
     :param profile_to_compare: a dictionary of a profile to compare the unknown profile to
     :return: the distance between the profiles
     """
+    if isinstance(unknown_profile, dict) and isinstance(profile_to_compare, dict):
+        values_unknown_profile = unknown_profile['freq']
+        values_profile_to_compare = profile_to_compare['freq']
+        for letter in values_unknown_profile:
+            if letter not in values_profile_to_compare:
+                values_profile_to_compare[letter] = 0
+        for letter in values_profile_to_compare:
+            if letter not in values_unknown_profile:
+                values_unknown_profile[letter] = 0
+        sorted_unknown_profile = dict(sorted(values_unknown_profile.items()))
+        sorted_profile_to_compare = dict(sorted(values_profile_to_compare.items()))
+        list_unknown_profile = list(sorted_unknown_profile.values())
+        list_profile_to_compare = list(sorted_profile_to_compare.values())
+        profile_difference = calculate_mse(list_unknown_profile, list_profile_to_compare)
+        return profile_difference
+    else:
+        return None
 
 
 def detect_language(
@@ -65,6 +93,19 @@ def detect_language(
     :param profile_2: a dictionary of a known profile
     :return: a language
     """
+    if isinstance(unknown_profile, dict) and isinstance(profile_1, dict) and isinstance(profile_2, dict):
+        mse_profile_1 = compare_profiles(unknown_profile, profile_1)
+        mse_profile_2 = compare_profiles(unknown_profile, profile_2)
+        if mse_profile_1 < mse_profile_2:
+            return profile_1['name']
+        elif mse_profile_2 < mse_profile_1:
+            return profile_2['name']
+        else:
+            str_name_language = sorted(profile_1['name'] + profile_2['name'])
+            first_name = str_name_language[0]
+            return first_name
+    else:
+        return None
 
 
 def load_profile(path_to_file: str) -> dict | None:

From 05842a0d87e025e380a01261b21cbada3b1ff660 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Fri, 29 Sep 2023 09:48:25 +0300
Subject: [PATCH 06/81] i start do lab

---
 lab_1_classify_by_unigrams/start.py         | 9 ++++++---
 lab_1_classify_by_unigrams/target_score.txt | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py
index db7a1a904..fcab139a4 100644
--- a/lab_1_classify_by_unigrams/start.py
+++ b/lab_1_classify_by_unigrams/start.py
@@ -1,19 +1,22 @@
 """
 Language detection starter
 """
-
-
+from lab_1_classify_by_unigrams.main import create_language_profile
+from lab_1_classify_by_unigrams.main import detect_language
 def main() -> None:
     """
     Launches an implementation
     """
     with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en:
         en_text = file_to_read_en.read()
+        en_profile = create_language_profile("en", en_text)
     with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de:
         de_text = file_to_read_de.read()
+        de_profile = create_language_profile("de", de_text)
     with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk:
         unknown_text = file_to_read_unk.read()
-    result = None
+        unknown_profile = create_language_profile("unk", unknown_text)
+    result = detect_language(unknown_profile, en_profile, de_profile)
     assert result, "Detection result is None"
 
 
diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt
index f599e28b8..45a4fb75d 100644
--- a/lab_1_classify_by_unigrams/target_score.txt
+++ b/lab_1_classify_by_unigrams/target_score.txt
@@ -1 +1 @@
-10
+8

From 5604898e0667a927720b14b7265ed312ab0c4388 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Fri, 29 Sep 2023 09:48:35 +0300
Subject: [PATCH 07/81] i start do lab

---
 requirements.txt | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 2c7efb1e5..e69de29bb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,16 +0,0 @@
-ast-comments==1.0.1
-black==22.6.0
-coverage[toml]==6.4.4
-ghapi==0.1.19
-flake8==6.0.0
-flake8-isort==6.0.0
-mypy==1.1.1
-pymarkdownlnt==0.9.9
-pymdown-extensions==9.5
-pydantic==1.10.7
-pylint==2.15.10
-pyspelling==2.7.3
-pytest==6.2.5
-regex==2023.3.23
-typed-argument-parser==1.8.1
-tqdm==4.64.1

From 9af8720803e32981d996dee6252eecf597c581db Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Mon, 2 Oct 2023 22:34:12 +0300
Subject: [PATCH 08/81] added fixes

---
 lab_1_classify_by_unigrams/main.py | 114 +++++++++++++++++------------
 1 file changed, 68 insertions(+), 46 deletions(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index a368c6819..e34346033 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -2,20 +2,35 @@
 Lab 1
 Language detection
 """
+
+
 def tokenize(text: str) -> list[str] | None:
+    """
+        Splits a text into tokens, converts the tokens into lowercase,
+        removes punctuation, digits and other symbols
+        :param text: a text
+        :return: a list of lower-cased tokens without punctuation
+        """
     if not isinstance(text, str):
         return None
-    else:
-        text = text.lower()
-        cleaned_text = ""
-        for symbol in text:
-            if symbol.isalpha() and symbol != " ":
-                cleaned_text += symbol
-        tokens = list(cleaned_text)
-        return tokens
+    cleaned_text = []
+    for symbol in text:
+        if symbol.isalpha():
+            cleaned_text.append(symbol.lower())
+    return cleaned_text
 
 
 def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None:
+    """
+       Calculates frequencies of given tokens
+       :param tokens: a list of tokens
+       :return: a dictionary with frequencies
+       """
+    if not isinstance(tokens, list):
+        return None
+    for token in tokens:
+        if not isinstance(token, str):
+            return None
     freqs = {}
     for token in tokens:
         if token in freqs:
@@ -28,6 +43,14 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None:
 
 
 def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None:
+    """
+        Creates a language profile
+        :param language: a language
+        :param text: a text
+        :return: a dictionary with two keys – name, freq
+        """
+    if not isinstance(language, str) or not isinstance(text, str):
+        return None
     dict_language_profile = {"name": language, "freq": calculate_frequencies(tokenize(text))}
     return dict_language_profile
 
@@ -39,17 +62,13 @@ def calculate_mse(predicted: list, actual: list) -> float | None:
     :param actual: a list of actual values
     :return: the score
     """
-    count_actual = len(actual)
-    count_predicted = len(predicted)
-    summ_values = 0
-    if isinstance(actual, list) and isinstance(predicted, list) and count_actual == count_predicted:
-        squared_difference = [(actual_value - predicted_value)**2 for actual_value, predicted_value in zip(actual,predicted)]
-        for value in squared_difference:
-            summ_values += value
-        mse = round(summ_values / count_actual, 3)
-        return mse
-    else:
+    if not isinstance(actual, list) or not isinstance(predicted, list) or len(actual) != len(predicted):
         return None
+    summ_values = 0
+    for i, predicted_value in enumerate(predicted):
+        summ_values += (actual[i] - predicted_value)**2
+    mse = round(summ_values / len(actual), 4)
+    return mse
 
 
 def compare_profiles(
@@ -62,23 +81,25 @@ def compare_profiles(
     :param profile_to_compare: a dictionary of a profile to compare the unknown profile to
     :return: the distance between the profiles
     """
-    if isinstance(unknown_profile, dict) and isinstance(profile_to_compare, dict):
-        values_unknown_profile = unknown_profile['freq']
-        values_profile_to_compare = profile_to_compare['freq']
-        for letter in values_unknown_profile:
-            if letter not in values_profile_to_compare:
-                values_profile_to_compare[letter] = 0
-        for letter in values_profile_to_compare:
-            if letter not in values_unknown_profile:
-                values_unknown_profile[letter] = 0
-        sorted_unknown_profile = dict(sorted(values_unknown_profile.items()))
-        sorted_profile_to_compare = dict(sorted(values_profile_to_compare.items()))
-        list_unknown_profile = list(sorted_unknown_profile.values())
-        list_profile_to_compare = list(sorted_profile_to_compare.values())
-        profile_difference = calculate_mse(list_unknown_profile, list_profile_to_compare)
-        return profile_difference
-    else:
+    if (not isinstance(unknown_profile, dict) or
+            not isinstance(profile_to_compare, dict) or
+            'name' not in unknown_profile or
+            'name' not in profile_to_compare):
         return None
+    values_unknown_profile = unknown_profile['freq']
+    values_profile_to_compare = profile_to_compare['freq']
+    for letter in values_unknown_profile:
+        if letter not in values_profile_to_compare:
+            values_profile_to_compare[letter] = 0
+    for letter in values_profile_to_compare:
+        if letter not in values_unknown_profile:
+            values_unknown_profile[letter] = 0
+    sorted_unknown_profile = dict(sorted(values_unknown_profile.items()))
+    sorted_profile_to_compare = dict(sorted(values_profile_to_compare.items()))
+    list_unknown_profile = list(sorted_unknown_profile.values())
+    list_profile_to_compare = list(sorted_profile_to_compare.values())
+    profile_difference = calculate_mse(list_unknown_profile, list_profile_to_compare)
+    return profile_difference
 
 
 def detect_language(
@@ -93,19 +114,20 @@ def detect_language(
     :param profile_2: a dictionary of a known profile
     :return: a language
     """
-    if isinstance(unknown_profile, dict) and isinstance(profile_1, dict) and isinstance(profile_2, dict):
-        mse_profile_1 = compare_profiles(unknown_profile, profile_1)
-        mse_profile_2 = compare_profiles(unknown_profile, profile_2)
-        if mse_profile_1 < mse_profile_2:
-            return profile_1['name']
-        elif mse_profile_2 < mse_profile_1:
-            return profile_2['name']
-        else:
-            str_name_language = sorted(profile_1['name'] + profile_2['name'])
-            first_name = str_name_language[0]
-            return first_name
-    else:
+    if (not isinstance(unknown_profile, dict) or
+            not isinstance(profile_1, dict) or
+            not isinstance(profile_2, dict)):
         return None
+    mse_profile_1 = compare_profiles(unknown_profile, profile_1)
+    mse_profile_2 = compare_profiles(unknown_profile, profile_2)
+    if mse_profile_1 < mse_profile_2:
+        return profile_1['name']
+    if mse_profile_2 < mse_profile_1:
+        return profile_2['name']
+    else:
+        str_name_language = sorted(profile_1['name'] + profile_2['name'])
+        first_name = str_name_language[0]
+        return first_name
 
 
 def load_profile(path_to_file: str) -> dict | None:

From 4c329e5378500ad1f53b4779ba00e7ddc3c68a55 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 3 Oct 2023 09:16:03 +0300
Subject: [PATCH 09/81] added fixes

---
 lab_1_classify_by_unigrams/main.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index e34346033..068a46608 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -62,7 +62,9 @@ def calculate_mse(predicted: list, actual: list) -> float | None:
     :param actual: a list of actual values
     :return: the score
     """
-    if not isinstance(actual, list) or not isinstance(predicted, list) or len(actual) != len(predicted):
+    if (not isinstance(actual, list) or
+            not isinstance(predicted, list) or
+            len(actual) != len(predicted)):
         return None
     summ_values = 0
     for i, predicted_value in enumerate(predicted):
@@ -120,14 +122,15 @@ def detect_language(
         return None
     mse_profile_1 = compare_profiles(unknown_profile, profile_1)
     mse_profile_2 = compare_profiles(unknown_profile, profile_2)
-    if mse_profile_1 < mse_profile_2:
-        return profile_1['name']
-    if mse_profile_2 < mse_profile_1:
-        return profile_2['name']
-    else:
-        str_name_language = sorted(profile_1['name'] + profile_2['name'])
-        first_name = str_name_language[0]
-        return first_name
+    if (isinstance(mse_profile_1, float)
+            and isinstance(mse_profile_2, float)):
+        if mse_profile_1 < mse_profile_2:
+            return profile_1['name']
+        if mse_profile_2 < mse_profile_1:
+            return profile_2['name']
+    str_name_language = sorted(profile_1['name'] + profile_2['name'])
+    first_name = str_name_language[0]
+    return first_name
 
 
 def load_profile(path_to_file: str) -> dict | None:

From 78e5f4a82b4c40b865584762f79d27dabceae5b5 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 3 Oct 2023 09:24:29 +0300
Subject: [PATCH 10/81] added fixes in start

---
 lab_1_classify_by_unigrams/start.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py
index fcab139a4..e80b67f86 100644
--- a/lab_1_classify_by_unigrams/start.py
+++ b/lab_1_classify_by_unigrams/start.py
@@ -1,24 +1,27 @@
 """
 Language detection starter
 """
-from lab_1_classify_by_unigrams.main import create_language_profile
-from lab_1_classify_by_unigrams.main import detect_language
+from lab_1_classify_by_unigrams.main import create_language_profile, detect_language
+
+
 def main() -> None:
     """
     Launches an implementation
     """
     with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en:
         en_text = file_to_read_en.read()
-        en_profile = create_language_profile("en", en_text)
     with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de:
         de_text = file_to_read_de.read()
-        de_profile = create_language_profile("de", de_text)
     with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk:
         unknown_text = file_to_read_unk.read()
-        unknown_profile = create_language_profile("unk", unknown_text)
-    result = detect_language(unknown_profile, en_profile, de_profile)
-    assert result, "Detection result is None"
+    en_profile = create_language_profile('en', en_text)
+    de_profile = create_language_profile('de', de_text)
+    unknown_profile = create_language_profile('unknown', unknown_text)
+    if (isinstance(en_profile, dict) and isinstance(de_profile, dict) and
+            isinstance(unknown_profile, dict)):
+        result = detect_language(unknown_profile, en_profile, de_profile)
+        assert result, "Detection result is None"
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file

From b8f85b9738c47f1027de430b8e04a397d5029b16 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 3 Oct 2023 10:08:25 +0300
Subject: [PATCH 11/81] added fixes

---
 lab_1_classify_by_unigrams/main.py | 45 +++++++++++-------------------
 1 file changed, 16 insertions(+), 29 deletions(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index 068a46608..eba79d11c 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -51,8 +51,7 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st
         """
     if not isinstance(language, str) or not isinstance(text, str):
         return None
-    dict_language_profile = {"name": language, "freq": calculate_frequencies(tokenize(text))}
-    return dict_language_profile
+    return {"name": language, "freq": calculate_frequencies(tokenize(text))}
 
 
 def calculate_mse(predicted: list, actual: list) -> float | None:
@@ -62,8 +61,7 @@ def calculate_mse(predicted: list, actual: list) -> float | None:
     :param actual: a list of actual values
     :return: the score
     """
-    if (not isinstance(actual, list) or
-            not isinstance(predicted, list) or
+    if (not isinstance(actual, list) or not isinstance(predicted, list) or
             len(actual) != len(predicted)):
         return None
     summ_values = 0
@@ -83,25 +81,17 @@ def compare_profiles(
     :param profile_to_compare: a dictionary of a profile to compare the unknown profile to
     :return: the distance between the profiles
     """
-    if (not isinstance(unknown_profile, dict) or
-            not isinstance(profile_to_compare, dict) or
-            'name' not in unknown_profile or
-            'name' not in profile_to_compare):
+    if (not isinstance(unknown_profile, dict) or not isinstance(profile_to_compare, dict) or
+            'name' not in unknown_profile or 'name' not in profile_to_compare):
         return None
-    values_unknown_profile = unknown_profile['freq']
-    values_profile_to_compare = profile_to_compare['freq']
-    for letter in values_unknown_profile:
-        if letter not in values_profile_to_compare:
-            values_profile_to_compare[letter] = 0
-    for letter in values_profile_to_compare:
-        if letter not in values_unknown_profile:
-            values_unknown_profile[letter] = 0
-    sorted_unknown_profile = dict(sorted(values_unknown_profile.items()))
-    sorted_profile_to_compare = dict(sorted(values_profile_to_compare.items()))
-    list_unknown_profile = list(sorted_unknown_profile.values())
-    list_profile_to_compare = list(sorted_profile_to_compare.values())
-    profile_difference = calculate_mse(list_unknown_profile, list_profile_to_compare)
-    return profile_difference
+    tokens = set(profile_to_compare['freq'].keys())
+    tokens.update(unknown_profile['freq'].keys())
+    list_unknown_profile = []
+    list_profile_to_compare = []
+    for letter in tokens:
+        list_profile_to_compare.append(profile_to_compare['freq'].get(letter, 0))
+        list_unknown_profile.append(unknown_profile['freq'].get(letter, 0))
+    return calculate_mse(list_profile_to_compare, list_unknown_profile)
 
 
 def detect_language(
@@ -116,8 +106,7 @@ def detect_language(
     :param profile_2: a dictionary of a known profile
     :return: a language
     """
-    if (not isinstance(unknown_profile, dict) or
-            not isinstance(profile_1, dict) or
+    if (not isinstance(unknown_profile, dict) or not isinstance(profile_1, dict) or
             not isinstance(profile_2, dict)):
         return None
     mse_profile_1 = compare_profiles(unknown_profile, profile_1)
@@ -125,12 +114,10 @@ def detect_language(
     if (isinstance(mse_profile_1, float)
             and isinstance(mse_profile_2, float)):
         if mse_profile_1 < mse_profile_2:
-            return profile_1['name']
+            return str(profile_1['name'])
         if mse_profile_2 < mse_profile_1:
-            return profile_2['name']
-    str_name_language = sorted(profile_1['name'] + profile_2['name'])
-    first_name = str_name_language[0]
-    return first_name
+            return str(profile_2['name'])
+    return sorted([str(profile_1['name']), str(profile_2['name'])])[0]
 
 
 def load_profile(path_to_file: str) -> dict | None:

From b9f756748cf49cba08f2f56fbfaa475336648a08 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 3 Oct 2023 10:10:10 +0300
Subject: [PATCH 12/81] added fixes in start

---
 lab_1_classify_by_unigrams/start.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py
index e80b67f86..9e9930ece 100644
--- a/lab_1_classify_by_unigrams/start.py
+++ b/lab_1_classify_by_unigrams/start.py
@@ -24,4 +24,4 @@ def main() -> None:
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

From 85ff8e27fa443e87c3eddc32a0ab4486f3fe911b Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 3 Oct 2023 10:16:29 +0300
Subject: [PATCH 13/81] added fixes

---
 lab_1_classify_by_unigrams/main.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index eba79d11c..2fe7aa8a7 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -51,7 +51,10 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st
         """
     if not isinstance(language, str) or not isinstance(text, str):
         return None
-    return {"name": language, "freq": calculate_frequencies(tokenize(text))}
+    values_freq = calculate_frequencies(tokenize(text))
+    if not isinstance(values_freq, dict):
+        return None
+    return {'name': language, 'freq': values_freq}
 
 
 def calculate_mse(predicted: list, actual: list) -> float | None:

From c0b809ffbb6b3c691fd6606f3962fbcfbaca99e1 Mon Sep 17 00:00:00 2001
From: mmarina2004 <134407899+mmarina2004@users.noreply.github.com>
Date: Tue, 3 Oct 2023 10:49:01 +0300
Subject: [PATCH 14/81] Delete requirements.txt

delete requirements
---
 requirements.txt | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index e69de29bb..000000000

From 9586a872610238ec24059cc8fd55703a613850b6 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 3 Oct 2023 14:09:45 +0300
Subject: [PATCH 15/81] recovery

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index e69de29bb..8b1378917 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1 @@
+

From 72e3aff37f134850292fec72f55d4cefa989a6ad Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 4 Oct 2023 12:09:40 +0300
Subject: [PATCH 16/81] added fixes for 10

---
 lab_1_classify_by_unigrams/main.py | 44 ++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index 2fe7aa8a7..8a997343a 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -3,6 +3,8 @@
 Language detection
 """
 
+import json
+
 
 def tokenize(text: str) -> list[str] | None:
     """
@@ -129,6 +131,13 @@ def load_profile(path_to_file: str) -> dict | None:
     :param path_to_file: a path to the language profile
     :return: a dictionary with at least two keys – name, freq
     """
+    if not isinstance(path_to_file, str):
+        return None
+    with open(path_to_file, "r", encoding="utf-8") as json_file:
+        language_profile = json.load(json_file)
+    if not isinstance(language_profile, dict):
+        return None
+    return language_profile
 
 
 def preprocess_profile(profile: dict) -> dict[str, str | dict] | None:
@@ -138,6 +147,19 @@ def preprocess_profile(profile: dict) -> dict[str, str | dict] | None:
     :return: a dict with a lower-cased loaded profile
     with relative frequencies without unnecessary ngrams
     """
+    if (not isinstance(profile, dict) or 'name' not in profile
+            or 'freq' not in profile or 'n_words' not in profile):
+        return None
+    n_words = profile.pop('n_words')
+    new_freq = {}
+    for key, value in profile['freq'].items():
+        if key.isalpha() and len(key) == 1:
+            if key.lower() not in new_freq:
+                new_freq[key.lower()] = value / n_words[0]
+            else:
+                new_freq[key.lower()] += value / n_words[0]
+    processed_profile = {'name': profile['name'], 'freq': new_freq}
+    return processed_profile
 
 
 def collect_profiles(paths_to_profiles: list) -> list[dict[str, str | dict[str, float]]] | None:
@@ -146,6 +168,17 @@ def collect_profiles(paths_to_profiles: list) -> list[dict[str, str | dict[str,
     :paths_to_profiles: a list of strings to the profiles
     :return: a list of loaded profiles
     """
+    if not isinstance(paths_to_profiles, list):
+        return None
+    list_processed_profiles = []
+    for paths in paths_to_profiles:
+        if isinstance(paths, str):
+            language_profile = load_profile(paths)
+            if isinstance(language_profile, dict):
+                processed_profile = preprocess_profile(language_profile)
+                if isinstance(processed_profile, dict):
+                    list_processed_profiles.append(processed_profile)
+    return list_processed_profiles
 
 
 def detect_language_advanced(unknown_profile: dict[str, str | dict[str, float]],
@@ -156,6 +189,14 @@ def detect_language_advanced(unknown_profile: dict[str, str | dict[str, float]],
     :param known_profiles: a list of known profiles
     :return: a sorted list of tuples containing a language and a distance
     """
+    if not isinstance(unknown_profile, dict) or not isinstance(known_profiles, list):
+        return None
+    list_mse = []
+    for profile in known_profiles:
+        if isinstance(profile, dict):
+            list_mse.append((profile['name'], compare_profiles(unknown_profile, profile)))
+    list_mse.sort(key=lambda a: (a[1], a[0]))
+    return list_mse
 
 
 def print_report(detections: list[tuple[str, float]]) -> None:
@@ -163,3 +204,6 @@ def print_report(detections: list[tuple[str, float]]) -> None:
     Prints report for detection of language
     :param detections: a list with distances for each available language
     """
+    if isinstance(detections, list):
+        for profile in detections:
+            print(f'{profile[0]}: MSE {profile[1]:.5f}')

From e90202725bb146e2d19bca918779c80a609772ba Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 4 Oct 2023 12:16:23 +0300
Subject: [PATCH 17/81] mark 10

---
 lab_1_classify_by_unigrams/target_score.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt
index 45a4fb75d..f599e28b8 100644
--- a/lab_1_classify_by_unigrams/target_score.txt
+++ b/lab_1_classify_by_unigrams/target_score.txt
@@ -1 +1 @@
-8
+10

From 81a86ba58ddc6b50a25049f3be44277515784fbe Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 4 Oct 2023 22:52:06 +0300
Subject: [PATCH 18/81] added fixes for 10

---
 lab_1_classify_by_unigrams/main.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index 8a997343a..d6c888daf 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -150,15 +150,13 @@ def preprocess_profile(profile: dict) -> dict[str, str | dict] | None:
     if (not isinstance(profile, dict) or 'name' not in profile
             or 'freq' not in profile or 'n_words' not in profile):
         return None
-    n_words = profile.pop('n_words')
     new_freq = {}
     for key, value in profile['freq'].items():
-        if key.isalpha() and len(key) == 1:
-            if key.lower() not in new_freq:
-                new_freq[key.lower()] = value / n_words[0]
-            else:
-                new_freq[key.lower()] += value / n_words[0]
-    processed_profile = {'name': profile['name'], 'freq': new_freq}
+        if key.lower() in new_freq:
+            new_freq[key.lower()] += value / profile["n_words"][0]
+        elif len(key) == 1:
+            new_freq[key.lower()] = value / profile["n_words"][0]
+    processed_profile = {'name': profile.get("name"), 'freq': new_freq}
     return processed_profile
 
 

From ae7ffdf577693e9f968302e46fb1085c48ff3f66 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 5 Oct 2023 12:03:37 +0300
Subject: [PATCH 19/81] added fixes

---
 lab_1_classify_by_unigrams/main.py | 40 ++++++++++++++----------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index d6c888daf..4733ea691 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -8,26 +8,22 @@
 
 def tokenize(text: str) -> list[str] | None:
     """
-        Splits a text into tokens, converts the tokens into lowercase,
-        removes punctuation, digits and other symbols
-        :param text: a text
-        :return: a list of lower-cased tokens without punctuation
-        """
+    Splits a text into tokens, converts the tokens into lowercase,
+    removes punctuation, digits and other symbols
+    :param text: a text
+    :return: a list of lower-cased tokens without punctuation
+    """
     if not isinstance(text, str):
         return None
-    cleaned_text = []
-    for symbol in text:
-        if symbol.isalpha():
-            cleaned_text.append(symbol.lower())
-    return cleaned_text
+    return [symbol.lower() for symbol in text if symbol.isalpha()]
 
 
 def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None:
     """
-       Calculates frequencies of given tokens
-       :param tokens: a list of tokens
-       :return: a dictionary with frequencies
-       """
+    Calculates frequencies of given tokens
+    :param tokens: a list of tokens
+    :return: a dictionary with frequencies
+    """
     if not isinstance(tokens, list):
         return None
     for token in tokens:
@@ -46,11 +42,11 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None:
 
 def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None:
     """
-        Creates a language profile
-        :param language: a language
-        :param text: a text
-        :return: a dictionary with two keys – name, freq
-        """
+    Creates a language profile
+    :param language: a language
+    :param text: a text
+    :return: a dictionary with two keys – name, freq
+    """
     if not isinstance(language, str) or not isinstance(text, str):
         return None
     values_freq = calculate_frequencies(tokenize(text))
@@ -70,8 +66,10 @@ def calculate_mse(predicted: list, actual: list) -> float | None:
             len(actual) != len(predicted)):
         return None
     summ_values = 0
-    for i, predicted_value in enumerate(predicted):
-        summ_values += (actual[i] - predicted_value)**2
+    squared_difference = ([(actual_value - predicted_value)**2
+                           for actual_value, predicted_value in zip(actual, predicted)])
+    for value in squared_difference:
+        summ_values += value
     mse = round(summ_values / len(actual), 4)
     return mse
 

From b7516ad8d5235f5aa4c771552d84957157d12ccb Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 5 Oct 2023 12:09:31 +0300
Subject: [PATCH 20/81] added fixes

---
 lab_1_classify_by_unigrams/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index 4733ea691..e2bcc051a 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -154,7 +154,7 @@ def preprocess_profile(profile: dict) -> dict[str, str | dict] | None:
             new_freq[key.lower()] += value / profile["n_words"][0]
         elif len(key) == 1:
             new_freq[key.lower()] = value / profile["n_words"][0]
-    processed_profile = {'name': profile.get("name"), 'freq': new_freq}
+    processed_profile = {'name': profile["name"], 'freq': new_freq}
     return processed_profile
 
 

From 0fdf5bd8545611abb0304af5bbb3abab6084de25 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 5 Oct 2023 12:48:41 +0300
Subject: [PATCH 21/81] added fixed

---
 lab_1_classify_by_unigrams/main.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index e2bcc051a..3762b16bf 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -31,10 +31,9 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None:
             return None
     freqs = {}
     for token in tokens:
-        if token in freqs:
-            freqs[token] += 1
-        else:
-            freqs[token] = 1
+        if token not in freqs:
+            freqs[token] = 0
+        freqs[token] += 1
     for token, freq in freqs.items():
         freqs[token] = freq / len(tokens)
     return freqs
@@ -168,12 +167,11 @@ def collect_profiles(paths_to_profiles: list) -> list[dict[str, str | dict[str,
         return None
     list_processed_profiles = []
     for paths in paths_to_profiles:
-        if isinstance(paths, str):
-            language_profile = load_profile(paths)
-            if isinstance(language_profile, dict):
-                processed_profile = preprocess_profile(language_profile)
-                if isinstance(processed_profile, dict):
-                    list_processed_profiles.append(processed_profile)
+        language_profile = load_profile(paths)
+        if isinstance(language_profile, dict):
+            processed_profile = preprocess_profile(language_profile)
+        if isinstance(processed_profile, dict):
+            list_processed_profiles.append(processed_profile)
     return list_processed_profiles
 
 

From fd4ce28ad2fb8abdb9a62426b7aee7f3dd9e6396 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 5 Oct 2023 13:08:06 +0300
Subject: [PATCH 22/81] added fixed

---
 lab_1_classify_by_unigrams/main.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index 3762b16bf..0fadc37ca 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -31,9 +31,10 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None:
             return None
     freqs = {}
     for token in tokens:
-        if token not in freqs:
-            freqs[token] = 0
-        freqs[token] += 1
+        if token in freqs:
+            freqs[token] += 1
+        else:
+            freqs[token] = 1
     for token, freq in freqs.items():
         freqs[token] = freq / len(tokens)
     return freqs

From 3d3664d22acce3fd206148b247eb3e66b387cc65 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 5 Oct 2023 13:16:07 +0300
Subject: [PATCH 23/81] added fixes

---
 lab_1_classify_by_unigrams/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index 0fadc37ca..2f8fa9fe2 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -171,8 +171,8 @@ def collect_profiles(paths_to_profiles: list) -> list[dict[str, str | dict[str,
         language_profile = load_profile(paths)
         if isinstance(language_profile, dict):
             processed_profile = preprocess_profile(language_profile)
-        if isinstance(processed_profile, dict):
-            list_processed_profiles.append(processed_profile)
+            if isinstance(processed_profile, dict):
+                list_processed_profiles.append(processed_profile)
     return list_processed_profiles
 
 

From ab3519f67384b2c07b3bbe863da8fa90dc4922c2 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 5 Oct 2023 23:27:17 +0300
Subject: [PATCH 24/81] added fixes

---
 lab_1_classify_by_unigrams/main.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index 2f8fa9fe2..7cb26a06d 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -2,7 +2,6 @@
 Lab 1
 Language detection
 """
-
 import json
 
 
@@ -70,7 +69,7 @@ def calculate_mse(predicted: list, actual: list) -> float | None:
                            for actual_value, predicted_value in zip(actual, predicted)])
     for value in squared_difference:
         summ_values += value
-    mse = round(summ_values / len(actual), 4)
+    mse = summ_values / len(actual)
     return mse
 
 
@@ -114,13 +113,14 @@ def detect_language(
         return None
     mse_profile_1 = compare_profiles(unknown_profile, profile_1)
     mse_profile_2 = compare_profiles(unknown_profile, profile_2)
-    if (isinstance(mse_profile_1, float)
-            and isinstance(mse_profile_2, float)):
-        if mse_profile_1 < mse_profile_2:
-            return str(profile_1['name'])
-        if mse_profile_2 < mse_profile_1:
-            return str(profile_2['name'])
-    return sorted([str(profile_1['name']), str(profile_2['name'])])[0]
+    if (not isinstance(mse_profile_1, float)
+            or not isinstance(mse_profile_2, float)):
+        return None
+    if mse_profile_1 < mse_profile_2:
+        return str(profile_1['name'])
+    if mse_profile_2 < mse_profile_1:
+        return str(profile_2['name'])
+    return sorted([profile_1['name'], profile_2['name']])[0]
 
 
 def load_profile(path_to_file: str) -> dict | None:

From 0ce585b9b45e699eb9a7118ec37b7cb504b4140d Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 5 Oct 2023 23:32:17 +0300
Subject: [PATCH 25/81] added fixes

---
 lab_1_classify_by_unigrams/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index 7cb26a06d..1680de454 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -120,7 +120,7 @@ def detect_language(
         return str(profile_1['name'])
     if mse_profile_2 < mse_profile_1:
         return str(profile_2['name'])
-    return sorted([profile_1['name'], profile_2['name']])[0]
+    return sorted([str(profile_1['name']), str(profile_2['name'])])[0]
 
 
 def load_profile(path_to_file: str) -> dict | None:

From 516127cbc7060414ba11103903cdcdcebd04c3a5 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 5 Oct 2023 23:33:27 +0300
Subject: [PATCH 26/81] start

---
 lab_1_classify_by_unigrams/start.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py
index 9e9930ece..aee582de4 100644
--- a/lab_1_classify_by_unigrams/start.py
+++ b/lab_1_classify_by_unigrams/start.py
@@ -1,26 +1,27 @@
 """
 Language detection starter
 """
-from lab_1_classify_by_unigrams.main import create_language_profile, detect_language
+from lab_1_classify_by_unigrams.main import (collect_profiles, create_language_profile,
+                                             detect_language_advanced, print_report)
 
 
 def main() -> None:
     """
     Launches an implementation
     """
-    with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en:
-        en_text = file_to_read_en.read()
-    with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de:
-        de_text = file_to_read_de.read()
     with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk:
         unknown_text = file_to_read_unk.read()
-    en_profile = create_language_profile('en', en_text)
-    de_profile = create_language_profile('de', de_text)
     unknown_profile = create_language_profile('unknown', unknown_text)
-    if (isinstance(en_profile, dict) and isinstance(de_profile, dict) and
-            isinstance(unknown_profile, dict)):
-        result = detect_language(unknown_profile, en_profile, de_profile)
-        assert result, "Detection result is None"
+    language_profiles = ['assets/profiles/es.json', 'assets/profiles/de.json',
+                         'assets/profiles/en.json', 'assets/profiles/fr.json',
+                         'assets/profiles/it.json', 'assets/profiles/ru.json',
+                         'assets/profiles/tr.json']
+    profiles = collect_profiles(language_profiles)
+    if isinstance(unknown_profile, dict) and isinstance(profiles, list):
+        result = detect_language_advanced(unknown_profile, profiles)
+        if isinstance(result, list):
+            print_report(result)
+            assert result, "Detection result is None"
 
 
 if __name__ == "__main__":

From 9de3541d65dcc654e0d5305f6f8f6aa888be2eb6 Mon Sep 17 00:00:00 2001
From: artyomtugaryov <artyomtugaryov@users.noreply.github.com>
Date: Wed, 11 Oct 2023 11:02:56 +0300
Subject: [PATCH 27/81] checkout labs from the origin repository

---
 lab_1_classify_by_unigrams/start.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py
index 3531c7385..4a17442d0 100644
--- a/lab_1_classify_by_unigrams/start.py
+++ b/lab_1_classify_by_unigrams/start.py
@@ -1,8 +1,6 @@
 """
 Language detection starter
 """
-from lab_1_classify_by_unigrams.main import (collect_profiles, create_language_profile,
-                                             detect_language_advanced, print_report)
 
 from lab_1_classify_by_unigrams.main import (collect_profiles, create_language_profile,
                                              detect_language_advanced, print_report)
@@ -12,6 +10,10 @@ def main() -> None:
     """
     Launches an implementation
     """
+    with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en:
+        en_text = file_to_read_en.read()
+    with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de:
+        de_text = file_to_read_de.read()
     with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk:
         unknown_text = file_to_read_unk.read()
 

From 813de00b8ed583d6faab8549c3f12c6ee19e9e0a Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 12 Oct 2023 20:56:22 +0300
Subject: [PATCH 28/81] lab2

---
 lab_1_classify_by_unigrams/main.py | 204 -----------------------------
 1 file changed, 204 deletions(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index 1680de454..e69de29bb 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -1,204 +0,0 @@
-"""
-Lab 1
-Language detection
-"""
-import json
-
-
-def tokenize(text: str) -> list[str] | None:
-    """
-    Splits a text into tokens, converts the tokens into lowercase,
-    removes punctuation, digits and other symbols
-    :param text: a text
-    :return: a list of lower-cased tokens without punctuation
-    """
-    if not isinstance(text, str):
-        return None
-    return [symbol.lower() for symbol in text if symbol.isalpha()]
-
-
-def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None:
-    """
-    Calculates frequencies of given tokens
-    :param tokens: a list of tokens
-    :return: a dictionary with frequencies
-    """
-    if not isinstance(tokens, list):
-        return None
-    for token in tokens:
-        if not isinstance(token, str):
-            return None
-    freqs = {}
-    for token in tokens:
-        if token in freqs:
-            freqs[token] += 1
-        else:
-            freqs[token] = 1
-    for token, freq in freqs.items():
-        freqs[token] = freq / len(tokens)
-    return freqs
-
-
-def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None:
-    """
-    Creates a language profile
-    :param language: a language
-    :param text: a text
-    :return: a dictionary with two keys – name, freq
-    """
-    if not isinstance(language, str) or not isinstance(text, str):
-        return None
-    values_freq = calculate_frequencies(tokenize(text))
-    if not isinstance(values_freq, dict):
-        return None
-    return {'name': language, 'freq': values_freq}
-
-
-def calculate_mse(predicted: list, actual: list) -> float | None:
-    """
-    Calculates mean squared error between predicted and actual values
-    :param predicted: a list of predicted values
-    :param actual: a list of actual values
-    :return: the score
-    """
-    if (not isinstance(actual, list) or not isinstance(predicted, list) or
-            len(actual) != len(predicted)):
-        return None
-    summ_values = 0
-    squared_difference = ([(actual_value - predicted_value)**2
-                           for actual_value, predicted_value in zip(actual, predicted)])
-    for value in squared_difference:
-        summ_values += value
-    mse = summ_values / len(actual)
-    return mse
-
-
-def compare_profiles(
-        unknown_profile: dict[str, str | dict[str, float]],
-        profile_to_compare: dict[str, str | dict[str, float]]
-) -> float | None:
-    """
-    Compares profiles and calculates the distance using symbols
-    :param unknown_profile: a dictionary of an unknown profile
-    :param profile_to_compare: a dictionary of a profile to compare the unknown profile to
-    :return: the distance between the profiles
-    """
-    if (not isinstance(unknown_profile, dict) or not isinstance(profile_to_compare, dict) or
-            'name' not in unknown_profile or 'name' not in profile_to_compare):
-        return None
-    tokens = set(profile_to_compare['freq'].keys())
-    tokens.update(unknown_profile['freq'].keys())
-    list_unknown_profile = []
-    list_profile_to_compare = []
-    for letter in tokens:
-        list_profile_to_compare.append(profile_to_compare['freq'].get(letter, 0))
-        list_unknown_profile.append(unknown_profile['freq'].get(letter, 0))
-    return calculate_mse(list_profile_to_compare, list_unknown_profile)
-
-
-def detect_language(
-        unknown_profile: dict[str, str | dict[str, float]],
-        profile_1: dict[str, str | dict[str, float]],
-        profile_2: dict[str, str | dict[str, float]],
-) -> str | None:
-    """
-    Detects the language of an unknown profile
-    :param unknown_profile: a dictionary of a profile to determine the language of
-    :param profile_1: a dictionary of a known profile
-    :param profile_2: a dictionary of a known profile
-    :return: a language
-    """
-    if (not isinstance(unknown_profile, dict) or not isinstance(profile_1, dict) or
-            not isinstance(profile_2, dict)):
-        return None
-    mse_profile_1 = compare_profiles(unknown_profile, profile_1)
-    mse_profile_2 = compare_profiles(unknown_profile, profile_2)
-    if (not isinstance(mse_profile_1, float)
-            or not isinstance(mse_profile_2, float)):
-        return None
-    if mse_profile_1 < mse_profile_2:
-        return str(profile_1['name'])
-    if mse_profile_2 < mse_profile_1:
-        return str(profile_2['name'])
-    return sorted([str(profile_1['name']), str(profile_2['name'])])[0]
-
-
-def load_profile(path_to_file: str) -> dict | None:
-    """
-    Loads a language profile
-    :param path_to_file: a path to the language profile
-    :return: a dictionary with at least two keys – name, freq
-    """
-    if not isinstance(path_to_file, str):
-        return None
-    with open(path_to_file, "r", encoding="utf-8") as json_file:
-        language_profile = json.load(json_file)
-    if not isinstance(language_profile, dict):
-        return None
-    return language_profile
-
-
-def preprocess_profile(profile: dict) -> dict[str, str | dict] | None:
-    """
-    Preprocesses profile for a loaded language
-    :param profile: a loaded profile
-    :return: a dict with a lower-cased loaded profile
-    with relative frequencies without unnecessary ngrams
-    """
-    if (not isinstance(profile, dict) or 'name' not in profile
-            or 'freq' not in profile or 'n_words' not in profile):
-        return None
-    new_freq = {}
-    for key, value in profile['freq'].items():
-        if key.lower() in new_freq:
-            new_freq[key.lower()] += value / profile["n_words"][0]
-        elif len(key) == 1:
-            new_freq[key.lower()] = value / profile["n_words"][0]
-    processed_profile = {'name': profile["name"], 'freq': new_freq}
-    return processed_profile
-
-
-def collect_profiles(paths_to_profiles: list) -> list[dict[str, str | dict[str, float]]] | None:
-    """
-    Collects profiles for a given path
-    :paths_to_profiles: a list of strings to the profiles
-    :return: a list of loaded profiles
-    """
-    if not isinstance(paths_to_profiles, list):
-        return None
-    list_processed_profiles = []
-    for paths in paths_to_profiles:
-        language_profile = load_profile(paths)
-        if isinstance(language_profile, dict):
-            processed_profile = preprocess_profile(language_profile)
-            if isinstance(processed_profile, dict):
-                list_processed_profiles.append(processed_profile)
-    return list_processed_profiles
-
-
-def detect_language_advanced(unknown_profile: dict[str, str | dict[str, float]],
-                             known_profiles: list) -> list | None:
-    """
-    Detects the language of an unknown profile
-    :param unknown_profile: a dictionary of a profile to determine the language of
-    :param known_profiles: a list of known profiles
-    :return: a sorted list of tuples containing a language and a distance
-    """
-    if not isinstance(unknown_profile, dict) or not isinstance(known_profiles, list):
-        return None
-    list_mse = []
-    for profile in known_profiles:
-        if isinstance(profile, dict):
-            list_mse.append((profile['name'], compare_profiles(unknown_profile, profile)))
-    list_mse.sort(key=lambda a: (a[1], a[0]))
-    return list_mse
-
-
-def print_report(detections: list[tuple[str, float]]) -> None:
-    """
-    Prints report for detection of language
-    :param detections: a list with distances for each available language
-    """
-    if isinstance(detections, list):
-        for profile in detections:
-            print(f'{profile[0]}: MSE {profile[1]:.5f}')

From 3af2ea913e70eb7af396e01ffedd9493a2bda220 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Fri, 13 Oct 2023 09:31:29 +0300
Subject: [PATCH 29/81] i start do lab

---
 lab_1_classify_by_unigrams/main.py | 204 +++++++++++++++++++++++++++++
 requirements_qa.txt                |  10 +-
 seminars/practice_3_lists.py       |  60 +++++++--
 3 files changed, 256 insertions(+), 18 deletions(-)

diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py
index e69de29bb..1680de454 100644
--- a/lab_1_classify_by_unigrams/main.py
+++ b/lab_1_classify_by_unigrams/main.py
@@ -0,0 +1,204 @@
+"""
+Lab 1
+Language detection
+"""
+import json
+
+
+def tokenize(text: str) -> list[str] | None:
+    """
+    Splits a text into tokens, converts the tokens into lowercase,
+    removes punctuation, digits and other symbols
+    :param text: a text
+    :return: a list of lower-cased tokens without punctuation
+    """
+    if not isinstance(text, str):
+        return None
+    return [symbol.lower() for symbol in text if symbol.isalpha()]
+
+
+def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None:
+    """
+    Calculates frequencies of given tokens
+    :param tokens: a list of tokens
+    :return: a dictionary with frequencies
+    """
+    if not isinstance(tokens, list):
+        return None
+    for token in tokens:
+        if not isinstance(token, str):
+            return None
+    freqs = {}
+    for token in tokens:
+        if token in freqs:
+            freqs[token] += 1
+        else:
+            freqs[token] = 1
+    for token, freq in freqs.items():
+        freqs[token] = freq / len(tokens)
+    return freqs
+
+
+def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None:
+    """
+    Creates a language profile
+    :param language: a language
+    :param text: a text
+    :return: a dictionary with two keys – name, freq
+    """
+    if not isinstance(language, str) or not isinstance(text, str):
+        return None
+    values_freq = calculate_frequencies(tokenize(text))
+    if not isinstance(values_freq, dict):
+        return None
+    return {'name': language, 'freq': values_freq}
+
+
+def calculate_mse(predicted: list, actual: list) -> float | None:
+    """
+    Calculates mean squared error between predicted and actual values
+    :param predicted: a list of predicted values
+    :param actual: a list of actual values
+    :return: the score
+    """
+    if (not isinstance(actual, list) or not isinstance(predicted, list) or
+            len(actual) != len(predicted)):
+        return None
+    summ_values = 0
+    squared_difference = ([(actual_value - predicted_value)**2
+                           for actual_value, predicted_value in zip(actual, predicted)])
+    for value in squared_difference:
+        summ_values += value
+    mse = summ_values / len(actual)
+    return mse
+
+
+def compare_profiles(
+        unknown_profile: dict[str, str | dict[str, float]],
+        profile_to_compare: dict[str, str | dict[str, float]]
+) -> float | None:
+    """
+    Compares profiles and calculates the distance using symbols
+    :param unknown_profile: a dictionary of an unknown profile
+    :param profile_to_compare: a dictionary of a profile to compare the unknown profile to
+    :return: the distance between the profiles
+    """
+    if (not isinstance(unknown_profile, dict) or not isinstance(profile_to_compare, dict) or
+            'name' not in unknown_profile or 'name' not in profile_to_compare):
+        return None
+    tokens = set(profile_to_compare['freq'].keys())
+    tokens.update(unknown_profile['freq'].keys())
+    list_unknown_profile = []
+    list_profile_to_compare = []
+    for letter in tokens:
+        list_profile_to_compare.append(profile_to_compare['freq'].get(letter, 0))
+        list_unknown_profile.append(unknown_profile['freq'].get(letter, 0))
+    return calculate_mse(list_profile_to_compare, list_unknown_profile)
+
+
+def detect_language(
+        unknown_profile: dict[str, str | dict[str, float]],
+        profile_1: dict[str, str | dict[str, float]],
+        profile_2: dict[str, str | dict[str, float]],
+) -> str | None:
+    """
+    Detects the language of an unknown profile
+    :param unknown_profile: a dictionary of a profile to determine the language of
+    :param profile_1: a dictionary of a known profile
+    :param profile_2: a dictionary of a known profile
+    :return: a language
+    """
+    if (not isinstance(unknown_profile, dict) or not isinstance(profile_1, dict) or
+            not isinstance(profile_2, dict)):
+        return None
+    mse_profile_1 = compare_profiles(unknown_profile, profile_1)
+    mse_profile_2 = compare_profiles(unknown_profile, profile_2)
+    if (not isinstance(mse_profile_1, float)
+            or not isinstance(mse_profile_2, float)):
+        return None
+    if mse_profile_1 < mse_profile_2:
+        return str(profile_1['name'])
+    if mse_profile_2 < mse_profile_1:
+        return str(profile_2['name'])
+    return sorted([str(profile_1['name']), str(profile_2['name'])])[0]
+
+
+def load_profile(path_to_file: str) -> dict | None:
+    """
+    Loads a language profile
+    :param path_to_file: a path to the language profile
+    :return: a dictionary with at least two keys – name, freq
+    """
+    if not isinstance(path_to_file, str):
+        return None
+    with open(path_to_file, "r", encoding="utf-8") as json_file:
+        language_profile = json.load(json_file)
+    if not isinstance(language_profile, dict):
+        return None
+    return language_profile
+
+
+def preprocess_profile(profile: dict) -> dict[str, str | dict] | None:
+    """
+    Preprocesses profile for a loaded language
+    :param profile: a loaded profile
+    :return: a dict with a lower-cased loaded profile
+    with relative frequencies without unnecessary ngrams
+    """
+    if (not isinstance(profile, dict) or 'name' not in profile
+            or 'freq' not in profile or 'n_words' not in profile):
+        return None
+    new_freq = {}
+    for key, value in profile['freq'].items():
+        if key.lower() in new_freq:
+            new_freq[key.lower()] += value / profile["n_words"][0]
+        elif len(key) == 1:
+            new_freq[key.lower()] = value / profile["n_words"][0]
+    processed_profile = {'name': profile["name"], 'freq': new_freq}
+    return processed_profile
+
+
+def collect_profiles(paths_to_profiles: list) -> list[dict[str, str | dict[str, float]]] | None:
+    """
+    Collects profiles for a given path
+    :paths_to_profiles: a list of strings to the profiles
+    :return: a list of loaded profiles
+    """
+    if not isinstance(paths_to_profiles, list):
+        return None
+    list_processed_profiles = []
+    for paths in paths_to_profiles:
+        language_profile = load_profile(paths)
+        if isinstance(language_profile, dict):
+            processed_profile = preprocess_profile(language_profile)
+            if isinstance(processed_profile, dict):
+                list_processed_profiles.append(processed_profile)
+    return list_processed_profiles
+
+
+def detect_language_advanced(unknown_profile: dict[str, str | dict[str, float]],
+                             known_profiles: list) -> list | None:
+    """
+    Detects the language of an unknown profile
+    :param unknown_profile: a dictionary of a profile to determine the language of
+    :param known_profiles: a list of known profiles
+    :return: a sorted list of tuples containing a language and a distance
+    """
+    if not isinstance(unknown_profile, dict) or not isinstance(known_profiles, list):
+        return None
+    list_mse = []
+    for profile in known_profiles:
+        if isinstance(profile, dict):
+            list_mse.append((profile['name'], compare_profiles(unknown_profile, profile)))
+    list_mse.sort(key=lambda a: (a[1], a[0]))
+    return list_mse
+
+
+def print_report(detections: list[tuple[str, float]]) -> None:
+    """
+    Prints report for detection of language
+    :param detections: a list with distances for each available language
+    """
+    if isinstance(detections, list):
+        for profile in detections:
+            print(f'{profile[0]}: MSE {profile[1]:.5f}')
diff --git a/requirements_qa.txt b/requirements_qa.txt
index 2c7efb1e5..b165f0d32 100644
--- a/requirements_qa.txt
+++ b/requirements_qa.txt
@@ -1,16 +1,16 @@
 ast-comments==1.0.1
 black==22.6.0
 coverage[toml]==6.4.4
-ghapi==0.1.19
-flake8==6.0.0
 flake8-isort==6.0.0
+flake8==6.0.0
+ghapi==0.1.19
 mypy==1.1.1
-pymarkdownlnt==0.9.9
-pymdown-extensions==9.5
 pydantic==1.10.7
 pylint==2.15.10
+pymarkdownlnt==0.9.9
+pymdown-extensions==9.5
 pyspelling==2.7.3
 pytest==6.2.5
 regex==2023.3.23
-typed-argument-parser==1.8.1
 tqdm==4.64.1
+typed-argument-parser==1.8.1
\ No newline at end of file
diff --git a/seminars/practice_3_lists.py b/seminars/practice_3_lists.py
index 915cdb41f..600b9f265 100644
--- a/seminars/practice_3_lists.py
+++ b/seminars/practice_3_lists.py
@@ -58,11 +58,16 @@ def count_evens(nums: list) -> int:
     """
     Return the number of even ints in the given array.
     """
-    # student realization goes here
+    n = 0
+    for element in nums:
+        if element % 2 == 0:
+            n += 1
+    return n
+
 
 
 # Function calls with expected result:
-# count_evens([2, 1, 2, 3, 4]) → 3
+count_evens([2, 1, 2, 3, 4])
 # count_evens([2, 2, 0]) → 3
 # count_evens([1, 3, 5]) → 0
 
@@ -75,12 +80,16 @@ def sum13(nums: list) -> int:
     so it does not count and numbers that come after a 13
     also do not count.
     """
-    # student realization goes here
+    summ = 0
+    for element in nums:
+        if element != 13:
+            summ += element
+    print(summ)
 
 # Function calls with expected result:
 # sum13([1, 2, 2, 1]) → 6
 # sum13([1, 1]) → 2
-# sum13([1, 2, 2, 1, 13]) → 6
+sum13([1, 2, 2, 1, 13])
 # sum13([1, 2, 2, 1, 13, 5, 6]) → 6
 
 
@@ -93,11 +102,11 @@ def sum67(nums: list) -> int:
     (every 6 will be followed by at least one 7).
     Return 0 for no numbers.
     """
-    # student realization goes here
+
 
 # Function calls with expected result:
-# sum67([1, 2, 2]) → 5
-# sum67([1, 2, 2, 6, 99, 99, 7]) → 5
+sum67([1, 2, 2])
+print(sum67([1, 2, 2, 6, 99, 99, 7]))
 # sum67([1, 1, 6, 7, 2]) → 4
 
 
@@ -108,10 +117,13 @@ def create_phone_number(nums: list) -> str:
     Write a function that accepts an array of 10 integers (between 0 and 9),
     that returns a string of those numbers in the form of a phone number.
     """
-    # student realization goes here
+    number = ''.join(str(a) for a in nums)
+    phone = f'({number[:3]} {number[3:6]}-{number[6:]})'
+    return phone
+
 
 # Function calls with expected result:
-# create_phone_number([1, 2, 3, 4, 5, 6, 7, 8, 9, 0])
+print(create_phone_number([1, 2, 3, 4, 5, 6, 7, 8, 9, 0]))
 # => returns "(123) 456-7890"
 
 
@@ -129,10 +141,22 @@ def check_exam(correct_answers: list, student_answers: list) -> int:
     and +0 for each blank answer, represented as an empty string.
     If the score < 0, return 0.
     """
-    # student realization goes here
+    score = 0
+    for i, answer in enumerate(student_answers):
+        if answer == correct_answers[i]:
+            score += 4
+        elif answer != correct_answers[i]:
+            score -= 1
+        elif answer == ' ':
+            score += 0
+    if score < 0:
+        return 0
+    else:
+        return score
+
 
 # Function calls with expected result:
-# check_exam(["a", "a", "b", "b"], ["a", "c", "b", "d"]) → 6
+print(check_exam(["a", "a", "b", "b"], ["a", "c", "b", "d"]))
 # check_exam(["a", "a", "c", "b"], ["a", "a", "b",  ""]) → 7
 # check_exam(["a", "a", "b", "c"], ["a", "a", "b", "c"]) → 16
 # check_exam(["b", "c", "b", "a"], ["",  "a", "a", "c"]) → 0
@@ -146,14 +170,24 @@ def who_likes_it(names: list) -> str:
     People can "like" blog posts, pictures or other items.
     We want to create the text that should be displayed next to such an item.
     """
-    # student realization goes here
+    if names == []:
+        return "no one likes this"
+    if len(names) == 1:
+        return f'{names[0]} likes this'
+    elif len(names) == 2:
+        return f'{names[0]} and {names[1]} like this'
+    elif len(names) == 3:
+        return f'{names[0]}, {names[1]} and {names[2]} like this'
+    else:
+        return f'{names[0]}, {names[1]} and {len(names) - 2} others like this'
+
 
 # Function calls with expected result:
 # []                                -->  "no one likes this"
 # ["Peter"]                         -->  "Peter likes this"
 # ["Jacob", "Alex"]                 -->  "Jacob and Alex like this"
 # ["Max", "John", "Mark"]           -->  "Max, John and Mark like this"
-# ["Alex", "Jacob", "Mark", "Max"]  -->  "Alex, Jacob and 2 others like this"
+print(who_likes_it(["Alex", "Jacob", "Mark", "Max"]))
 
 
 # Task 7

From 5186fbc6525ea4ac668b6140e21f72865f08b76d Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Fri, 20 Oct 2023 12:52:33 +0300
Subject: [PATCH 30/81] 1 func

---
 lab_2_tokenize_by_bpe/main.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py
index 620a4d645..1b9558482 100644
--- a/lab_2_tokenize_by_bpe/main.py
+++ b/lab_2_tokenize_by_bpe/main.py
@@ -14,6 +14,8 @@ def prepare_word(
     :param end_of_word: a token that signifies the end of word
     :return: preprocessed word
     """
+    if not isinstance(raw_word, str) or not isinstance(start_of_word, str or None) or not isinstance(end_of_word, str or None):
+        return None
 
 
 def collect_frequencies(

From 3f6df3fd9130cd75122a30465b670cd2d5ad4b6c Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 25 Oct 2023 22:57:11 +0300
Subject: [PATCH 31/81] change for 6

---
 lab_2_tokenize_by_bpe/main.py | 59 +++++++++++++++++++++++++++++++++--
 1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py
index 1b9558482..c18626eab 100644
--- a/lab_2_tokenize_by_bpe/main.py
+++ b/lab_2_tokenize_by_bpe/main.py
@@ -14,8 +14,14 @@ def prepare_word(
     :param end_of_word: a token that signifies the end of word
     :return: preprocessed word
     """
-    if not isinstance(raw_word, str) or not isinstance(start_of_word, str or None) or not isinstance(end_of_word, str or None):
+    if not isinstance(raw_word, str) or not (isinstance(
+            start_of_word, str) or start_of_word is None) or not (
+            isinstance(end_of_word, str) or end_of_word is None):
         return None
+    tokenized_word = [start_of_word if start_of_word is not None else []]
+    tokenized_word.extend(element for element in raw_word)
+    tokenized_word.extend(end_of_word if end_of_word is not None else [])
+    return tuple(tokenized_word)
 
 
 def collect_frequencies(
@@ -28,6 +34,16 @@ def collect_frequencies(
     :param end_of_word: a token that signifies the end of word
     :return: dictionary in the form of <preprocessed word: number of occurrences>
     """
+    if not isinstance(text, str) or not isinstance(end_of_word, str) or not (
+            isinstance(start_of_word, str) or start_of_word is None):
+        return None
+    frequencies_dict = {}
+    for word in text.split():
+        tokenized_word = prepare_word(word, start_of_word if start_of_word is not None else [], end_of_word)
+        if tokenized_word is None:
+            return None
+        frequencies_dict[tokenized_word] = frequencies_dict.get(tokenized_word, 0) + 1
+    return frequencies_dict
 
 
 def count_tokens_pairs(
@@ -38,6 +54,14 @@ def count_tokens_pairs(
     :param word_frequencies: dictionary in the form of <preprocessed word: number of occurrences>
     :return: dictionary in the form of <token pair: number of occurrences>
     """
+    if not isinstance(word_frequencies, dict):
+        return None
+    pairs_of_tokens = {}
+    for tokens, count in word_frequencies.items():
+        for i in range(len(tokens) - 1):
+            pair = (tokens[i], tokens[i + 1])
+            pairs_of_tokens[pair] = pairs_of_tokens.get(pair, 0) + 1
+    return pairs_of_tokens
 
 
 def merge_tokens(
@@ -49,6 +73,19 @@ def merge_tokens(
     :param pair: a pair of tokens to be merged
     :return: dictionary in the form of <preprocessed word: number of occurrences>
     """
+    if not (isinstance(word_frequencies, dict)
+            and isinstance(pair, tuple)):
+        return None
+    merged_frequencies = {}
+    for preprocessed_word, count in word_frequencies.items():
+        if ''.join(pair) in ''.join(preprocessed_word):
+            preprocessed_word = list(preprocessed_word)
+            index = preprocessed_word.index(pair[0])
+            preprocessed_word[index] = pair[0] + pair[1]
+            preprocessed_word.pop(index + 1)
+            preprocessed_word = tuple(preprocessed_word)
+        merged_frequencies[preprocessed_word] = count
+    return merged_frequencies
 
 
 def train(
@@ -60,8 +97,24 @@ def train(
     :param num_merges: required number of new tokens
     :return: dictionary in the form of <preprocessed word: number of occurrences>
     """
-
-
+    if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int):
+        return None
+    while num_merges > 0:
+        pairs_of_tokens = count_tokens_pairs(word_frequencies)
+        if pairs_of_tokens is None:
+            return None
+        if num_merges > len(pairs_of_tokens):
+            num_merges = len(pairs_of_tokens)
+        sorted_pairs = ([token_pair for token_pair, frequency in pairs_of_tokens.items() if frequency ==
+                              max(pairs_of_tokens.values())])
+        sorted_pairs.sort(key=lambda x: (-len(x), x))
+        word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0])
+        if word_frequencies is None:
+            return None
+        num_merges -= 1
+    return word_frequencies
+
+print(train({('a', 'b'): 3, ('b', 'cd'): 3, ('b', 'ca'): 3}, 2))
 def get_vocabulary(
     word_frequencies: dict[tuple[str, ...], int], unknown_token: str
 ) -> dict[str, int] | None:

From 182aa44d1960f76124d124d58a7b16c7b41849ac Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 25 Oct 2023 22:59:01 +0300
Subject: [PATCH 32/81] score

---
 lab_2_tokenize_by_bpe/target_score.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt
index 573541ac9..1e8b31496 100644
--- a/lab_2_tokenize_by_bpe/target_score.txt
+++ b/lab_2_tokenize_by_bpe/target_score.txt
@@ -1 +1 @@
-0
+6

From 07eb01adba6ad3643f7355562feeb6bcdee160f0 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 25 Oct 2023 23:10:22 +0300
Subject: [PATCH 33/81]  added fixes

---
 lab_2_tokenize_by_bpe/main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py
index c18626eab..f4229b201 100644
--- a/lab_2_tokenize_by_bpe/main.py
+++ b/lab_2_tokenize_by_bpe/main.py
@@ -58,8 +58,8 @@ def count_tokens_pairs(
         return None
     pairs_of_tokens = {}
     for tokens, count in word_frequencies.items():
-        for i in range(len(tokens) - 1):
-            pair = (tokens[i], tokens[i + 1])
+        for index in range(len(tokens) - 1):
+            pair = (tokens[index], tokens[index + 1])
             pairs_of_tokens[pair] = pairs_of_tokens.get(pair, 0) + 1
     return pairs_of_tokens
 
@@ -114,7 +114,7 @@ def train(
         num_merges -= 1
     return word_frequencies
 
-print(train({('a', 'b'): 3, ('b', 'cd'): 3, ('b', 'ca'): 3}, 2))
+
 def get_vocabulary(
     word_frequencies: dict[tuple[str, ...], int], unknown_token: str
 ) -> dict[str, int] | None:

From bb6b622e00c178e5819839f5a0a0fbf7edd04329 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 26 Oct 2023 13:54:22 +0300
Subject: [PATCH 34/81] change for 8

---
 lab_2_tokenize_by_bpe/main.py | 52 ++++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 10 deletions(-)

diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py
index f4229b201..29b138efc 100644
--- a/lab_2_tokenize_by_bpe/main.py
+++ b/lab_2_tokenize_by_bpe/main.py
@@ -18,9 +18,12 @@ def prepare_word(
             start_of_word, str) or start_of_word is None) or not (
             isinstance(end_of_word, str) or end_of_word is None):
         return None
-    tokenized_word = [start_of_word if start_of_word is not None else []]
+    tokenized_word = []
+    if start_of_word is not None:
+        tokenized_word.append(start_of_word)
     tokenized_word.extend(element for element in raw_word)
-    tokenized_word.extend(end_of_word if end_of_word is not None else [])
+    if end_of_word is not None:
+        tokenized_word.append(end_of_word)
     return tuple(tokenized_word)
 
 
@@ -39,10 +42,13 @@ def collect_frequencies(
         return None
     frequencies_dict = {}
     for word in text.split():
-        tokenized_word = prepare_word(word, start_of_word if start_of_word is not None else [], end_of_word)
-        if tokenized_word is None:
-            return None
-        frequencies_dict[tokenized_word] = frequencies_dict.get(tokenized_word, 0) + 1
+        if start_of_word is not None:
+            tokenized_word = prepare_word(word, start_of_word, end_of_word)
+        if start_of_word is None:
+            tokenized_word = prepare_word(word, None, end_of_word)
+            if tokenized_word is None:
+                return None
+            frequencies_dict[tokenized_word] = frequencies_dict.get(tokenized_word, 0) + 1
     return frequencies_dict
 
 
@@ -57,10 +63,10 @@ def count_tokens_pairs(
     if not isinstance(word_frequencies, dict):
         return None
     pairs_of_tokens = {}
-    for tokens, count in word_frequencies.items():
+    for tokens in word_frequencies:
         for index in range(len(tokens) - 1):
             pair = (tokens[index], tokens[index + 1])
-            pairs_of_tokens[pair] = pairs_of_tokens.get(pair, 0) + 1
+            pairs_of_tokens[pair] = pairs_of_tokens.get(pair, 0) + word_frequencies[tokens]
     return pairs_of_tokens
 
 
@@ -105,8 +111,8 @@ def train(
             return None
         if num_merges > len(pairs_of_tokens):
             num_merges = len(pairs_of_tokens)
-        sorted_pairs = ([token_pair for token_pair, frequency in pairs_of_tokens.items() if frequency ==
-                              max(pairs_of_tokens.values())])
+        sorted_pairs = ([token_pair for token_pair, frequency in pairs_of_tokens.items()
+                         if frequency == max(pairs_of_tokens.values())])
         sorted_pairs.sort(key=lambda x: (-len(x), x))
         word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0])
         if word_frequencies is None:
@@ -124,6 +130,20 @@ def get_vocabulary(
     :param unknown_token: a token to signify an unknown token
     :return: dictionary in the form of <token: identifier>
     """
+    if not isinstance(word_frequencies, dict) or not isinstance(unknown_token, str):
+        return None
+    tokens_list = set()
+    dict_token_identifier = {}
+    for tuples in word_frequencies:
+        for token in tuples:
+            tokens_list.add(token)
+            for element in token:
+                tokens_list.add(element)
+    tokens_list.add(unknown_token)
+    sorted_tokens = sorted(tokens_list, key=lambda x: (-len(x), x))
+    for index, token in enumerate(sorted_tokens):
+        dict_token_identifier[token] = index
+    return dict_token_identifier
 
 
 def decode(
@@ -136,6 +156,18 @@ def decode(
     :param end_of_word_token: an end-of-word token
     :return: decoded sequence
     """
+    if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict) or not (isinstance(
+            end_of_word_token, str) or end_of_word_token is None):
+        return None
+    decoded_tokens = []
+    for index in encoded_text:
+        for token, token_index in vocabulary.items():
+            if token_index == index and end_of_word_token is not None:
+                decoded_tokens.append(' ' if token == end_of_word_token else token)
+            if vocabulary[token] == index and end_of_word_token is None:
+                decoded_tokens.append('' if token == end_of_word_token else token)
+    decoded_text = ''.join(decoded_tokens)
+    return decoded_text
 
 
 def tokenize_word(

From 146a4e2e26ae2a8b61d418777f65123517a1e2f6 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 26 Oct 2023 13:56:53 +0300
Subject: [PATCH 35/81] score 8

---
 lab_2_tokenize_by_bpe/target_score.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt
index 1e8b31496..45a4fb75d 100644
--- a/lab_2_tokenize_by_bpe/target_score.txt
+++ b/lab_2_tokenize_by_bpe/target_score.txt
@@ -1 +1 @@
-6
+8

From 4e5bc343e4d1f3a8fc96de923e7ae105b5ca4a7e Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Sat, 28 Oct 2023 23:32:31 +0300
Subject: [PATCH 36/81] change for 10

---
 lab_2_tokenize_by_bpe/main.py | 112 ++++++++++++++++++++++++++++++----
 1 file changed, 101 insertions(+), 11 deletions(-)

diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py
index 29b138efc..18172301d 100644
--- a/lab_2_tokenize_by_bpe/main.py
+++ b/lab_2_tokenize_by_bpe/main.py
@@ -2,6 +2,7 @@
 Lab 2
 BPE and machine translation evaluation
 """
+import json
 
 
 def prepare_word(
@@ -85,12 +86,16 @@ def merge_tokens(
     merged_frequencies = {}
     for preprocessed_word, count in word_frequencies.items():
         if ''.join(pair) in ''.join(preprocessed_word):
-            preprocessed_word = list(preprocessed_word)
-            index = preprocessed_word.index(pair[0])
-            preprocessed_word[index] = pair[0] + pair[1]
-            preprocessed_word.pop(index + 1)
-            preprocessed_word = tuple(preprocessed_word)
-        merged_frequencies[preprocessed_word] = count
+            list_word = list(preprocessed_word)
+            for index in range(len(list_word) - 1):
+                if (list_word[index], list_word[index + 1]) == pair:
+                    list_word[index + 1] = pair[0] + pair[1]
+                    list_word[index] = ''
+            if '' in list_word:
+                list_word.remove('')
+            merged_frequencies[tuple(list_word)] = count
+        else:
+            merged_frequencies[preprocessed_word] = count
     return merged_frequencies
 
 
@@ -107,15 +112,15 @@ def train(
         return None
     while num_merges > 0:
         pairs_of_tokens = count_tokens_pairs(word_frequencies)
-        if pairs_of_tokens is None:
+        if not pairs_of_tokens:
             return None
         if num_merges > len(pairs_of_tokens):
             num_merges = len(pairs_of_tokens)
-        sorted_pairs = ([token_pair for token_pair, frequency in pairs_of_tokens.items()
-                         if frequency == max(pairs_of_tokens.values())])
-        sorted_pairs.sort(key=lambda x: (-len(x), x))
+        pairs_max_values = ([token_pair for token_pair, frequency in pairs_of_tokens.items() if
+                            frequency == max(pairs_of_tokens.values())])
+        sorted_pairs = sorted([pair for pair in pairs_max_values], key=lambda pair: (-len(str(pair)), pair))
         word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0])
-        if word_frequencies is None:
+        if not word_frequencies:
             return None
         num_merges -= 1
     return word_frequencies
@@ -181,6 +186,26 @@ def tokenize_word(
     :param unknown_token: token that signifies unknown sequence
     :return: list of token identifiers
     """
+    if (not isinstance(word, tuple) or not all(isinstance(w, str) for w in word)
+            or not isinstance(vocabulary, dict) or not isinstance(
+            end_of_word, (str, type(None))) or not isinstance(unknown_token, str)):
+        return None
+    tokens_identifiers = []
+    i = 0
+    while i < len(word):
+        max_length_token = ''
+        for j in range(len(word), i, -1):
+            current_token = "".join(word[i:j])
+            if current_token in vocabulary and len(current_token) > len(max_length_token):
+                max_length_token = current_token
+        if max_length_token:
+            tokens_identifiers.append(vocabulary[max_length_token])
+            i += len(max_length_token)
+        else:
+            if unknown_token in vocabulary:
+                tokens_identifiers.append(vocabulary[unknown_token])
+            i += 1
+    return tokens_identifiers
 
 
 def load_vocabulary(vocab_path: str) -> dict[str, int] | None:
@@ -189,6 +214,13 @@ def load_vocabulary(vocab_path: str) -> dict[str, int] | None:
     :param vocab_path: path to the saved vocabulary
     :return: dictionary in the form of <token: identifier>
     """
+    if not isinstance(vocab_path, str):
+        return None
+    with open(vocab_path, 'r', encoding='utf-8') as f:
+        vocabulary = json.load(f)
+    if not isinstance(vocabulary, dict):
+        return None
+    return vocabulary
 
 
 def encode(
@@ -207,6 +239,20 @@ def encode(
     :param unknown_token: token that signifies unknown sequence
     :return: list of token identifiers
     """
+    if not isinstance(original_text, str) or not isinstance(vocabulary, dict) or not isinstance(
+            unknown_token, str):
+        return None
+    list_token_identifiers = []
+    text = original_text.split()
+    for word in text:
+        prepared_word = prepare_word(word, start_of_word_token, end_of_word_token)
+        if not prepared_word:
+            return None
+        tokens_id = tokenize_word(prepared_word, vocabulary, end_of_word_token, unknown_token)
+        if not tokens_id:
+            return None
+        list_token_identifiers.extend(tokens_id)
+    return list_token_identifiers
 
 
 def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None:
@@ -216,6 +262,12 @@ def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None:
     :param order: required number of elements in a single n-gram
     :return: sequence of n-grams
     """
+    if not isinstance(text, str) or not isinstance(order, int):
+        return None
+    sequence_ngrams = []
+    for index in range(len(text) + 1 - order):
+        sequence_ngrams.append(tuple(text[index:order+index]))
+    return sequence_ngrams
 
 
 def calculate_precision(
@@ -227,6 +279,14 @@ def calculate_precision(
     :param reference: expected sequence of n-grams
     :return: value of Precision metric
     """
+    if not isinstance(actual, list) or not isinstance(reference, list):
+        return None
+    if len(actual) == 0:
+        return 0.0
+    unique_reference = set(reference)
+    identical_tokens = [token for token in unique_reference if token in actual]
+    precision = len(identical_tokens) / len(unique_reference)
+    return precision
 
 
 def geo_mean(precisions: list[float], max_order: int) -> float | None:
@@ -236,6 +296,15 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None:
     :param max_order: maximum length of n-gram considered
     :return: value of geometric mean of Precision metric
     """
+    if not isinstance(precisions, list) or not isinstance(max_order, int):
+        return None
+    if not precisions or max_order <= 0:
+        return None
+    all_precision = 1.0
+    for precision in precisions:
+        all_precision *= precision
+    geometric_mean = all_precision**(1.0 / max_order)
+    return geometric_mean
 
 
 def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None:
@@ -246,3 +315,24 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl
     :param max_order: max length of n-gram to consider for comparison
     :return: value of BLEU metric
     """
+    if not isinstance(actual, str) or not isinstance(reference, str) or not isinstance(max_order, int):
+        return None
+    all_ngrams_actual = []
+    all_ngrams_reference = []
+    for order in range(max_order):
+        ngrams_actual = collect_ngrams(actual, order + 1)
+        ngrams_reference = collect_ngrams(reference, order + 1)
+        if not ngrams_actual or not ngrams_reference:
+            return None
+        all_ngrams_actual.append(ngrams_actual)
+        all_ngrams_reference.append(ngrams_reference)
+    precisions = []
+    for ngrams_actual, ngrams_reference in zip(all_ngrams_actual, all_ngrams_reference):
+        presision = calculate_precision(ngrams_actual, ngrams_reference)
+        if not presision:
+            return None
+        precisions.append(presision)
+    blue_metric = geo_mean(precisions, max_order)
+    if blue_metric is None:
+        return None
+    return blue_metric * 100

From a9de02a2a4fd1938d942cb42cf0f575249ca5821 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Sat, 28 Oct 2023 23:34:38 +0300
Subject: [PATCH 37/81] score 10

---
 lab_2_tokenize_by_bpe/target_score.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt
index 45a4fb75d..f599e28b8 100644
--- a/lab_2_tokenize_by_bpe/target_score.txt
+++ b/lab_2_tokenize_by_bpe/target_score.txt
@@ -1 +1 @@
-8
+10

From f6ff2bb4ef757823ab2711acf2d90c20cbe97ce8 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 1 Nov 2023 20:13:34 +0300
Subject: [PATCH 38/81] revert practice

---
 seminars/practice_3_lists.py | 62 ++++++++----------------------------
 1 file changed, 14 insertions(+), 48 deletions(-)

diff --git a/seminars/practice_3_lists.py b/seminars/practice_3_lists.py
index 600b9f265..7301764b1 100644
--- a/seminars/practice_3_lists.py
+++ b/seminars/practice_3_lists.py
@@ -58,16 +58,11 @@ def count_evens(nums: list) -> int:
     """
     Return the number of even ints in the given array.
     """
-    n = 0
-    for element in nums:
-        if element % 2 == 0:
-            n += 1
-    return n
-
+    # student realization goes here
 
 
 # Function calls with expected result:
-count_evens([2, 1, 2, 3, 4])
+# count_evens([2, 1, 2, 3, 4]) → 3
 # count_evens([2, 2, 0]) → 3
 # count_evens([1, 3, 5]) → 0
 
@@ -80,16 +75,12 @@ def sum13(nums: list) -> int:
     so it does not count and numbers that come after a 13
     also do not count.
     """
-    summ = 0
-    for element in nums:
-        if element != 13:
-            summ += element
-    print(summ)
+    # student realization goes here
 
 # Function calls with expected result:
 # sum13([1, 2, 2, 1]) → 6
 # sum13([1, 1]) → 2
-sum13([1, 2, 2, 1, 13])
+# sum13([1, 2, 2, 1, 13]) → 6
 # sum13([1, 2, 2, 1, 13, 5, 6]) → 6
 
 
@@ -102,11 +93,11 @@ def sum67(nums: list) -> int:
     (every 6 will be followed by at least one 7).
     Return 0 for no numbers.
     """
-
+    # student realization goes here
 
 # Function calls with expected result:
-sum67([1, 2, 2])
-print(sum67([1, 2, 2, 6, 99, 99, 7]))
+# sum67([1, 2, 2]) → 5
+# sum67([1, 2, 2, 6, 99, 99, 7]) → 5
 # sum67([1, 1, 6, 7, 2]) → 4
 
 
@@ -117,13 +108,10 @@ def create_phone_number(nums: list) -> str:
     Write a function that accepts an array of 10 integers (between 0 and 9),
     that returns a string of those numbers in the form of a phone number.
     """
-    number = ''.join(str(a) for a in nums)
-    phone = f'({number[:3]} {number[3:6]}-{number[6:]})'
-    return phone
-
+    # student realization goes here
 
 # Function calls with expected result:
-print(create_phone_number([1, 2, 3, 4, 5, 6, 7, 8, 9, 0]))
+# create_phone_number([1, 2, 3, 4, 5, 6, 7, 8, 9, 0])
 # => returns "(123) 456-7890"
 
 
@@ -141,22 +129,10 @@ def check_exam(correct_answers: list, student_answers: list) -> int:
     and +0 for each blank answer, represented as an empty string.
     If the score < 0, return 0.
     """
-    score = 0
-    for i, answer in enumerate(student_answers):
-        if answer == correct_answers[i]:
-            score += 4
-        elif answer != correct_answers[i]:
-            score -= 1
-        elif answer == ' ':
-            score += 0
-    if score < 0:
-        return 0
-    else:
-        return score
-
+    # student realization goes here
 
 # Function calls with expected result:
-print(check_exam(["a", "a", "b", "b"], ["a", "c", "b", "d"]))
+# check_exam(["a", "a", "b", "b"], ["a", "c", "b", "d"]) → 6
 # check_exam(["a", "a", "c", "b"], ["a", "a", "b",  ""]) → 7
 # check_exam(["a", "a", "b", "c"], ["a", "a", "b", "c"]) → 16
 # check_exam(["b", "c", "b", "a"], ["",  "a", "a", "c"]) → 0
@@ -170,24 +146,14 @@ def who_likes_it(names: list) -> str:
     People can "like" blog posts, pictures or other items.
     We want to create the text that should be displayed next to such an item.
     """
-    if names == []:
-        return "no one likes this"
-    if len(names) == 1:
-        return f'{names[0]} likes this'
-    elif len(names) == 2:
-        return f'{names[0]} and {names[1]} like this'
-    elif len(names) == 3:
-        return f'{names[0]}, {names[1]} and {names[2]} like this'
-    else:
-        return f'{names[0]}, {names[1]} and {len(names) - 2} others like this'
-
+    # student realization goes here
 
 # Function calls with expected result:
 # []                                -->  "no one likes this"
 # ["Peter"]                         -->  "Peter likes this"
 # ["Jacob", "Alex"]                 -->  "Jacob and Alex like this"
 # ["Max", "John", "Mark"]           -->  "Max, John and Mark like this"
-print(who_likes_it(["Alex", "Jacob", "Mark", "Max"]))
+# ["Alex", "Jacob", "Mark", "Max"]  -->  "Alex, Jacob and 2 others like this"
 
 
 # Task 7
@@ -222,4 +188,4 @@ def scramble(words: list) -> bool:
 # Function calls with expected result:
 # scramble(['rkqodlw', 'world']) ==> True
 # scramble(['cedewaraaossoqqyt', 'codewars']) ==> True
-# scramble(['katas', 'steak']) ==> False
+# scramble(['katas', 'steak']) ==> False
\ No newline at end of file

From 006ec4f89cc7ed3ec768bd623674b312bf8fb938 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 1 Nov 2023 20:17:43 +0300
Subject: [PATCH 39/81] revert practice

---
 seminars/practice_3_lists.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/seminars/practice_3_lists.py b/seminars/practice_3_lists.py
index 7301764b1..915cdb41f 100644
--- a/seminars/practice_3_lists.py
+++ b/seminars/practice_3_lists.py
@@ -188,4 +188,4 @@ def scramble(words: list) -> bool:
 # Function calls with expected result:
 # scramble(['rkqodlw', 'world']) ==> True
 # scramble(['cedewaraaossoqqyt', 'codewars']) ==> True
-# scramble(['katas', 'steak']) ==> False
\ No newline at end of file
+# scramble(['katas', 'steak']) ==> False

From f2d0f46f5af0f001cd11951ce9aeebe758e9c2a5 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 1 Nov 2023 21:17:28 +0300
Subject: [PATCH 40/81] changes for checks

---
 lab_2_tokenize_by_bpe/main.py | 45 +++++++++++++++--------------------
 1 file changed, 19 insertions(+), 26 deletions(-)

diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py
index 18172301d..72b67cbf7 100644
--- a/lab_2_tokenize_by_bpe/main.py
+++ b/lab_2_tokenize_by_bpe/main.py
@@ -20,10 +20,10 @@ def prepare_word(
             isinstance(end_of_word, str) or end_of_word is None):
         return None
     tokenized_word = []
-    if start_of_word is not None:
+    if start_of_word:
         tokenized_word.append(start_of_word)
-    tokenized_word.extend(element for element in raw_word)
-    if end_of_word is not None:
+    tokenized_word.extend(raw_word)
+    if end_of_word:
         tokenized_word.append(end_of_word)
     return tuple(tokenized_word)
 
@@ -93,9 +93,8 @@ def merge_tokens(
                     list_word[index] = ''
             if '' in list_word:
                 list_word.remove('')
-            merged_frequencies[tuple(list_word)] = count
-        else:
-            merged_frequencies[preprocessed_word] = count
+            preprocessed_word = tuple(list_word)
+        merged_frequencies[preprocessed_word] = count
     return merged_frequencies
 
 
@@ -118,7 +117,8 @@ def train(
             num_merges = len(pairs_of_tokens)
         pairs_max_values = ([token_pair for token_pair, frequency in pairs_of_tokens.items() if
                             frequency == max(pairs_of_tokens.values())])
-        sorted_pairs = sorted([pair for pair in pairs_max_values], key=lambda pair: (-len(str(pair)), pair))
+        sorted_pairs = (sorted(pairs_max_values,
+                               key=lambda pair: (-len(str(pair)), pair)))
         word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0])
         if not word_frequencies:
             return None
@@ -190,22 +190,15 @@ def tokenize_word(
             or not isinstance(vocabulary, dict) or not isinstance(
             end_of_word, (str, type(None))) or not isinstance(unknown_token, str)):
         return None
-    tokens_identifiers = []
-    i = 0
-    while i < len(word):
-        max_length_token = ''
-        for j in range(len(word), i, -1):
-            current_token = "".join(word[i:j])
-            if current_token in vocabulary and len(current_token) > len(max_length_token):
-                max_length_token = current_token
-        if max_length_token:
-            tokens_identifiers.append(vocabulary[max_length_token])
-            i += len(max_length_token)
-        else:
-            if unknown_token in vocabulary:
-                tokens_identifiers.append(vocabulary[unknown_token])
-            i += 1
-    return tokens_identifiers
+    word_str = ''.join(word)
+    sorted_tokens = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x))
+    for token in sorted_tokens:
+        if token in ''.join(word):
+            word_str = word_str.replace(token, str(vocabulary[token]) + ' ')
+    for symbol in ''.join(word):
+        if symbol not in sorted_tokens:
+            word_str = word_str.replace(symbol, str(vocabulary[unknown_token]) + ' ')
+    return [int(identifier) for identifier in word_str.split()]
 
 
 def load_vocabulary(vocab_path: str) -> dict[str, int] | None:
@@ -303,8 +296,7 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None:
     all_precision = 1.0
     for precision in precisions:
         all_precision *= precision
-    geometric_mean = all_precision**(1.0 / max_order)
-    return geometric_mean
+    return float(all_precision**(1.0 / max_order))
 
 
 def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None:
@@ -315,7 +307,8 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl
     :param max_order: max length of n-gram to consider for comparison
     :return: value of BLEU metric
     """
-    if not isinstance(actual, str) or not isinstance(reference, str) or not isinstance(max_order, int):
+    if (not isinstance(actual, str) or not isinstance(reference, str)
+            or not isinstance(max_order, int)):
         return None
     all_ngrams_actual = []
     all_ngrams_reference = []

From 64399ca341154e55e9bab46d0a38d981c576df97 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 2 Nov 2023 16:46:45 +0300
Subject: [PATCH 41/81] start

---
 lab_2_tokenize_by_bpe/start.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py
index 798e957e0..0765fb019 100644
--- a/lab_2_tokenize_by_bpe/start.py
+++ b/lab_2_tokenize_by_bpe/start.py
@@ -1,7 +1,9 @@
 """
 BPE Tokenizer starter
 """
+import json
 from pathlib import Path
+from lab_2_tokenize_by_bpe.main import (calculate_bleu, decode, encode)
 
 
 def main() -> None:
@@ -11,9 +13,25 @@ def main() -> None:
     assets_path = Path(__file__).parent / 'assets'
     with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file:
         text = text_file.read()
+    with open(assets_path / 'vocab.json', 'r', encoding='utf-8') as file:
+        vocabulary = json.load(file)
+    with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file:
+        ru_raw = file.read()
+    with open(assets_path / 'for_translation_ru_encoded.txt', 'r', encoding='utf-8') as file:
+        ru_encoded = file.read()
 
-    result = None
-    assert result, "Encoding is not working"
+    encode_pred = encode(ru_raw, vocabulary, '\u2581', None, '<unk>')
+    correct_tokens = [token for token in encode_pred if token in map(int, ru_encoded.split())]
+    print(f"Файл закодирован правильно на {(len(correct_tokens) / len(encode_pred)*100)}%")
+
+    with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file:
+        encoded_en = file.read()
+    with open(assets_path / 'for_translation_en_raw.txt', 'r', encoding='utf-8') as file:
+        en_raw = file.read()
+
+    decoded_text = decode([int(num) for num in encoded_en.split()], vocabulary, None)
+    decoded_text = decoded_text.replace('\u2581', ' ')
+    print(f'BLUE = {calculate_bleu(decoded_text, en_raw)}')
 
 
 if __name__ == "__main__":

From 26645768e9c1834f8f1619321fec402d401afa15 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 2 Nov 2023 17:06:17 +0300
Subject: [PATCH 42/81] start

---
 lab_2_tokenize_by_bpe/start.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py
index 0765fb019..8da5b2473 100644
--- a/lab_2_tokenize_by_bpe/start.py
+++ b/lab_2_tokenize_by_bpe/start.py
@@ -3,7 +3,8 @@
 """
 import json
 from pathlib import Path
-from lab_2_tokenize_by_bpe.main import (calculate_bleu, decode, encode)
+
+from lab_2_tokenize_by_bpe.main import calculate_bleu, decode, encode
 
 
 def main() -> None:
@@ -22,7 +23,8 @@ def main() -> None:
 
     encode_pred = encode(ru_raw, vocabulary, '\u2581', None, '<unk>')
     correct_tokens = [token for token in encode_pred if token in map(int, ru_encoded.split())]
-    print(f"Файл закодирован правильно на {(len(correct_tokens) / len(encode_pred)*100)}%")
+    if correct_tokens:
+        print(f"Файл закодирован правильно на {(len(list(correct_tokens)) / len(encode_pred)*100)}%")
 
     with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file:
         encoded_en = file.read()

From de9a0c69d8c78f97f0ac70385e04bc4a2d731f1f Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 2 Nov 2023 17:14:32 +0300
Subject: [PATCH 43/81] start

---
 lab_2_tokenize_by_bpe/start.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py
index 8da5b2473..1cf85801d 100644
--- a/lab_2_tokenize_by_bpe/start.py
+++ b/lab_2_tokenize_by_bpe/start.py
@@ -23,8 +23,9 @@ def main() -> None:
 
     encode_pred = encode(ru_raw, vocabulary, '\u2581', None, '<unk>')
     correct_tokens = [token for token in encode_pred if token in map(int, ru_encoded.split())]
-    if correct_tokens:
-        print(f"Файл закодирован правильно на {(len(list(correct_tokens)) / len(encode_pred)*100)}%")
+    if correct_tokens and encode_pred:
+        print((f"Файл закодирован правильно на "
+               f"{(len(list(correct_tokens)) / len(list(encode_pred))*100)}%"))
 
     with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file:
         encoded_en = file.read()
@@ -33,7 +34,9 @@ def main() -> None:
 
     decoded_text = decode([int(num) for num in encoded_en.split()], vocabulary, None)
     decoded_text = decoded_text.replace('\u2581', ' ')
-    print(f'BLUE = {calculate_bleu(decoded_text, en_raw)}')
+    result = calculate_bleu(decoded_text, en_raw)
+    print(f'BLUE = {result}')
+    assert result, "Encoding is not working"
 
 
 if __name__ == "__main__":

From 2b5df06b7bafba7fe577306ba23703dc9cab94db Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 2 Nov 2023 17:33:04 +0300
Subject: [PATCH 44/81] add fixes

---
 lab_2_tokenize_by_bpe/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py
index 72b67cbf7..99764e3b5 100644
--- a/lab_2_tokenize_by_bpe/main.py
+++ b/lab_2_tokenize_by_bpe/main.py
@@ -89,7 +89,7 @@ def merge_tokens(
             list_word = list(preprocessed_word)
             for index in range(len(list_word) - 1):
                 if (list_word[index], list_word[index + 1]) == pair:
-                    list_word[index + 1] = pair[0] + pair[1]
+                    list_word[index + 1] = ''.join(pair)
                     list_word[index] = ''
             if '' in list_word:
                 list_word.remove('')
@@ -143,7 +143,7 @@ def get_vocabulary(
         for token in tuples:
             tokens_list.add(token)
             for element in token:
-                tokens_list.add(element)
+                tokens_list.update(element)
     tokens_list.add(unknown_token)
     sorted_tokens = sorted(tokens_list, key=lambda x: (-len(x), x))
     for index, token in enumerate(sorted_tokens):

From fdada4585b0170b0ac050532bd88b66210316fa1 Mon Sep 17 00:00:00 2001
From: artyomtugaryov <artyomtugaryov@users.noreply.github.com>
Date: Fri, 3 Nov 2023 17:15:53 +0300
Subject: [PATCH 45/81] checkout labs from the origin repository

---
 lab_2_tokenize_by_bpe/main.py  | 308 +++++++++++++--------------------
 lab_2_tokenize_by_bpe/start.py |  39 ++---
 2 files changed, 140 insertions(+), 207 deletions(-)

diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py
index 19a72913f..99764e3b5 100644
--- a/lab_2_tokenize_by_bpe/main.py
+++ b/lab_2_tokenize_by_bpe/main.py
@@ -3,7 +3,6 @@
 BPE and machine translation evaluation
 """
 import json
-import math
 
 
 def prepare_word(
@@ -20,12 +19,13 @@ def prepare_word(
             start_of_word, str) or start_of_word is None) or not (
             isinstance(end_of_word, str) or end_of_word is None):
         return None
-    list_of_tokens = list(raw_word)
-    if end_of_word:
-        list_of_tokens.append(end_of_word)
+    tokenized_word = []
     if start_of_word:
-        list_of_tokens.insert(0, start_of_word)
-    return tuple(list_of_tokens)
+        tokenized_word.append(start_of_word)
+    tokenized_word.extend(raw_word)
+    if end_of_word:
+        tokenized_word.append(end_of_word)
+    return tuple(tokenized_word)
 
 
 def collect_frequencies(
@@ -41,17 +41,16 @@ def collect_frequencies(
     if not isinstance(text, str) or not isinstance(end_of_word, str) or not (
             isinstance(start_of_word, str) or start_of_word is None):
         return None
-
-    dict_frequencies = {}
-
-    splitted_text = text.split()
-    for i in set(splitted_text):
-        word = prepare_word(i, start_of_word, end_of_word)
-        if not word:
-            return None
-        dict_frequencies[word] = splitted_text.count(i)
-
-    return dict_frequencies
+    frequencies_dict = {}
+    for word in text.split():
+        if start_of_word is not None:
+            tokenized_word = prepare_word(word, start_of_word, end_of_word)
+        if start_of_word is None:
+            tokenized_word = prepare_word(word, None, end_of_word)
+            if tokenized_word is None:
+                return None
+            frequencies_dict[tokenized_word] = frequencies_dict.get(tokenized_word, 0) + 1
+    return frequencies_dict
 
 
 def count_tokens_pairs(
@@ -64,17 +63,12 @@ def count_tokens_pairs(
     """
     if not isinstance(word_frequencies, dict):
         return None
-
-    dict_with_pairs = {}
-
-    for word in word_frequencies:
-        for index in range(len(word) - 1):
-            pair = (word[index], word[index + 1])
-            if pair not in dict_with_pairs:
-                dict_with_pairs[pair] = 0
-            dict_with_pairs[pair] += word_frequencies[word]
-
-    return dict_with_pairs
+    pairs_of_tokens = {}
+    for tokens in word_frequencies:
+        for index in range(len(tokens) - 1):
+            pair = (tokens[index], tokens[index + 1])
+            pairs_of_tokens[pair] = pairs_of_tokens.get(pair, 0) + word_frequencies[tokens]
+    return pairs_of_tokens
 
 
 def merge_tokens(
@@ -86,24 +80,22 @@ def merge_tokens(
     :param pair: a pair of tokens to be merged
     :return: dictionary in the form of <preprocessed word: number of occurrences>
     """
-    if not isinstance(word_frequencies, dict) or not isinstance(pair, tuple):
+    if not (isinstance(word_frequencies, dict)
+            and isinstance(pair, tuple)):
         return None
-    dict_merged_tokens = {}
-    for i in word_frequencies:
-        list_word = list(i)
-
-        for index in range(len(list_word) - 1):
-            if (i[index], i[index + 1]) == pair:
-                list_word[index + 1] = pair[0] + pair[1]
-                list_word[index] = ''
-
-        if '' in list_word:
-            list_word.remove('')
-            dict_merged_tokens.update({tuple(list_word): word_frequencies[i]})
-        else:
-            dict_merged_tokens.update({i: word_frequencies[i]})
-
-    return dict_merged_tokens
+    merged_frequencies = {}
+    for preprocessed_word, count in word_frequencies.items():
+        if ''.join(pair) in ''.join(preprocessed_word):
+            list_word = list(preprocessed_word)
+            for index in range(len(list_word) - 1):
+                if (list_word[index], list_word[index + 1]) == pair:
+                    list_word[index + 1] = ''.join(pair)
+                    list_word[index] = ''
+            if '' in list_word:
+                list_word.remove('')
+            preprocessed_word = tuple(list_word)
+        merged_frequencies[preprocessed_word] = count
+    return merged_frequencies
 
 
 def train(
@@ -117,31 +109,20 @@ def train(
     """
     if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int):
         return None
-    dict_with_pairs = count_tokens_pairs(word_frequencies)
-
-    if not dict_with_pairs:
-        return None
-    merges = min(num_merges, len(dict_with_pairs))
-
-    for i in range(merges):
-
-        max_values = max(dict_with_pairs.values())
-        pairs_max_values = [i for i in dict_with_pairs if dict_with_pairs[i] == max_values]
-
-        max_len = max(len(str(pair)) for pair in pairs_max_values)
-        pairs_max_len = [i for i in pairs_max_values if len(str(i)) == max_len]
-
-        sorted_pairs = sorted(pairs_max_len)
+    while num_merges > 0:
+        pairs_of_tokens = count_tokens_pairs(word_frequencies)
+        if not pairs_of_tokens:
+            return None
+        if num_merges > len(pairs_of_tokens):
+            num_merges = len(pairs_of_tokens)
+        pairs_max_values = ([token_pair for token_pair, frequency in pairs_of_tokens.items() if
+                            frequency == max(pairs_of_tokens.values())])
+        sorted_pairs = (sorted(pairs_max_values,
+                               key=lambda pair: (-len(str(pair)), pair)))
         word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0])
-
         if not word_frequencies:
             return None
-
-        dict_with_pairs = count_tokens_pairs(word_frequencies)
-
-        if not dict_with_pairs:
-            return None
-
+        num_merges -= 1
     return word_frequencies
 
 
@@ -156,24 +137,18 @@ def get_vocabulary(
     """
     if not isinstance(word_frequencies, dict) or not isinstance(unknown_token, str):
         return None
-
-    dict_ident = {}
-    unique_tokens = set()
-
-    for tuple_tokens in word_frequencies.keys():
-        for word in tuple_tokens:
-            unique_tokens.update(tuple_tokens, word)
-
-    unique_tokens.add(unknown_token)
-    lex_sorted = sorted(unique_tokens)
-    len_sorted = sorted(lex_sorted, key=len, reverse=True)
-    index = 0
-
-    for token in len_sorted:
-        dict_ident[token] = index
-        index += 1
-
-    return dict_ident
+    tokens_list = set()
+    dict_token_identifier = {}
+    for tuples in word_frequencies:
+        for token in tuples:
+            tokens_list.add(token)
+            for element in token:
+                tokens_list.update(element)
+    tokens_list.add(unknown_token)
+    sorted_tokens = sorted(tokens_list, key=lambda x: (-len(x), x))
+    for index, token in enumerate(sorted_tokens):
+        dict_token_identifier[token] = index
+    return dict_token_identifier
 
 
 def decode(
@@ -189,17 +164,15 @@ def decode(
     if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict) or not (isinstance(
             end_of_word_token, str) or end_of_word_token is None):
         return None
-    decoded = ''
-    for identifier in encoded_text:
-        token_list = [key for key in vocabulary if vocabulary[key] == identifier]
-
-        for token in token_list:
-            decoded += token
-
-    if end_of_word_token:
-        decoded = decoded.replace(end_of_word_token, ' ')
-
-    return decoded
+    decoded_tokens = []
+    for index in encoded_text:
+        for token, token_index in vocabulary.items():
+            if token_index == index and end_of_word_token is not None:
+                decoded_tokens.append(' ' if token == end_of_word_token else token)
+            if vocabulary[token] == index and end_of_word_token is None:
+                decoded_tokens.append('' if token == end_of_word_token else token)
+    decoded_text = ''.join(decoded_tokens)
+    return decoded_text
 
 
 def tokenize_word(
@@ -213,27 +186,19 @@ def tokenize_word(
     :param unknown_token: token that signifies unknown sequence
     :return: list of token identifiers
     """
-    if not isinstance(word, tuple) or not isinstance(vocabulary, dict) or not (isinstance(
-            end_of_word, str) or end_of_word is None) or not isinstance(unknown_token, str):
+    if (not isinstance(word, tuple) or not all(isinstance(w, str) for w in word)
+            or not isinstance(vocabulary, dict) or not isinstance(
+            end_of_word, (str, type(None))) or not isinstance(unknown_token, str)):
         return None
-
-    word_copy = ''.join(word)
-    sorted_vocabulary = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x))
-    result = []
-
-    for key in sorted_vocabulary:
-        while key in word_copy:
-            index = word_copy.count(' ', 0, word_copy.find(key))
-            result.insert(index, vocabulary[key])
-            word_copy = word_copy.replace(key, ' ', 1)
-
-    for unk in word_copy:
-        if unk != ' ':
-            index = word_copy.find(unk)
-            word_copy = word_copy.replace(unk, ' ')
-            result.insert(index, vocabulary[unknown_token])
-
-    return result
+    word_str = ''.join(word)
+    sorted_tokens = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x))
+    for token in sorted_tokens:
+        if token in ''.join(word):
+            word_str = word_str.replace(token, str(vocabulary[token]) + ' ')
+    for symbol in ''.join(word):
+        if symbol not in sorted_tokens:
+            word_str = word_str.replace(symbol, str(vocabulary[unknown_token]) + ' ')
+    return [int(identifier) for identifier in word_str.split()]
 
 
 def load_vocabulary(vocab_path: str) -> dict[str, int] | None:
@@ -244,14 +209,11 @@ def load_vocabulary(vocab_path: str) -> dict[str, int] | None:
     """
     if not isinstance(vocab_path, str):
         return None
-
     with open(vocab_path, 'r', encoding='utf-8') as f:
-        vocab = json.load(f)
-
-    if not isinstance(vocab, dict):
+        vocabulary = json.load(f)
+    if not isinstance(vocabulary, dict):
         return None
-
-    return vocab
+    return vocabulary
 
 
 def encode(
@@ -270,26 +232,20 @@ def encode(
     :param unknown_token: token that signifies unknown sequence
     :return: list of token identifiers
     """
-    if not isinstance(original_text, str) or not isinstance(
-            vocabulary, dict) or not (isinstance(
-            start_of_word_token, str) or start_of_word_token is None) or not (isinstance(
-            end_of_word_token, str) or end_of_word_token is None) or not isinstance(
+    if not isinstance(original_text, str) or not isinstance(vocabulary, dict) or not isinstance(
             unknown_token, str):
         return None
-
-    encoded = []
-    split_text = original_text.split()
-
-    for word in split_text:
-        prepared = prepare_word(word, start_of_word_token, end_of_word_token)
-        if not prepared:
+    list_token_identifiers = []
+    text = original_text.split()
+    for word in text:
+        prepared_word = prepare_word(word, start_of_word_token, end_of_word_token)
+        if not prepared_word:
             return None
-        result = tokenize_word(prepared, vocabulary, end_of_word_token, unknown_token)
-        if not result:
+        tokens_id = tokenize_word(prepared_word, vocabulary, end_of_word_token, unknown_token)
+        if not tokens_id:
             return None
-        encoded.extend(result)
-
-    return encoded
+        list_token_identifiers.extend(tokens_id)
+    return list_token_identifiers
 
 
 def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None:
@@ -301,12 +257,10 @@ def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None:
     """
     if not isinstance(text, str) or not isinstance(order, int):
         return None
-
-    n_grams = []
+    sequence_ngrams = []
     for index in range(len(text) + 1 - order):
-        n_grams.append(tuple(text[index: index + order]))
-
-    return n_grams
+        sequence_ngrams.append(tuple(text[index:order+index]))
+    return sequence_ngrams
 
 
 def calculate_precision(
@@ -320,15 +274,12 @@ def calculate_precision(
     """
     if not isinstance(actual, list) or not isinstance(reference, list):
         return None
-
-    unique_ngrams = set(reference)
-    matches = 0
-
-    for n_gram in unique_ngrams:
-        if n_gram in actual:
-            matches += 1
-
-    return matches / len(unique_ngrams)
+    if len(actual) == 0:
+        return 0.0
+    unique_reference = set(reference)
+    identical_tokens = [token for token in unique_reference if token in actual]
+    precision = len(identical_tokens) / len(unique_reference)
+    return precision
 
 
 def geo_mean(precisions: list[float], max_order: int) -> float | None:
@@ -340,15 +291,12 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None:
     """
     if not isinstance(precisions, list) or not isinstance(max_order, int):
         return None
-
-    summation = float(0)
-
-    for order in range(max_order):
-        if precisions[order] < 0:
-            return 0
-        summation += math.log(precisions[order])
-
-    return math.exp(1 / max_order * summation)
+    if not precisions or max_order <= 0:
+        return None
+    all_precision = 1.0
+    for precision in precisions:
+        all_precision *= precision
+    return float(all_precision**(1.0 / max_order))
 
 
 def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None:
@@ -359,31 +307,25 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl
     :param max_order: max length of n-gram to consider for comparison
     :return: value of BLEU metric
     """
-    if not isinstance(actual, str) or not isinstance(
-            reference, str) or max_order != 3:
+    if (not isinstance(actual, str) or not isinstance(reference, str)
+            or not isinstance(max_order, int)):
         return None
-
-    actual_ngrams = []
-    reference_ngrams = []
-
+    all_ngrams_actual = []
+    all_ngrams_reference = []
     for order in range(max_order):
-        actual_ngram = collect_ngrams(actual, order + 1)
-        reference_ngram = collect_ngrams(reference, order + 1)
-        if actual_ngram is None or reference_ngram is None:
+        ngrams_actual = collect_ngrams(actual, order + 1)
+        ngrams_reference = collect_ngrams(reference, order + 1)
+        if not ngrams_actual or not ngrams_reference:
             return None
-        actual_ngrams.append(actual_ngram)
-        reference_ngrams.append(reference_ngram)
-
+        all_ngrams_actual.append(ngrams_actual)
+        all_ngrams_reference.append(ngrams_reference)
     precisions = []
-
-    for i, j in zip(actual_ngrams, reference_ngrams):
-        precision = calculate_precision(i, j)
-        if precision is None:
+    for ngrams_actual, ngrams_reference in zip(all_ngrams_actual, all_ngrams_reference):
+        presision = calculate_precision(ngrams_actual, ngrams_reference)
+        if not presision:
             return None
-        precisions.append(precision)
-
-    average = geo_mean(precisions, max_order)
-    if average is None:
+        precisions.append(presision)
+    blue_metric = geo_mean(precisions, max_order)
+    if blue_metric is None:
         return None
-
-    return average * 100
+    return blue_metric * 100
diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py
index d71b1c9c4..1cf85801d 100644
--- a/lab_2_tokenize_by_bpe/start.py
+++ b/lab_2_tokenize_by_bpe/start.py
@@ -4,8 +4,7 @@
 import json
 from pathlib import Path
 
-from lab_2_tokenize_by_bpe.main import (calculate_bleu, collect_frequencies, decode, encode,
-                                        get_vocabulary, train)
+from lab_2_tokenize_by_bpe.main import calculate_bleu, decode, encode
 
 
 def main() -> None:
@@ -15,37 +14,29 @@ def main() -> None:
     assets_path = Path(__file__).parent / 'assets'
     with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file:
         text = text_file.read()
-    with open(assets_path / 'secrets/secret_2.txt', 'r', encoding='utf-8') as text_file:
-        encoded_secret = text_file.read()
-    dict_frequencies = collect_frequencies(text, None, '</s>')
-    merged_tokens = train(dict_frequencies, 100)
-    if merged_tokens:
-        vocabulary = get_vocabulary(merged_tokens, '<unk>')
-        secret = [int(num) for num in encoded_secret.split()]
-        result = decode(secret, vocabulary, '</s>')
-        print(result)
-        assert result, "Encoding is not working"
-
-    with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file:
-        predicted = file.read()
     with open(assets_path / 'vocab.json', 'r', encoding='utf-8') as file:
         vocabulary = json.load(file)
+    with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file:
+        ru_raw = file.read()
     with open(assets_path / 'for_translation_ru_encoded.txt', 'r', encoding='utf-8') as file:
-        actual = file.read()
+        ru_encoded = file.read()
 
-    if [int(token) for token in actual.split()] == encode(
-            predicted, vocabulary, '\u2581', None, '<unk>'):
-        print("Encoding is successful!")
+    encode_pred = encode(ru_raw, vocabulary, '\u2581', None, '<unk>')
+    correct_tokens = [token for token in encode_pred if token in map(int, ru_encoded.split())]
+    if correct_tokens and encode_pred:
+        print((f"Файл закодирован правильно на "
+               f"{(len(list(correct_tokens)) / len(list(encode_pred))*100)}%"))
 
     with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file:
         encoded_en = file.read()
     with open(assets_path / 'for_translation_en_raw.txt', 'r', encoding='utf-8') as file:
-        decoded_en = file.read()
-
-    decoded = decode([int(num) for num in encoded_en.split()], vocabulary, None)
-    decoded = decoded.replace('\u2581', ' ')
+        en_raw = file.read()
 
-    print(calculate_bleu(decoded, decoded_en))
+    decoded_text = decode([int(num) for num in encoded_en.split()], vocabulary, None)
+    decoded_text = decoded_text.replace('\u2581', ' ')
+    result = calculate_bleu(decoded_text, en_raw)
+    print(f'BLUE = {result}')
+    assert result, "Encoding is not working"
 
 
 if __name__ == "__main__":

From e680ce38993f92dbff17be0cc6c0e25419d074a9 Mon Sep 17 00:00:00 2001
From: artyomtugaryov <artyomtugaryov@users.noreply.github.com>
Date: Fri, 3 Nov 2023 17:24:32 +0300
Subject: [PATCH 46/81] checkout labs from the origin repository

---
 lab_2_tokenize_by_bpe/main.py  | 308 ++++++++++++++++++++-------------
 lab_2_tokenize_by_bpe/start.py |  39 +++--
 2 files changed, 207 insertions(+), 140 deletions(-)

diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py
index 99764e3b5..19a72913f 100644
--- a/lab_2_tokenize_by_bpe/main.py
+++ b/lab_2_tokenize_by_bpe/main.py
@@ -3,6 +3,7 @@
 BPE and machine translation evaluation
 """
 import json
+import math
 
 
 def prepare_word(
@@ -19,13 +20,12 @@ def prepare_word(
             start_of_word, str) or start_of_word is None) or not (
             isinstance(end_of_word, str) or end_of_word is None):
         return None
-    tokenized_word = []
-    if start_of_word:
-        tokenized_word.append(start_of_word)
-    tokenized_word.extend(raw_word)
+    list_of_tokens = list(raw_word)
     if end_of_word:
-        tokenized_word.append(end_of_word)
-    return tuple(tokenized_word)
+        list_of_tokens.append(end_of_word)
+    if start_of_word:
+        list_of_tokens.insert(0, start_of_word)
+    return tuple(list_of_tokens)
 
 
 def collect_frequencies(
@@ -41,16 +41,17 @@ def collect_frequencies(
     if not isinstance(text, str) or not isinstance(end_of_word, str) or not (
             isinstance(start_of_word, str) or start_of_word is None):
         return None
-    frequencies_dict = {}
-    for word in text.split():
-        if start_of_word is not None:
-            tokenized_word = prepare_word(word, start_of_word, end_of_word)
-        if start_of_word is None:
-            tokenized_word = prepare_word(word, None, end_of_word)
-            if tokenized_word is None:
-                return None
-            frequencies_dict[tokenized_word] = frequencies_dict.get(tokenized_word, 0) + 1
-    return frequencies_dict
+
+    dict_frequencies = {}
+
+    splitted_text = text.split()
+    for i in set(splitted_text):
+        word = prepare_word(i, start_of_word, end_of_word)
+        if not word:
+            return None
+        dict_frequencies[word] = splitted_text.count(i)
+
+    return dict_frequencies
 
 
 def count_tokens_pairs(
@@ -63,12 +64,17 @@ def count_tokens_pairs(
     """
     if not isinstance(word_frequencies, dict):
         return None
-    pairs_of_tokens = {}
-    for tokens in word_frequencies:
-        for index in range(len(tokens) - 1):
-            pair = (tokens[index], tokens[index + 1])
-            pairs_of_tokens[pair] = pairs_of_tokens.get(pair, 0) + word_frequencies[tokens]
-    return pairs_of_tokens
+
+    dict_with_pairs = {}
+
+    for word in word_frequencies:
+        for index in range(len(word) - 1):
+            pair = (word[index], word[index + 1])
+            if pair not in dict_with_pairs:
+                dict_with_pairs[pair] = 0
+            dict_with_pairs[pair] += word_frequencies[word]
+
+    return dict_with_pairs
 
 
 def merge_tokens(
@@ -80,22 +86,24 @@ def merge_tokens(
     :param pair: a pair of tokens to be merged
     :return: dictionary in the form of <preprocessed word: number of occurrences>
     """
-    if not (isinstance(word_frequencies, dict)
-            and isinstance(pair, tuple)):
+    if not isinstance(word_frequencies, dict) or not isinstance(pair, tuple):
         return None
-    merged_frequencies = {}
-    for preprocessed_word, count in word_frequencies.items():
-        if ''.join(pair) in ''.join(preprocessed_word):
-            list_word = list(preprocessed_word)
-            for index in range(len(list_word) - 1):
-                if (list_word[index], list_word[index + 1]) == pair:
-                    list_word[index + 1] = ''.join(pair)
-                    list_word[index] = ''
-            if '' in list_word:
-                list_word.remove('')
-            preprocessed_word = tuple(list_word)
-        merged_frequencies[preprocessed_word] = count
-    return merged_frequencies
+    dict_merged_tokens = {}
+    for i in word_frequencies:
+        list_word = list(i)
+
+        for index in range(len(list_word) - 1):
+            if (i[index], i[index + 1]) == pair:
+                list_word[index + 1] = pair[0] + pair[1]
+                list_word[index] = ''
+
+        if '' in list_word:
+            list_word.remove('')
+            dict_merged_tokens.update({tuple(list_word): word_frequencies[i]})
+        else:
+            dict_merged_tokens.update({i: word_frequencies[i]})
+
+    return dict_merged_tokens
 
 
 def train(
@@ -109,20 +117,31 @@ def train(
     """
     if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int):
         return None
-    while num_merges > 0:
-        pairs_of_tokens = count_tokens_pairs(word_frequencies)
-        if not pairs_of_tokens:
-            return None
-        if num_merges > len(pairs_of_tokens):
-            num_merges = len(pairs_of_tokens)
-        pairs_max_values = ([token_pair for token_pair, frequency in pairs_of_tokens.items() if
-                            frequency == max(pairs_of_tokens.values())])
-        sorted_pairs = (sorted(pairs_max_values,
-                               key=lambda pair: (-len(str(pair)), pair)))
+    dict_with_pairs = count_tokens_pairs(word_frequencies)
+
+    if not dict_with_pairs:
+        return None
+    merges = min(num_merges, len(dict_with_pairs))
+
+    for i in range(merges):
+
+        max_values = max(dict_with_pairs.values())
+        pairs_max_values = [i for i in dict_with_pairs if dict_with_pairs[i] == max_values]
+
+        max_len = max(len(str(pair)) for pair in pairs_max_values)
+        pairs_max_len = [i for i in pairs_max_values if len(str(i)) == max_len]
+
+        sorted_pairs = sorted(pairs_max_len)
         word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0])
+
         if not word_frequencies:
             return None
-        num_merges -= 1
+
+        dict_with_pairs = count_tokens_pairs(word_frequencies)
+
+        if not dict_with_pairs:
+            return None
+
     return word_frequencies
 
 
@@ -137,18 +156,24 @@ def get_vocabulary(
     """
     if not isinstance(word_frequencies, dict) or not isinstance(unknown_token, str):
         return None
-    tokens_list = set()
-    dict_token_identifier = {}
-    for tuples in word_frequencies:
-        for token in tuples:
-            tokens_list.add(token)
-            for element in token:
-                tokens_list.update(element)
-    tokens_list.add(unknown_token)
-    sorted_tokens = sorted(tokens_list, key=lambda x: (-len(x), x))
-    for index, token in enumerate(sorted_tokens):
-        dict_token_identifier[token] = index
-    return dict_token_identifier
+
+    dict_ident = {}
+    unique_tokens = set()
+
+    for tuple_tokens in word_frequencies.keys():
+        for word in tuple_tokens:
+            unique_tokens.update(tuple_tokens, word)
+
+    unique_tokens.add(unknown_token)
+    lex_sorted = sorted(unique_tokens)
+    len_sorted = sorted(lex_sorted, key=len, reverse=True)
+    index = 0
+
+    for token in len_sorted:
+        dict_ident[token] = index
+        index += 1
+
+    return dict_ident
 
 
 def decode(
@@ -164,15 +189,17 @@ def decode(
     if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict) or not (isinstance(
             end_of_word_token, str) or end_of_word_token is None):
         return None
-    decoded_tokens = []
-    for index in encoded_text:
-        for token, token_index in vocabulary.items():
-            if token_index == index and end_of_word_token is not None:
-                decoded_tokens.append(' ' if token == end_of_word_token else token)
-            if vocabulary[token] == index and end_of_word_token is None:
-                decoded_tokens.append('' if token == end_of_word_token else token)
-    decoded_text = ''.join(decoded_tokens)
-    return decoded_text
+    decoded = ''
+    for identifier in encoded_text:
+        token_list = [key for key in vocabulary if vocabulary[key] == identifier]
+
+        for token in token_list:
+            decoded += token
+
+    if end_of_word_token:
+        decoded = decoded.replace(end_of_word_token, ' ')
+
+    return decoded
 
 
 def tokenize_word(
@@ -186,19 +213,27 @@ def tokenize_word(
     :param unknown_token: token that signifies unknown sequence
     :return: list of token identifiers
     """
-    if (not isinstance(word, tuple) or not all(isinstance(w, str) for w in word)
-            or not isinstance(vocabulary, dict) or not isinstance(
-            end_of_word, (str, type(None))) or not isinstance(unknown_token, str)):
+    if not isinstance(word, tuple) or not isinstance(vocabulary, dict) or not (isinstance(
+            end_of_word, str) or end_of_word is None) or not isinstance(unknown_token, str):
         return None
-    word_str = ''.join(word)
-    sorted_tokens = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x))
-    for token in sorted_tokens:
-        if token in ''.join(word):
-            word_str = word_str.replace(token, str(vocabulary[token]) + ' ')
-    for symbol in ''.join(word):
-        if symbol not in sorted_tokens:
-            word_str = word_str.replace(symbol, str(vocabulary[unknown_token]) + ' ')
-    return [int(identifier) for identifier in word_str.split()]
+
+    word_copy = ''.join(word)
+    sorted_vocabulary = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x))
+    result = []
+
+    for key in sorted_vocabulary:
+        while key in word_copy:
+            index = word_copy.count(' ', 0, word_copy.find(key))
+            result.insert(index, vocabulary[key])
+            word_copy = word_copy.replace(key, ' ', 1)
+
+    for unk in word_copy:
+        if unk != ' ':
+            index = word_copy.find(unk)
+            word_copy = word_copy.replace(unk, ' ')
+            result.insert(index, vocabulary[unknown_token])
+
+    return result
 
 
 def load_vocabulary(vocab_path: str) -> dict[str, int] | None:
@@ -209,11 +244,14 @@ def load_vocabulary(vocab_path: str) -> dict[str, int] | None:
     """
     if not isinstance(vocab_path, str):
         return None
+
     with open(vocab_path, 'r', encoding='utf-8') as f:
-        vocabulary = json.load(f)
-    if not isinstance(vocabulary, dict):
+        vocab = json.load(f)
+
+    if not isinstance(vocab, dict):
         return None
-    return vocabulary
+
+    return vocab
 
 
 def encode(
@@ -232,20 +270,26 @@ def encode(
     :param unknown_token: token that signifies unknown sequence
     :return: list of token identifiers
     """
-    if not isinstance(original_text, str) or not isinstance(vocabulary, dict) or not isinstance(
+    if not isinstance(original_text, str) or not isinstance(
+            vocabulary, dict) or not (isinstance(
+            start_of_word_token, str) or start_of_word_token is None) or not (isinstance(
+            end_of_word_token, str) or end_of_word_token is None) or not isinstance(
             unknown_token, str):
         return None
-    list_token_identifiers = []
-    text = original_text.split()
-    for word in text:
-        prepared_word = prepare_word(word, start_of_word_token, end_of_word_token)
-        if not prepared_word:
+
+    encoded = []
+    split_text = original_text.split()
+
+    for word in split_text:
+        prepared = prepare_word(word, start_of_word_token, end_of_word_token)
+        if not prepared:
             return None
-        tokens_id = tokenize_word(prepared_word, vocabulary, end_of_word_token, unknown_token)
-        if not tokens_id:
+        result = tokenize_word(prepared, vocabulary, end_of_word_token, unknown_token)
+        if not result:
             return None
-        list_token_identifiers.extend(tokens_id)
-    return list_token_identifiers
+        encoded.extend(result)
+
+    return encoded
 
 
 def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None:
@@ -257,10 +301,12 @@ def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None:
     """
     if not isinstance(text, str) or not isinstance(order, int):
         return None
-    sequence_ngrams = []
+
+    n_grams = []
     for index in range(len(text) + 1 - order):
-        sequence_ngrams.append(tuple(text[index:order+index]))
-    return sequence_ngrams
+        n_grams.append(tuple(text[index: index + order]))
+
+    return n_grams
 
 
 def calculate_precision(
@@ -274,12 +320,15 @@ def calculate_precision(
     """
     if not isinstance(actual, list) or not isinstance(reference, list):
         return None
-    if len(actual) == 0:
-        return 0.0
-    unique_reference = set(reference)
-    identical_tokens = [token for token in unique_reference if token in actual]
-    precision = len(identical_tokens) / len(unique_reference)
-    return precision
+
+    unique_ngrams = set(reference)
+    matches = 0
+
+    for n_gram in unique_ngrams:
+        if n_gram in actual:
+            matches += 1
+
+    return matches / len(unique_ngrams)
 
 
 def geo_mean(precisions: list[float], max_order: int) -> float | None:
@@ -291,12 +340,15 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None:
     """
     if not isinstance(precisions, list) or not isinstance(max_order, int):
         return None
-    if not precisions or max_order <= 0:
-        return None
-    all_precision = 1.0
-    for precision in precisions:
-        all_precision *= precision
-    return float(all_precision**(1.0 / max_order))
+
+    summation = float(0)
+
+    for order in range(max_order):
+        if precisions[order] < 0:
+            return 0
+        summation += math.log(precisions[order])
+
+    return math.exp(1 / max_order * summation)
 
 
 def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None:
@@ -307,25 +359,31 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl
     :param max_order: max length of n-gram to consider for comparison
     :return: value of BLEU metric
     """
-    if (not isinstance(actual, str) or not isinstance(reference, str)
-            or not isinstance(max_order, int)):
+    if not isinstance(actual, str) or not isinstance(
+            reference, str) or max_order != 3:
         return None
-    all_ngrams_actual = []
-    all_ngrams_reference = []
+
+    actual_ngrams = []
+    reference_ngrams = []
+
     for order in range(max_order):
-        ngrams_actual = collect_ngrams(actual, order + 1)
-        ngrams_reference = collect_ngrams(reference, order + 1)
-        if not ngrams_actual or not ngrams_reference:
+        actual_ngram = collect_ngrams(actual, order + 1)
+        reference_ngram = collect_ngrams(reference, order + 1)
+        if actual_ngram is None or reference_ngram is None:
             return None
-        all_ngrams_actual.append(ngrams_actual)
-        all_ngrams_reference.append(ngrams_reference)
+        actual_ngrams.append(actual_ngram)
+        reference_ngrams.append(reference_ngram)
+
     precisions = []
-    for ngrams_actual, ngrams_reference in zip(all_ngrams_actual, all_ngrams_reference):
-        presision = calculate_precision(ngrams_actual, ngrams_reference)
-        if not presision:
+
+    for i, j in zip(actual_ngrams, reference_ngrams):
+        precision = calculate_precision(i, j)
+        if precision is None:
             return None
-        precisions.append(presision)
-    blue_metric = geo_mean(precisions, max_order)
-    if blue_metric is None:
+        precisions.append(precision)
+
+    average = geo_mean(precisions, max_order)
+    if average is None:
         return None
-    return blue_metric * 100
+
+    return average * 100
diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py
index 1cf85801d..d71b1c9c4 100644
--- a/lab_2_tokenize_by_bpe/start.py
+++ b/lab_2_tokenize_by_bpe/start.py
@@ -4,7 +4,8 @@
 import json
 from pathlib import Path
 
-from lab_2_tokenize_by_bpe.main import calculate_bleu, decode, encode
+from lab_2_tokenize_by_bpe.main import (calculate_bleu, collect_frequencies, decode, encode,
+                                        get_vocabulary, train)
 
 
 def main() -> None:
@@ -14,29 +15,37 @@ def main() -> None:
     assets_path = Path(__file__).parent / 'assets'
     with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file:
         text = text_file.read()
+    with open(assets_path / 'secrets/secret_2.txt', 'r', encoding='utf-8') as text_file:
+        encoded_secret = text_file.read()
+    dict_frequencies = collect_frequencies(text, None, '</s>')
+    merged_tokens = train(dict_frequencies, 100)
+    if merged_tokens:
+        vocabulary = get_vocabulary(merged_tokens, '<unk>')
+        secret = [int(num) for num in encoded_secret.split()]
+        result = decode(secret, vocabulary, '</s>')
+        print(result)
+        assert result, "Encoding is not working"
+
+    with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file:
+        predicted = file.read()
     with open(assets_path / 'vocab.json', 'r', encoding='utf-8') as file:
         vocabulary = json.load(file)
-    with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file:
-        ru_raw = file.read()
     with open(assets_path / 'for_translation_ru_encoded.txt', 'r', encoding='utf-8') as file:
-        ru_encoded = file.read()
+        actual = file.read()
 
-    encode_pred = encode(ru_raw, vocabulary, '\u2581', None, '<unk>')
-    correct_tokens = [token for token in encode_pred if token in map(int, ru_encoded.split())]
-    if correct_tokens and encode_pred:
-        print((f"Файл закодирован правильно на "
-               f"{(len(list(correct_tokens)) / len(list(encode_pred))*100)}%"))
+    if [int(token) for token in actual.split()] == encode(
+            predicted, vocabulary, '\u2581', None, '<unk>'):
+        print("Encoding is successful!")
 
     with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file:
         encoded_en = file.read()
     with open(assets_path / 'for_translation_en_raw.txt', 'r', encoding='utf-8') as file:
-        en_raw = file.read()
+        decoded_en = file.read()
+
+    decoded = decode([int(num) for num in encoded_en.split()], vocabulary, None)
+    decoded = decoded.replace('\u2581', ' ')
 
-    decoded_text = decode([int(num) for num in encoded_en.split()], vocabulary, None)
-    decoded_text = decoded_text.replace('\u2581', ' ')
-    result = calculate_bleu(decoded_text, en_raw)
-    print(f'BLUE = {result}')
-    assert result, "Encoding is not working"
+    print(calculate_bleu(decoded, decoded_en))
 
 
 if __name__ == "__main__":

From 5217fa8a5e35f26458911f6185fc8ea581eee0ed Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 7 Nov 2023 13:36:11 +0300
Subject: [PATCH 47/81] changes for 4

---
 lab_3_generate_by_ngrams/main.py | 86 ++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index dcf4e8af9..591f7b815 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -23,6 +23,8 @@ def __init__(self, end_of_word_token: str) -> None:
         Args:
             end_of_word_token (str): A token denoting word boundary
         """
+        self._end_of_word_token = end_of_word_token
+        self._storage = {'_': 0}
 
     def _tokenize(self, text: str) -> Optional[tuple[str, ...]]:
         """
@@ -41,6 +43,19 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]:
         In case of corrupt input arguments, None is returned.
         In case any of methods used return None, None is returned.
         """
+        if not isinstance(text, str) or text == '':
+            return None
+
+        list_text = text.split()
+        str_text = self._end_of_word_token.join(list_text)
+        new_str = ''
+        for token in str_text:
+            if token.isalpha() or token == '_':
+                new_str += token
+        new_str += '_'
+        if '__' in new_str:
+            copy_str = new_str.replace('__', '_')
+            return tuple([token.lower() for token in copy_str])
 
     def get_id(self, element: str) -> Optional[int]:
         """
@@ -55,6 +70,10 @@ def get_id(self, element: str) -> Optional[int]:
         In case of corrupt input arguments or arguments not included in storage,
         None is returned
         """
+        if not isinstance(element, str) or element not in self._storage:
+            return None
+
+        return self._storage[element]
 
     def get_end_of_word_token(self) -> str:
         """
@@ -63,6 +82,7 @@ def get_end_of_word_token(self) -> str:
         Returns:
             str: EoW token
         """
+        return self._end_of_word_token
 
     def get_token(self, element_id: int) -> Optional[str]:
         """
@@ -76,6 +96,12 @@ def get_token(self, element_id: int) -> Optional[str]:
 
         In case of corrupt input arguments or arguments not included in storage, None is returned
         """
+        if not isinstance(element_id, int) or element_id not in self._storage.values():
+            return None
+
+        for key in self._storage:
+            if self._storage[key] == element_id:
+                return key
 
     def encode(self, text: str) -> Optional[tuple[int, ...]]:
         """
@@ -93,6 +119,21 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]:
         In case of corrupt input arguments, None is returned.
         In case any of methods used return None, None is returned.
         """
+        if not isinstance(text, str) or len(text) == 0:
+            return None
+
+        encoded_text = []
+        tokenized_text = self._tokenize(text)
+        if not tokenized_text:
+            return None
+
+        for token in tokenized_text:
+            self._put(token)
+            element_id = self.get_id(token)
+            if not isinstance(element_id, int):
+                return None
+            encoded_text.append(element_id)
+        return tuple(encoded_text)
 
     def _put(self, element: str) -> None:
         """
@@ -104,6 +145,11 @@ def _put(self, element: str) -> None:
         In case of corrupt input arguments or invalid argument length,
         an element is not added to storage
         """
+        if not isinstance(element, str) or len(element) != 1:
+            return None
+        if element not in self._storage:
+            self._storage[element] = len(self._storage)
+
 
     def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]:
         """
@@ -121,6 +167,16 @@ def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]:
         In case of corrupt input arguments, None is returned.
         In case any of methods used return None, None is returned.
         """
+        if not isinstance(encoded_corpus, tuple) or not encoded_corpus:
+            return None
+
+        decoded_tokens = self._decode(encoded_corpus)
+        if not decoded_tokens:
+            return None
+        decoded_text = self._postprocess_decoded_text(decoded_tokens)
+        if not decoded_text:
+            return None
+        return decoded_text
 
     def fill_from_ngrams(self, content: dict) -> None:
         """
@@ -129,6 +185,12 @@ def fill_from_ngrams(self, content: dict) -> None:
         Args:
             content (dict): ngrams from external JSON
         """
+        if not isinstance(content, dict) or not content:
+            return None
+        for key in content['freq']:
+            for element in key:
+                if element.isalpha():
+                    self._put(element)
 
     def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]:
         """
@@ -143,6 +205,18 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]:
         In case of corrupt input arguments, None is returned.
         In case any of methods used return None, None is returned.
         """
+        if not isinstance(corpus, tuple) or not corpus:
+            return None
+
+        list_corpus = []
+        for element_id in corpus:
+            if not isinstance(element_id, int):
+                return None
+            token = self.get_token(element_id)
+            if not token:
+                return None
+            list_corpus.append(token)
+        return tuple(list_corpus)
 
     def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional[str]:
         """
@@ -159,6 +233,18 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional
 
         In case of corrupt input arguments, None is returned
         """
+        if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0:
+            return None
+
+        decoded_text = decoded_corpus[0].upper()
+        for token in decoded_corpus[1:-1]:
+            if token == self._end_of_word_token:
+                decoded_text += ' '
+            else:
+                decoded_text += token
+        if decoded_corpus[-1] != self._end_of_word_token:
+            decoded_text += decoded_corpus[-1]
+        return decoded_text + '.'
 
 
 class NGramLanguageModel:

From 8426d68918b3edec6b26ca38cd2e7ef101e3bfd1 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 7 Nov 2023 13:37:10 +0300
Subject: [PATCH 48/81] start

---
 lab_3_generate_by_ngrams/start.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py
index b9bcbd999..c04e32b39 100644
--- a/lab_3_generate_by_ngrams/start.py
+++ b/lab_3_generate_by_ngrams/start.py
@@ -1,6 +1,7 @@
 """
 Generation by NGrams starter
 """
+from lab_3_generate_by_ngrams.main import TextProcessor
 
 
 def main() -> None:
@@ -11,8 +12,11 @@ def main() -> None:
     """
     with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file:
         text = text_file.read()
-    result = None
-    assert result
+    corpus = TextProcessor('_')
+    encoded_text = corpus.encode(text)
+    decoded_text = corpus.decode(encoded_text)
+    print(encoded_text)
+    print(decoded_text)
 
 
 if __name__ == "__main__":

From d396d1ec27dee0cd919413ed0e1c386c3c0b9f61 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 7 Nov 2023 13:40:20 +0300
Subject: [PATCH 49/81] score

---
 lab_3_generate_by_ngrams/target_score.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt
index 573541ac9..b8626c4cf 100644
--- a/lab_3_generate_by_ngrams/target_score.txt
+++ b/lab_3_generate_by_ngrams/target_score.txt
@@ -1 +1 @@
-0
+4

From f6ed07b1b71a6314816cdff367f45387bb4036b1 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 7 Nov 2023 13:45:25 +0300
Subject: [PATCH 50/81] start

---
 lab_3_generate_by_ngrams/start.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py
index c04e32b39..7eee1289d 100644
--- a/lab_3_generate_by_ngrams/start.py
+++ b/lab_3_generate_by_ngrams/start.py
@@ -14,9 +14,10 @@ def main() -> None:
         text = text_file.read()
     corpus = TextProcessor('_')
     encoded_text = corpus.encode(text)
-    decoded_text = corpus.decode(encoded_text)
+    result = corpus.decode(encoded_text)
     print(encoded_text)
-    print(decoded_text)
+    print(result)
+    assert result
 
 
 if __name__ == "__main__":

From ed8da3ef22b9ffa4d834a170eea2490ac4a8b703 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Fri, 10 Nov 2023 09:29:19 +0300
Subject: [PATCH 51/81] added fixes

---
 lab_3_generate_by_ngrams/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index 591f7b815..1148c1de4 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -150,7 +150,6 @@ def _put(self, element: str) -> None:
         if element not in self._storage:
             self._storage[element] = len(self._storage)
 
-
     def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]:
         """
         Decode and postprocess encoded corpus by converting integer identifiers to string.
@@ -266,6 +265,7 @@ def __init__(self, encoded_corpus: tuple | None, n_gram_size: int) -> None:
             n_gram_size (int): A size of n-grams to use for language modelling
         """
 
+
     def get_n_gram_size(self) -> int:
         """
         Retrieve value stored in self._n_gram_size attribute.

From baab5b64a81a975ccc6f49f69696f2f2dacf1f3b Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 15 Nov 2023 13:24:00 +0300
Subject: [PATCH 52/81] added fixes

---
 lab_3_generate_by_ngrams/main.py | 68 +++++++++++++++++++++++++++++++-
 1 file changed, 66 insertions(+), 2 deletions(-)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index 1148c1de4..5ba7f8431 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -24,7 +24,7 @@ def __init__(self, end_of_word_token: str) -> None:
             end_of_word_token (str): A token denoting word boundary
         """
         self._end_of_word_token = end_of_word_token
-        self._storage = {'_': 0}
+        self._storage = {end_of_word_token: 0}
 
     def _tokenize(self, text: str) -> Optional[tuple[str, ...]]:
         """
@@ -43,7 +43,7 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]:
         In case of corrupt input arguments, None is returned.
         In case any of methods used return None, None is returned.
         """
-        if not isinstance(text, str) or text == '':
+        if not isinstance(text, str) or not text:
             return None
 
         list_text = text.split()
@@ -264,6 +264,9 @@ def __init__(self, encoded_corpus: tuple | None, n_gram_size: int) -> None:
             encoded_corpus (tuple): Encoded text
             n_gram_size (int): A size of n-grams to use for language modelling
         """
+        self._n_gram_size = n_gram_size
+        self._n_gram_frequencies = {}
+        self._encoded_corpus = encoded_corpus
 
 
     def get_n_gram_size(self) -> int:
@@ -273,6 +276,7 @@ def get_n_gram_size(self) -> int:
         Returns:
             int: Size of stored n_grams
         """
+        return self._n_gram_size
 
     def set_n_grams(self, frequencies: dict) -> None:
         """
@@ -281,6 +285,10 @@ def set_n_grams(self, frequencies: dict) -> None:
         Args:
             frequencies (dict): Computed in advance frequencies for n-grams
         """
+        if not isinstance(frequencies, dict) or len(frequencies) == 0:
+            return None
+        self._n_gram_frequencies = frequencies
+        return None
 
     def build(self) -> int:
         """
@@ -294,6 +302,21 @@ def build(self) -> int:
         In case of corrupt input arguments or methods used return None,
         1 is returned
         """
+        if not isinstance(self._encoded_corpus, tuple) or len(self._encoded_corpus) == 0:
+            return 1
+
+        n_grams = self._extract_n_grams(self._encoded_corpus)
+        if not n_grams:
+            return 1
+
+        for n_gram in set(n_grams):
+            if not isinstance(n_gram, tuple):
+                return 1
+            absolute_frequency = n_grams.count(n_gram)
+            with_same_beginning = len([id for id in n_grams if
+                                       id[:-1] == n_gram[:-1]])
+            self._n_gram_frequencies[n_gram] = absolute_frequency / with_same_beginning
+        return 0
 
     def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]:
         """
@@ -307,6 +330,15 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]:
 
         In case of corrupt input arguments, None is returned
         """
+        if (not isinstance(sequence, tuple) or len(sequence) == 0
+                or len(sequence) >= self._n_gram_size - 1):
+            return None
+
+        tokens = {}
+        for ngram, freq in self._n_gram_frequencies.items():
+            if sequence[-self._n_gram_size + 1:] == ngram[:self._n_gram_size - 1]:
+                tokens[self._n_gram_size - 1] = freq
+        return tokens
 
     def _extract_n_grams(
         self, encoded_corpus: tuple[int, ...]
@@ -322,6 +354,15 @@ def _extract_n_grams(
 
         In case of corrupt input arguments, None is returned
         """
+        if not isinstance(encoded_corpus, tuple) or len(encoded_corpus) == 0:
+            return None
+
+        n_grams = []
+        list_encoded_corpus = list(encoded_corpus)
+        for index in range(len(encoded_corpus) + 1 - self._n_gram_size):
+            n_grams.append(tuple(list_encoded_corpus[index: index + self._n_gram_size]))
+
+        return tuple(n_grams)
 
 
 class GreedyTextGenerator:
@@ -341,6 +382,8 @@ def __init__(self, language_model: NGramLanguageModel, text_processor: TextProce
             language_model (NGramLanguageModel): A language model to use for text generation
             text_processor (TextProcessor): A TextProcessor instance to handle text processing
         """
+        self._model = language_model
+        self._text_processor = text_processor
 
     def run(self, seq_len: int, prompt: str) -> Optional[str]:
         """
@@ -356,6 +399,27 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]:
         In case of corrupt input arguments or methods used return None,
         None is returned
         """
+        if not isinstance(seq_len, int) or not isinstance(prompt, str) or len(prompt) == 0:
+            return None
+
+        n_gram_size = self._model.get_n_gram_size()
+        encoded = self._text_processor.encode(prompt)
+        if not encoded or not n_gram_size:
+            return None
+
+        while seq_len > 0:
+            candidates = self._model.generate_next_token(encoded[-n_gram_size + 1:])
+            if not candidates:
+                break
+            best_candidate = [letter for letter, freq in candidates.items() if freq == max(candidates.values())]
+            max_freq_letters = sorted(best_candidate)
+            encoded += (max_freq_letters[0])
+            seq_len -= 1
+        decoded_prompt = self._text_processor.decode(encoded)
+        if decoded_prompt is None:
+            return None
+        return decoded_prompt
+
 
 
 class BeamSearcher:

From b5a6ad703af8980de55f3e7f739150798e9e7da1 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 15 Nov 2023 13:26:19 +0300
Subject: [PATCH 53/81] score

---
 lab_3_generate_by_ngrams/target_score.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt
index b8626c4cf..1e8b31496 100644
--- a/lab_3_generate_by_ngrams/target_score.txt
+++ b/lab_3_generate_by_ngrams/target_score.txt
@@ -1 +1 @@
-4
+6

From 449f2ace1283a1b0eb3c463b20fb5cf8efbe810d Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 15 Nov 2023 16:44:22 +0300
Subject: [PATCH 54/81] added fixes

---
 lab_3_generate_by_ngrams/main.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index 5ba7f8431..99ba842da 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -43,19 +43,21 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]:
         In case of corrupt input arguments, None is returned.
         In case any of methods used return None, None is returned.
         """
-        if not isinstance(text, str) or not text:
+        if not isinstance(text, str):
             return None
 
-        list_text = text.split()
-        str_text = self._end_of_word_token.join(list_text)
-        new_str = ''
-        for token in str_text:
-            if token.isalpha() or token == '_':
-                new_str += token
-        new_str += '_'
-        if '__' in new_str:
-            copy_str = new_str.replace('__', '_')
-            return tuple([token.lower() for token in copy_str])
+        tokens = []
+        list_text = text.lower().split()
+        for element in list_text:
+            word = [token for token in element if token.isalpha()]
+            if word:
+                tokens.extend(word)
+                tokens.append(self._end_of_word_token)
+        if not tokens:
+            return None
+        if text[-1].isalnum():
+            tokens.pop()
+        return tuple(tokens)
 
     def get_id(self, element: str) -> Optional[int]:
         """

From 99153b2759e6bdc9f21ecfa0ba1dbf85c99c556e Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Fri, 17 Nov 2023 09:28:25 +0300
Subject: [PATCH 55/81] added fixes

---
 lab_3_generate_by_ngrams/main.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index 99ba842da..b43df6f59 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -290,7 +290,6 @@ def set_n_grams(self, frequencies: dict) -> None:
         if not isinstance(frequencies, dict) or len(frequencies) == 0:
             return None
         self._n_gram_frequencies = frequencies
-        return None
 
     def build(self) -> int:
         """
@@ -337,8 +336,8 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]:
             return None
 
         tokens = {}
-        for ngram, freq in self._n_gram_frequencies.items():
-            if sequence[-self._n_gram_size + 1:] == ngram[:self._n_gram_size - 1]:
+        for n_gram, freq in self._n_gram_frequencies.items():
+            if sequence[-self._n_gram_size + 1:] == n_gram[:self._n_gram_size - 1]:
                 tokens[self._n_gram_size - 1] = freq
         return tokens
 

From 30c7852af0ae85180146a398709523e0711d64c0 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 21 Nov 2023 09:26:03 +0300
Subject: [PATCH 56/81] added fixes

---
 lab_3_generate_by_ngrams/main.py | 55 ++++++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index b43df6f59..1585dc499 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -4,6 +4,7 @@
 Beam-search and natural language generation evaluation
 """
 # pylint:disable=too-few-public-methods
+import math
 from typing import Optional
 
 
@@ -101,8 +102,8 @@ def get_token(self, element_id: int) -> Optional[str]:
         if not isinstance(element_id, int) or element_id not in self._storage.values():
             return None
 
-        for key in self._storage:
-            if self._storage[key] == element_id:
+        for key, value in self._storage.items():
+            if value == element_id:
                 return key
 
     def encode(self, text: str) -> Optional[tuple[int, ...]]:
@@ -151,6 +152,7 @@ def _put(self, element: str) -> None:
             return None
         if element not in self._storage:
             self._storage[element] = len(self._storage)
+        return None
 
     def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]:
         """
@@ -331,14 +333,15 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]:
 
         In case of corrupt input arguments, None is returned
         """
-        if (not isinstance(sequence, tuple) or len(sequence) == 0
-                or len(sequence) >= self._n_gram_size - 1):
+        if (not isinstance(sequence, tuple) or not sequence
+                or len(sequence) < self._n_gram_size - 1):
             return None
 
+        context = sequence[-self._n_gram_size + 1:]
         tokens = {}
         for n_gram, freq in self._n_gram_frequencies.items():
-            if sequence[-self._n_gram_size + 1:] == n_gram[:self._n_gram_size - 1]:
-                tokens[self._n_gram_size - 1] = freq
+            if n_gram[:len(context)] == context:
+                tokens[n_gram[len(context)]] = freq
         return tokens
 
     def _extract_n_grams(
@@ -440,6 +443,8 @@ def __init__(self, beam_width: int, language_model: NGramLanguageModel) -> None:
             beam_width (int): Number of candidates to consider at each step
             language_model (NGramLanguageModel): A language model to use for next token prediction
         """
+        self._beam_width = beam_width
+        self._model = language_model
 
     def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, float]]]:
         """
@@ -460,6 +465,17 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int,
 
         In case of corrupt input arguments or methods used return None.
         """
+        if not isinstance(sequence, tuple) or len(sequence) == 0:
+            return None
+
+        tokens_dict = self._model.generate_next_token(sequence)
+        if tokens_dict is None:
+            return None
+        if not tokens_dict:
+            return []
+
+        return sorted([(token, float(probability)) for token, probability in tokens_dict.items()],
+                        key=lambda x: x[1], reverse=True)[:self._beam_width]
 
     def continue_sequence(
         self,
@@ -482,6 +498,28 @@ def continue_sequence(
 
         In case of corrupt input arguments or unexpected behaviour of methods used return None.
         """
+        if (not isinstance(sequence, tuple) or not isinstance(next_tokens, list) or
+                not isinstance(sequence_candidates, dict) or not sequence):
+            return None
+        if (not next_tokens or not sequence_candidates or
+                sequence not in sequence_candidates or
+                len(next_tokens) > self._beam_width):
+            return None
+
+        new_sequence_candidates = {}
+        for key, value in sequence_candidates.items():
+            if key != sequence:
+                probability = value
+                for token, token_probability in next_tokens:
+                    new_sequence = key + (token,)
+                    new_probability = probability + (-1) * math.log(token_probability)
+                    new_sequence_candidates[new_sequence] = new_probability
+
+        if len(new_sequence_candidates) > self._beam_width:
+            new_sequence_candidates = dict(
+                sorted(new_sequence_candidates.items(), key=lambda x: x[1], reverse=True)[:self._beam_width])
+
+        return new_sequence_candidates
 
     def prune_sequence_candidates(
         self, sequence_candidates: dict[tuple[int, ...], float]
@@ -497,6 +535,11 @@ def prune_sequence_candidates(
 
         In case of corrupt input arguments return None.
         """
+        if not isinstance(sequence_candidates, dict) or len(sequence_candidates) == 0:
+            return None
+
+        sorted_candidates = dict(sorted(sequence_candidates.items(), key=lambda x: (x[1], x[0]), reverse=True))
+        return dict(list(sorted_candidates.items())[:self._beam_width])
 
 
 class BeamSearchTextGenerator:

From 60617709ec3c1f6b155f0ac2c5a3304f54d96de3 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 22 Nov 2023 22:17:12 +0300
Subject: [PATCH 57/81] changes for 8

---
 lab_3_generate_by_ngrams/main.py | 53 +++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 7 deletions(-)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index 1585dc499..e9a8655cf 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -412,17 +412,14 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]:
             return None
 
         while seq_len > 0:
-            candidates = self._model.generate_next_token(encoded[-n_gram_size + 1:])
+            candidates = self._model.generate_next_token(encoded)
             if not candidates:
                 break
             best_candidate = [letter for letter, freq in candidates.items() if freq == max(candidates.values())]
             max_freq_letters = sorted(best_candidate)
-            encoded += (max_freq_letters[0])
+            encoded += (max_freq_letters[0],)
             seq_len -= 1
-        decoded_prompt = self._text_processor.decode(encoded)
-        if decoded_prompt is None:
-            return None
-        return decoded_prompt
+        return self._text_processor.decode(encoded)
 
 
 
@@ -475,7 +472,7 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int,
             return []
 
         return sorted([(token, float(probability)) for token, probability in tokens_dict.items()],
-                        key=lambda x: x[1], reverse=True)[:self._beam_width]
+                      key=lambda x: x[1], reverse=True)[:self._beam_width]
 
     def continue_sequence(
         self,
@@ -567,6 +564,10 @@ def __init__(
             text_processor (TextProcessor): A TextProcessor instance to handle text processing
             beam_width (int): Beam width parameter for generation
         """
+        self._language_model = language_model
+        self._text_processor = text_processor
+        self._beam_width = beam_width
+        self.beam_searcher = BeamSearcher(beam_width, language_model)
 
     def run(self, prompt: str, seq_len: int) -> Optional[str]:
         """
@@ -582,6 +583,37 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]:
         In case of corrupt input arguments or methods used return None,
         None is returned
         """
+        if (not isinstance(prompt, str) or not isinstance(seq_len, int)
+                or not prompt or not seq_len):
+            return None
+
+        encoded_prompt = self._text_processor.encode(prompt)
+        if encoded_prompt is None:
+            return None
+
+        candidates = {encoded_prompt: 0.0}
+        for i in range(seq_len):
+            new_sequence_candidates = dict(candidates)
+            for sequence in candidates:
+                next_tokens = self._get_next_token(sequence)
+                if not next_tokens:
+                    return None
+
+                continued_candidates = (self.beam_searcher.continue_sequence(
+                    sequence, next_tokens, new_sequence_candidates))
+                if not continued_candidates:
+                    break
+
+            best_sequence_candidates = self.beam_searcher.prune_sequence_candidates(
+                new_sequence_candidates)
+
+            if not best_sequence_candidates:
+                return None
+            sequence_candidates = best_sequence_candidates
+
+        decoded = self._text_processor.decode(min(candidates,
+                                                  key=lambda x: sequence_candidates[x]))
+        return decoded
 
     def _get_next_token(
         self, sequence_to_continue: tuple[int, ...]
@@ -598,6 +630,13 @@ def _get_next_token(
 
         In case of corrupt input arguments return None.
         """
+        if not isinstance(sequence_to_continue, tuple) or len(sequence_to_continue) == 0:
+            return None
+
+        next_tokens = self.beam_searcher.get_next_token(sequence_to_continue)
+        if next_tokens is None:
+            return None
+        return next_tokens
 
 
 class NGramLanguageModelReader:

From 45bcbe2c96f1499f6c1116743f098601077c7436 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 22 Nov 2023 22:19:03 +0300
Subject: [PATCH 58/81] mark 8

---
 lab_3_generate_by_ngrams/target_score.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt
index 1e8b31496..45a4fb75d 100644
--- a/lab_3_generate_by_ngrams/target_score.txt
+++ b/lab_3_generate_by_ngrams/target_score.txt
@@ -1 +1 @@
-6
+8

From 36860717fdbf24a95d09a27f053a750bf28b50df Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 22 Nov 2023 23:05:21 +0300
Subject: [PATCH 59/81] some changes

---
 lab_3_generate_by_ngrams/main.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index e9a8655cf..a27f35553 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -105,6 +105,7 @@ def get_token(self, element_id: int) -> Optional[str]:
         for key, value in self._storage.items():
             if value == element_id:
                 return key
+        return None
 
     def encode(self, text: str) -> Optional[tuple[int, ...]]:
         """
@@ -194,6 +195,7 @@ def fill_from_ngrams(self, content: dict) -> None:
             for element in key:
                 if element.isalpha():
                     self._put(element)
+        return None
 
     def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]:
         """
@@ -272,7 +274,6 @@ def __init__(self, encoded_corpus: tuple | None, n_gram_size: int) -> None:
         self._n_gram_frequencies = {}
         self._encoded_corpus = encoded_corpus
 
-
     def get_n_gram_size(self) -> int:
         """
         Retrieve value stored in self._n_gram_size attribute.
@@ -292,6 +293,7 @@ def set_n_grams(self, frequencies: dict) -> None:
         if not isinstance(frequencies, dict) or len(frequencies) == 0:
             return None
         self._n_gram_frequencies = frequencies
+        return None
 
     def build(self) -> int:
         """
@@ -415,14 +417,14 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]:
             candidates = self._model.generate_next_token(encoded)
             if not candidates:
                 break
-            best_candidate = [letter for letter, freq in candidates.items() if freq == max(candidates.values())]
+            best_candidate = ([letter for letter, freq in candidates.items()
+                               if freq == max(candidates.values())])
             max_freq_letters = sorted(best_candidate)
             encoded += (max_freq_letters[0],)
             seq_len -= 1
         return self._text_processor.decode(encoded)
 
 
-
 class BeamSearcher:
     """
     Beam Search algorithm for diverse text generation.
@@ -513,9 +515,8 @@ def continue_sequence(
                     new_sequence_candidates[new_sequence] = new_probability
 
         if len(new_sequence_candidates) > self._beam_width:
-            new_sequence_candidates = dict(
-                sorted(new_sequence_candidates.items(), key=lambda x: x[1], reverse=True)[:self._beam_width])
-
+            new_sequence_candidates = dict(sorted(new_sequence_candidates.items(),
+                                                  key=lambda x: x[1], reverse=True)[:self._beam_width])
         return new_sequence_candidates
 
     def prune_sequence_candidates(
@@ -535,7 +536,8 @@ def prune_sequence_candidates(
         if not isinstance(sequence_candidates, dict) or len(sequence_candidates) == 0:
             return None
 
-        sorted_candidates = dict(sorted(sequence_candidates.items(), key=lambda x: (x[1], x[0]), reverse=True))
+        sorted_candidates = dict(sorted(sequence_candidates.items(),
+                                        key=lambda x: (x[1], x[0]), reverse=True))
         return dict(list(sorted_candidates.items())[:self._beam_width])
 
 

From a9a62ee156343361d29b777274ede34f392c931d Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 22 Nov 2023 23:15:08 +0300
Subject: [PATCH 60/81] added fixes

---
 lab_3_generate_by_ngrams/main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index a27f35553..0536c6c89 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -641,6 +641,7 @@ def _get_next_token(
         return next_tokens
 
 
+
 class NGramLanguageModelReader:
     """
     Factory for loading language models ngrams from external JSON.

From 03d9c2f86ab5e39650ccd3f42677db9a253fa79f Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 22 Nov 2023 23:26:35 +0300
Subject: [PATCH 61/81] added fixes

---
 lab_3_generate_by_ngrams/main.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index 0536c6c89..767f77fd1 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -102,10 +102,8 @@ def get_token(self, element_id: int) -> Optional[str]:
         if not isinstance(element_id, int) or element_id not in self._storage.values():
             return None
 
-        for key, value in self._storage.items():
-            if value == element_id:
-                return key
-        return None
+        token = list(filter(lambda x: x[1] == element_id, self._storage.items()))
+        return token[0][0]
 
     def encode(self, text: str) -> Optional[tuple[int, ...]]:
         """
@@ -177,10 +175,11 @@ def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]:
         decoded_tokens = self._decode(encoded_corpus)
         if not decoded_tokens:
             return None
-        decoded_text = self._postprocess_decoded_text(decoded_tokens)
-        if not decoded_text:
+
+        if not self._postprocess_decoded_text(decoded_tokens):
             return None
-        return decoded_text
+
+        return self._postprocess_decoded_text(decoded_tokens)
 
     def fill_from_ngrams(self, content: dict) -> None:
         """
@@ -515,8 +514,8 @@ def continue_sequence(
                     new_sequence_candidates[new_sequence] = new_probability
 
         if len(new_sequence_candidates) > self._beam_width:
-            new_sequence_candidates = dict(sorted(new_sequence_candidates.items(),
-                                                  key=lambda x: x[1], reverse=True)[:self._beam_width])
+            new_sequence_candidates = dict(sorted(
+                new_sequence_candidates.items(), key=lambda x: x[1], reverse=True)[:self._beam_width])
         return new_sequence_candidates
 
     def prune_sequence_candidates(

From fd83d60fc7980338e72deacc1eabe3e9a99efffb Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 23 Nov 2023 19:29:01 +0300
Subject: [PATCH 62/81] added fixes

---
 lab_3_generate_by_ngrams/main.py | 56 +++++++++++++-------------------
 1 file changed, 22 insertions(+), 34 deletions(-)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index 767f77fd1..155e7d552 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -366,7 +366,6 @@ def _extract_n_grams(
         list_encoded_corpus = list(encoded_corpus)
         for index in range(len(encoded_corpus) + 1 - self._n_gram_size):
             n_grams.append(tuple(list_encoded_corpus[index: index + self._n_gram_size]))
-
         return tuple(n_grams)
 
 
@@ -496,27 +495,18 @@ def continue_sequence(
 
         In case of corrupt input arguments or unexpected behaviour of methods used return None.
         """
-        if (not isinstance(sequence, tuple) or not isinstance(next_tokens, list) or
-                not isinstance(sequence_candidates, dict) or not sequence):
-            return None
-        if (not next_tokens or not sequence_candidates or
-                sequence not in sequence_candidates or
-                len(next_tokens) > self._beam_width):
+        if not (isinstance(sequence, tuple) and isinstance(next_tokens, list)
+                and isinstance(sequence_candidates, dict) and sequence
+                and next_tokens and sequence_candidates and len(next_tokens) <= self._beam_width
+                and sequence in sequence_candidates):
             return None
 
-        new_sequence_candidates = {}
-        for key, value in sequence_candidates.items():
-            if key != sequence:
-                probability = value
-                for token, token_probability in next_tokens:
-                    new_sequence = key + (token,)
-                    new_probability = probability + (-1) * math.log(token_probability)
-                    new_sequence_candidates[new_sequence] = new_probability
-
-        if len(new_sequence_candidates) > self._beam_width:
-            new_sequence_candidates = dict(sorted(
-                new_sequence_candidates.items(), key=lambda x: x[1], reverse=True)[:self._beam_width])
-        return new_sequence_candidates
+        for token_tuple in next_tokens:
+            new_sequence = sequence + (token_tuple[0],)
+            new_freq = sequence_candidates[sequence] - math.log(token_tuple[1])
+            sequence_candidates[new_sequence] = new_freq
+        sequence_candidates.pop(sequence)
+        return sequence_candidates
 
     def prune_sequence_candidates(
         self, sequence_candidates: dict[tuple[int, ...], float]
@@ -532,12 +522,11 @@ def prune_sequence_candidates(
 
         In case of corrupt input arguments return None.
         """
-        if not isinstance(sequence_candidates, dict) or len(sequence_candidates) == 0:
+        if not isinstance(sequence_candidates, dict) or not sequence_candidates:
             return None
 
-        sorted_candidates = dict(sorted(sequence_candidates.items(),
-                                        key=lambda x: (x[1], x[0]), reverse=True))
-        return dict(list(sorted_candidates.items())[:self._beam_width])
+        sorted_candidates = sorted(sequence_candidates.items(), key=lambda x: (x[1], x[0]))
+        return dict(sorted_candidates[:self._beam_width])
 
 
 class BeamSearchTextGenerator:
@@ -601,20 +590,19 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]:
                     return None
 
                 continued_candidates = (self.beam_searcher.continue_sequence(
-                    sequence, next_tokens, new_sequence_candidates))
+                                      sequence, next_tokens, new_sequence_candidates))
                 if not continued_candidates:
                     break
 
-            best_sequence_candidates = self.beam_searcher.prune_sequence_candidates(
-                new_sequence_candidates)
+                best_sequence = self.beam_searcher.prune_sequence_candidates(
+                                      new_sequence_candidates)
 
-            if not best_sequence_candidates:
-                return None
-            sequence_candidates = best_sequence_candidates
-
-        decoded = self._text_processor.decode(min(candidates,
-                                                  key=lambda x: sequence_candidates[x]))
-        return decoded
+                if best_sequence is None:
+                    return None
+                candidates = best_sequence
+        best_candidate = sorted([candidate for candidate, probability in candidates.items() if
+                                 probability == min(candidates.values())])[0]
+        return self._text_processor.decode(best_candidate)
 
     def _get_next_token(
         self, sequence_to_continue: tuple[int, ...]

From e85ad1d0bd15b7b1d2172904b554e3cdde48b2e4 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 23 Nov 2023 20:17:31 +0300
Subject: [PATCH 63/81] added fixes

---
 lab_3_generate_by_ngrams/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index 155e7d552..39c986fe8 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -169,7 +169,7 @@ def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]:
         In case of corrupt input arguments, None is returned.
         In case any of methods used return None, None is returned.
         """
-        if not isinstance(encoded_corpus, tuple) or not encoded_corpus:
+        if not isinstance(encoded_corpus, tuple):
             return None
 
         decoded_tokens = self._decode(encoded_corpus)

From 8592c3bb0cc6c0dbdb3aceef7916edbfe1bed818 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 23 Nov 2023 20:29:22 +0300
Subject: [PATCH 64/81] start

---
 lab_3_generate_by_ngrams/start.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py
index 7eee1289d..187c185cb 100644
--- a/lab_3_generate_by_ngrams/start.py
+++ b/lab_3_generate_by_ngrams/start.py
@@ -1,7 +1,8 @@
 """
 Generation by NGrams starter
 """
-from lab_3_generate_by_ngrams.main import TextProcessor
+from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator,
+                                           NGramLanguageModel, TextProcessor)
 
 
 def main() -> None:
@@ -12,12 +13,25 @@ def main() -> None:
     """
     with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file:
         text = text_file.read()
-    corpus = TextProcessor('_')
-    encoded_text = corpus.encode(text)
-    result = corpus.decode(encoded_text)
-    print(encoded_text)
-    print(result)
-    assert result
+    processor = TextProcessor('_')
+    encoded = processor.encode(text)
+
+    if encoded:
+        result = processor.decode(encoded)
+
+        print(result)
+
+        model_for_build = NGramLanguageModel(encoded[:10], 2)
+        print(model_for_build.build())
+
+        model = NGramLanguageModel(encoded, 7)
+        greedy_text_generator = GreedyTextGenerator(model, processor)
+        print(greedy_text_generator.run(51, 'Vernon'))
+
+        beam_search_generator = BeamSearchTextGenerator(model, processor, 7)
+        print(beam_search_generator.run('Vernon', 56))
+
+        assert result
 
 
 if __name__ == "__main__":

From 9067d6d79170f8d484e920809040922fed55aa1d Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 23 Nov 2023 22:44:45 +0300
Subject: [PATCH 65/81] some changes

---
 lab_3_generate_by_ngrams/main.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index 39c986fe8..419764d27 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -50,7 +50,7 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]:
         tokens = []
         list_text = text.lower().split()
         for element in list_text:
-            word = [token for token in element if token.isalpha()]
+            word = list(filter(str.isalpha, element))
             if word:
                 tokens.extend(word)
                 tokens.append(self._end_of_word_token)
@@ -147,9 +147,7 @@ def _put(self, element: str) -> None:
         In case of corrupt input arguments or invalid argument length,
         an element is not added to storage
         """
-        if not isinstance(element, str) or len(element) != 1:
-            return None
-        if element not in self._storage:
+        if isinstance(element, str) and len(element) == 1 and element not in self._storage:
             self._storage[element] = len(self._storage)
         return None
 
@@ -190,10 +188,10 @@ def fill_from_ngrams(self, content: dict) -> None:
         """
         if not isinstance(content, dict) or not content:
             return None
-        for key in content['freq']:
-            for element in key:
-                if element.isalpha():
-                    self._put(element)
+        for n_gram in content['freq']:
+            for token in n_gram:
+                if token.isalpha():
+                    self._put(token)
         return None
 
     def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]:
@@ -243,12 +241,12 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional
         decoded_text = decoded_corpus[0].upper()
         for token in decoded_corpus[1:-1]:
             if token == self._end_of_word_token:
-                decoded_text += ' '
+                decoded_text = f'{decoded_text} '
             else:
                 decoded_text += token
         if decoded_corpus[-1] != self._end_of_word_token:
             decoded_text += decoded_corpus[-1]
-        return decoded_text + '.'
+        return f'{decoded_text}.'
 
 
 class NGramLanguageModel:
@@ -365,7 +363,8 @@ def _extract_n_grams(
         n_grams = []
         list_encoded_corpus = list(encoded_corpus)
         for index in range(len(encoded_corpus) + 1 - self._n_gram_size):
-            n_grams.append(tuple(list_encoded_corpus[index: index + self._n_gram_size]))
+            n_gram = tuple(list_encoded_corpus[index: index + self._n_gram_size])
+            n_grams.append(n_gram)
         return tuple(n_grams)
 
 
@@ -415,8 +414,8 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]:
             candidates = self._model.generate_next_token(encoded)
             if not candidates:
                 break
-            best_candidate = ([letter for letter, freq in candidates.items()
-                               if freq == max(candidates.values())])
+            max_freq = max(candidates.values())
+            best_candidate = list(filter(lambda x: candidates[x] == max_freq, candidates))
             max_freq_letters = sorted(best_candidate)
             encoded += (max_freq_letters[0],)
             seq_len -= 1

From 3262c60e019420e5bba8165a363c4be621d2d7aa Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Thu, 23 Nov 2023 23:04:51 +0300
Subject: [PATCH 66/81] changes

---
 lab_3_generate_by_ngrams/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py
index 419764d27..834b9379e 100644
--- a/lab_3_generate_by_ngrams/main.py
+++ b/lab_3_generate_by_ngrams/main.py
@@ -149,7 +149,6 @@ def _put(self, element: str) -> None:
         """
         if isinstance(element, str) and len(element) == 1 and element not in self._storage:
             self._storage[element] = len(self._storage)
-        return None
 
     def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]:
         """
@@ -415,7 +414,8 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]:
             if not candidates:
                 break
             max_freq = max(candidates.values())
-            best_candidate = list(filter(lambda x: candidates[x] == max_freq, candidates))
+            best_candidate = ([letter for letter, freq in candidates.items()
+                               if freq == max_freq])
             max_freq_letters = sorted(best_candidate)
             encoded += (max_freq_letters[0],)
             seq_len -= 1

From 034b0f87367cddedbe2696c382a8cb37e13af381 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 29 Nov 2023 10:28:41 +0300
Subject: [PATCH 67/81] second step

---
 lab_4_fill_words_by_ngrams/main.py | 31 ++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py
index b739ae182..8e061a918 100644
--- a/lab_4_fill_words_by_ngrams/main.py
+++ b/lab_4_fill_words_by_ngrams/main.py
@@ -28,6 +28,18 @@ def _tokenize(self, text: str) -> tuple[str, ...]:  # type: ignore
         Raises:
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
+        if not isinstance(text, str) or len(text) == 0:
+            raise ValueError
+        text_words = text.lower().split()
+
+        tokens = []
+        for word in text_words:
+            if word[-1] in '!?.':
+                tokens.extend([word[:len(word) - 1], self._end_of_word_token])
+            elif word.isalpha():
+                tokens.append(word)
+
+        return tuple(tokens)
 
     def _put(self, element: str) -> None:
         """
@@ -39,6 +51,13 @@ def _put(self, element: str) -> None:
         Raises:
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
+        if not isinstance(element, str) or len(element) == 0:
+            raise ValueError
+
+        if element not in self._storage:
+            self._storage[element] = len(self._storage)
+
+        return None
 
     def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str:  # type: ignore
         """
@@ -56,6 +75,18 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str:  #
         Raises:
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
+        if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0:
+            raise ValueError
+
+        words_list = list(decoded_corpus)
+        sentences = (' '.join(words_list)).split('<eos>')
+        decoded_text = ''
+        for i, sentence in enumerate(sentences):
+            sentence = sentence.strip().capitalize()
+            decoded_text += f'{sentence}. '
+        if decoded_corpus[-1] == '<eos>':
+            return decoded_text[:len(decoded_text) - 2].strip()
+        return decoded_text.strip()
 
 
 class TopPGenerator:

From f3cd6f8af42e9227806840327221ea2d65ab23dd Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 6 Dec 2023 21:34:24 +0300
Subject: [PATCH 68/81] changes for 6

---
 lab_4_fill_words_by_ngrams/main.py | 38 ++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py
index 8e061a918..8fdbd38d1 100644
--- a/lab_4_fill_words_by_ngrams/main.py
+++ b/lab_4_fill_words_by_ngrams/main.py
@@ -4,6 +4,8 @@
 Top-p sampling generation and filling gaps with ngrams
 """
 # pylint:disable=too-few-public-methods, too-many-arguments
+import random
+
 from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator,
                                            NGramLanguageModel, TextProcessor)
 
@@ -79,12 +81,12 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str:  #
             raise ValueError
 
         words_list = list(decoded_corpus)
-        sentences = (' '.join(words_list)).split('<eos>')
+        sentences = (' '.join(words_list)).split(self._end_of_word_token)
         decoded_text = ''
         for i, sentence in enumerate(sentences):
             sentence = sentence.strip().capitalize()
             decoded_text += f'{sentence}. '
-        if decoded_corpus[-1] == '<eos>':
+        if decoded_corpus[-1] == self._end_of_word_token:
             return decoded_text[:len(decoded_text) - 2].strip()
         return decoded_text.strip()
 
@@ -111,6 +113,9 @@ def __init__(
             word_processor (WordProcessor): WordProcessor instance to handle text processing
             p_value (float): Collective probability mass threshold
         """
+        self._model = language_model
+        self._word_processor = word_processor
+        self._p_value = p_value
 
     def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
         """
@@ -129,6 +134,35 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
                 or if sequence has inappropriate length,
                 or if methods used return None.
         """
+        if not (isinstance(seq_len, int) and seq_len > 0
+                and isinstance(prompt, str) and prompt):
+            raise ValueError
+        encoded = self._word_processor.encode(prompt)
+        if not encoded:
+            raise ValueError
+
+        for i in range(seq_len):
+            next_tokens = self._model.generate_next_token(encoded)
+            if next_tokens is None:
+                raise ValueError
+            if not next_tokens:
+                break
+
+            sorted_dict = dict(sorted(list(next_tokens.items()), key=lambda x: (x[1], x[0]), reverse=True))
+            probability = 0
+            possible_tokens = ()
+            for word, value in sorted_dict.items():
+                probability += value
+                possible_tokens += (word,)
+                if probability >= self._p_value:
+                    break
+            encoded += (random.choice(possible_tokens),)
+
+        decoded = self._word_processor.decode(encoded)
+        if not decoded:
+            raise ValueError
+
+        return decoded
 
 
 class GeneratorTypes:

From 3e3f063a170558e22463ee977e31dc366a1e5240 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 6 Dec 2023 21:36:04 +0300
Subject: [PATCH 69/81] score 6

---
 lab_4_fill_words_by_ngrams/target_score.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_4_fill_words_by_ngrams/target_score.txt b/lab_4_fill_words_by_ngrams/target_score.txt
index 573541ac9..1e8b31496 100644
--- a/lab_4_fill_words_by_ngrams/target_score.txt
+++ b/lab_4_fill_words_by_ngrams/target_score.txt
@@ -1 +1 @@
-0
+6

From bc5e5536340c3fab9e4b47a4d8628db9b24a9794 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Wed, 6 Dec 2023 21:41:32 +0300
Subject: [PATCH 70/81] start

---
 lab_4_fill_words_by_ngrams/start.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py
index c41386377..cc77770a4 100644
--- a/lab_4_fill_words_by_ngrams/start.py
+++ b/lab_4_fill_words_by_ngrams/start.py
@@ -2,6 +2,7 @@
 Filling word by ngrams starter
 """
 # pylint:disable=too-many-locals,unused-import
+from lab_4_fill_words_by_ngrams.main import NGramLanguageModel, TopPGenerator, WordProcessor
 
 
 def main() -> None:
@@ -10,7 +11,13 @@ def main() -> None:
     """
     with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file:
         text = text_file.read()
-    result = None
+    word_processor = WordProcessor('<eow>')
+    encoded_text = word_processor.encode(text)
+    model = NGramLanguageModel(encoded_text, 2)
+    model.build()
+    top_p = TopPGenerator(model, word_processor, 0.5)
+    result = top_p.run(51, 'Vernon')
+    print(result)
     assert result
 
 

From 5098f1a9613c70d00d0e26a3b2f9b094f1b7f55a Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 12 Dec 2023 10:28:36 +0300
Subject: [PATCH 71/81] changes for 8

---
 lab_4_fill_words_by_ngrams/main.py | 88 +++++++++++++++++++++++++++---
 1 file changed, 81 insertions(+), 7 deletions(-)

diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py
index 8fdbd38d1..552308fa6 100644
--- a/lab_4_fill_words_by_ngrams/main.py
+++ b/lab_4_fill_words_by_ngrams/main.py
@@ -5,6 +5,8 @@
 """
 # pylint:disable=too-few-public-methods, too-many-arguments
 import random
+import math
+import json
 
 from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator,
                                            NGramLanguageModel, TextProcessor)
@@ -30,16 +32,17 @@ def _tokenize(self, text: str) -> tuple[str, ...]:  # type: ignore
         Raises:
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
-        if not isinstance(text, str) or len(text) == 0:
+        if not isinstance(text, str) or not text:
             raise ValueError
-        text_words = text.lower().split()
 
         tokens = []
-        for word in text_words:
+        for word in text.lower().split():
             if word[-1] in '!?.':
                 tokens.extend([word[:len(word) - 1], self._end_of_word_token])
-            elif word.isalpha():
-                tokens.append(word)
+            else:
+                cleaned_word = [letter for letter in word if letter.isalpha()]
+                if cleaned_word:
+                    tokens.append(''.join(cleaned_word))
 
         return tuple(tokens)
 
@@ -134,8 +137,8 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
                 or if sequence has inappropriate length,
                 or if methods used return None.
         """
-        if not (isinstance(seq_len, int) and seq_len > 0
-                and isinstance(prompt, str) and prompt):
+        if (not isinstance(seq_len, int) or not isinstance(prompt, str)
+                or seq_len <= 0):
             raise ValueError
         encoded = self._word_processor.encode(prompt)
         if not encoded:
@@ -179,6 +182,9 @@ def __init__(self) -> None:
         """
         Initialize an instance of GeneratorTypes.
         """
+        self.greedy = 0
+        self.top_p = 1
+        self.beam_search = 2
 
     def get_conversion_generator_type(self, generator_type: int) -> str:  # type: ignore
         """
@@ -190,6 +196,8 @@ def get_conversion_generator_type(self, generator_type: int) -> str:  # type: ig
         Returns:
             (str): Name of the generator.
         """
+        generators = ['Greedy Generator', 'Top-P Generator', 'Beam Search Generator']
+        return generators[generator_type]
 
 
 class GenerationResultDTO:
@@ -212,6 +220,9 @@ def __init__(self, text: str, perplexity: float, generation_type: int):
             generation_type (int):
                 Numeric type of the generator for which perplexity was calculated
         """
+        self.__text = text
+        self.__perplexity = perplexity
+        self.__type = generation_type
 
     def get_perplexity(self) -> float:  # type: ignore
         """
@@ -220,6 +231,7 @@ def get_perplexity(self) -> float:  # type: ignore
         Returns:
             (float): Perplexity value
         """
+        return self.__perplexity
 
     def get_text(self) -> str:  # type: ignore
         """
@@ -228,6 +240,7 @@ def get_text(self) -> str:  # type: ignore
         Returns:
             (str): Text for which the perplexity was count
         """
+        return self.__text
 
     def get_type(self) -> int:  # type: ignore
         """
@@ -236,6 +249,7 @@ def get_type(self) -> int:  # type: ignore
         Returns:
             (int): Numeric type of the generator
         """
+        return self.__type
 
     def __str__(self) -> str:  # type: ignore
         """
@@ -244,6 +258,9 @@ def __str__(self) -> str:  # type: ignore
         Returns:
             (str): String with report
         """
+        return (f'Perplexity score: {self.__perplexity}\n'
+                f'{GeneratorTypes().get_conversion_generator_type(self.__type)}\n'
+                f'Text: {self.__text}\n')
 
 
 class QualityChecker:
@@ -268,6 +285,9 @@ def __init__(
                 NGramLanguageModel instance to use for text generation
             word_processor (WordProcessor): WordProcessor instance to handle text processing
         """
+        self._generators = generators
+        self._language_model = language_model
+        self._word_processor = word_processor
 
     def _calculate_perplexity(self, generated_text: str) -> float:  # type: ignore
         """
@@ -285,6 +305,27 @@ def _calculate_perplexity(self, generated_text: str) -> float:  # type: ignore
                 or if methods used return None,
                 or if nothing was generated.
         """
+        if not isinstance(generated_text, str) or not generated_text:
+            raise ValueError
+
+        encoded = self._word_processor.encode(generated_text)
+        if not encoded:
+            raise ValueError
+
+        ngram_size = self._language_model.get_n_gram_size()
+        log_prob_sum = 0.0
+        for index in range(ngram_size - 1, len(encoded)):
+            context = tuple(encoded[index - ngram_size + 1: index])
+            next_tokens = self._language_model.generate_next_token(context)
+            if not next_tokens:
+                raise ValueError
+
+            prob = next_tokens.get(encoded[index])
+            if prob:
+                log_prob_sum += math.log(prob)
+        if not log_prob_sum:
+            raise ValueError
+        return math.exp(-log_prob_sum / (len(encoded) - ngram_size))
 
     def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]:  # type: ignore
         """
@@ -304,6 +345,20 @@ def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]:  # type:
                 or if sequence has inappropriate length,
                 or if methods used return None.
         """
+        if not isinstance(seq_len, int) or seq_len < 0 or not isinstance(prompt, str) or not prompt:
+            raise ValueError("Incorrect input")
+        results = []
+        for num_type, generator in self._generators.items():
+            text = generator.run(prompt=prompt, seq_len=seq_len)
+            if not text:
+                raise ValueError
+
+            perplexity = self._calculate_perplexity(text)
+            if not perplexity:
+                raise ValueError
+
+            results.append(GenerationResultDTO(text, perplexity, num_type))
+        return sorted(results, key=lambda item: (perplexity, num_type))
 
 
 class Examiner:
@@ -323,6 +378,8 @@ def __init__(self, json_path: str) -> None:
         Args:
             json_path (str): Local path to assets file
         """
+        self._json_path = json_path
+        self._questions_and_answers = self._load_from_json()
 
     def _load_from_json(self) -> dict[tuple[str, int], str]:  # type: ignore
         """
@@ -338,6 +395,15 @@ def _load_from_json(self) -> dict[tuple[str, int], str]:  # type: ignore
                 or if attribute _json_path has inappropriate extension,
                 or if inappropriate type loaded data.
         """
+        if (not isinstance(self._json_path, str) or not self._json_path
+                or self._json_path[-5:] != ".json"):
+            raise ValueError
+
+        with open(self._json_path, 'r', encoding='utf-8') as file:
+            question_and_answers = json.load(file)
+            if not isinstance(question_and_answers, list):
+                raise ValueError
+        return {(i['question'], i['location']): i['answer'] for i in question_and_answers}
 
     def provide_questions(self) -> list[tuple[str, int]]:  # type: ignore
         """
@@ -347,6 +413,7 @@ def provide_questions(self) -> list[tuple[str, int]]:  # type: ignore
             list[tuple[str, int]]:
                 List in the form of [(question, position of the word to be filled)]
         """
+        return list(self._questions_and_answers.keys())
 
     def assess_exam(self, answers: dict[str, str]) -> float:  # type: ignore
         """
@@ -361,6 +428,13 @@ def assess_exam(self, answers: dict[str, str]) -> float:  # type: ignore
         Raises:
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
+        if not isinstance(answers, dict) or not answers:
+            raise ValueError
+
+        right_answers = ([key for key in self._questions_and_answers.keys()
+                          if answers[key[0]] == self._questions_and_answers[key]])
+
+        return len(right_answers) / len(list(self._questions_and_answers.values()))
 
 
 class GeneratorRuleStudent:

From b6cff90f63c64d325e0a9713737f2995d5eea007 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 12 Dec 2023 10:31:17 +0300
Subject: [PATCH 72/81] mark 8

---
 lab_4_fill_words_by_ngrams/target_score.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_4_fill_words_by_ngrams/target_score.txt b/lab_4_fill_words_by_ngrams/target_score.txt
index 1e8b31496..45a4fb75d 100644
--- a/lab_4_fill_words_by_ngrams/target_score.txt
+++ b/lab_4_fill_words_by_ngrams/target_score.txt
@@ -1 +1 @@
-6
+8

From 5fc6613e21f71deeb413c44d50cfb6dd41677979 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 12 Dec 2023 12:43:16 +0300
Subject: [PATCH 73/81] corrections

---
 lab_4_fill_words_by_ngrams/main.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py
index 552308fa6..812d4b8f3 100644
--- a/lab_4_fill_words_by_ngrams/main.py
+++ b/lab_4_fill_words_by_ngrams/main.py
@@ -4,9 +4,9 @@
 Top-p sampling generation and filling gaps with ngrams
 """
 # pylint:disable=too-few-public-methods, too-many-arguments
-import random
-import math
 import json
+import math
+import random
 
 from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator,
                                            NGramLanguageModel, TextProcessor)
@@ -33,7 +33,7 @@ def _tokenize(self, text: str) -> tuple[str, ...]:  # type: ignore
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
         if not isinstance(text, str) or not text:
-            raise ValueError
+            raise ValueError('WordProcessor._tokenize: Incorrect input')
 
         tokens = []
         for word in text.lower().split():
@@ -57,13 +57,11 @@ def _put(self, element: str) -> None:
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
         if not isinstance(element, str) or len(element) == 0:
-            raise ValueError
+            raise ValueError('WordProcessor._put: Incorrect input')
 
         if element not in self._storage:
             self._storage[element] = len(self._storage)
 
-        return None
-
     def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str:  # type: ignore
         """
         Convert decoded sentence into the string sequence.
@@ -81,7 +79,7 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str:  #
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
         if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0:
-            raise ValueError
+            raise ValueError('WordProcessor._postprocess_decoded_text: Incorrect input')
 
         words_list = list(decoded_corpus)
         sentences = (' '.join(words_list)).split(self._end_of_word_token)
@@ -139,7 +137,7 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
         """
         if (not isinstance(seq_len, int) or not isinstance(prompt, str)
                 or seq_len <= 0):
-            raise ValueError
+            raise ValueError("TopPGenerator.run: Incorrect input")
         encoded = self._word_processor.encode(prompt)
         if not encoded:
             raise ValueError
@@ -151,7 +149,8 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
             if not next_tokens:
                 break
 
-            sorted_dict = dict(sorted(list(next_tokens.items()), key=lambda x: (x[1], x[0]), reverse=True))
+            sorted_dict = dict(sorted(list(next_tokens.items()),
+                                      key=lambda x: (x[1], x[0]), reverse=True))
             probability = 0
             possible_tokens = ()
             for word, value in sorted_dict.items():
@@ -306,7 +305,7 @@ def _calculate_perplexity(self, generated_text: str) -> float:  # type: ignore
                 or if nothing was generated.
         """
         if not isinstance(generated_text, str) or not generated_text:
-            raise ValueError
+            raise ValueError("QualityChecker._calculate_perplexity: Incorrect input")
 
         encoded = self._word_processor.encode(generated_text)
         if not encoded:
@@ -346,7 +345,8 @@ def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]:  # type:
                 or if methods used return None.
         """
         if not isinstance(seq_len, int) or seq_len < 0 or not isinstance(prompt, str) or not prompt:
-            raise ValueError("Incorrect input")
+            raise ValueError("QualityChecker.run: Incorrect input")
+
         results = []
         for num_type, generator in self._generators.items():
             text = generator.run(prompt=prompt, seq_len=seq_len)
@@ -397,7 +397,7 @@ def _load_from_json(self) -> dict[tuple[str, int], str]:  # type: ignore
         """
         if (not isinstance(self._json_path, str) or not self._json_path
                 or self._json_path[-5:] != ".json"):
-            raise ValueError
+            raise ValueError("Examiner._load_from_json: Incorrect input")
 
         with open(self._json_path, 'r', encoding='utf-8') as file:
             question_and_answers = json.load(file)
@@ -429,7 +429,7 @@ def assess_exam(self, answers: dict[str, str]) -> float:  # type: ignore
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
         if not isinstance(answers, dict) or not answers:
-            raise ValueError
+            raise ValueError("Examiner._load_from_json: Incorrect input")
 
         right_answers = ([key for key in self._questions_and_answers.keys()
                           if answers[key[0]] == self._questions_and_answers[key]])

From ce220580b078118bcadf09321cd124322b15eaef Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 12 Dec 2023 14:15:53 +0300
Subject: [PATCH 74/81] corrections

---
 lab_4_fill_words_by_ngrams/main.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py
index 812d4b8f3..1a205279d 100644
--- a/lab_4_fill_words_by_ngrams/main.py
+++ b/lab_4_fill_words_by_ngrams/main.py
@@ -140,7 +140,7 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
             raise ValueError("TopPGenerator.run: Incorrect input")
         encoded = self._word_processor.encode(prompt)
         if not encoded:
-            raise ValueError
+            raise ValueError("TopPGenerator.run: Encoded is None")
 
         for i in range(seq_len):
             next_tokens = self._model.generate_next_token(encoded)
@@ -152,7 +152,7 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
             sorted_dict = dict(sorted(list(next_tokens.items()),
                                       key=lambda x: (x[1], x[0]), reverse=True))
             probability = 0
-            possible_tokens = ()
+            possible_tokens = tuple()
             for word, value in sorted_dict.items():
                 probability += value
                 possible_tokens += (word,)
@@ -309,7 +309,7 @@ def _calculate_perplexity(self, generated_text: str) -> float:  # type: ignore
 
         encoded = self._word_processor.encode(generated_text)
         if not encoded:
-            raise ValueError
+            raise ValueError("QualityChecker._calculate_perplexity: Encoded is None")
 
         ngram_size = self._language_model.get_n_gram_size()
         log_prob_sum = 0.0
@@ -317,13 +317,13 @@ def _calculate_perplexity(self, generated_text: str) -> float:  # type: ignore
             context = tuple(encoded[index - ngram_size + 1: index])
             next_tokens = self._language_model.generate_next_token(context)
             if not next_tokens:
-                raise ValueError
+                raise ValueError("QualityChecker._calculate_perplexity: Next_tokens is None")
 
             prob = next_tokens.get(encoded[index])
             if prob:
                 log_prob_sum += math.log(prob)
         if not log_prob_sum:
-            raise ValueError
+            raise ValueError("QualityChecker._calculate_perplexity: Log_prob_sum is None")
         return math.exp(-log_prob_sum / (len(encoded) - ngram_size))
 
     def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]:  # type: ignore
@@ -351,11 +351,11 @@ def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]:  # type:
         for num_type, generator in self._generators.items():
             text = generator.run(prompt=prompt, seq_len=seq_len)
             if not text:
-                raise ValueError
+                raise ValueError("QualityChecker.run: Text is None")
 
             perplexity = self._calculate_perplexity(text)
             if not perplexity:
-                raise ValueError
+                raise ValueError("QualityChecker.run: Perplexity is None")
 
             results.append(GenerationResultDTO(text, perplexity, num_type))
         return sorted(results, key=lambda item: (perplexity, num_type))
@@ -402,7 +402,7 @@ def _load_from_json(self) -> dict[tuple[str, int], str]:  # type: ignore
         with open(self._json_path, 'r', encoding='utf-8') as file:
             question_and_answers = json.load(file)
             if not isinstance(question_and_answers, list):
-                raise ValueError
+                raise ValueError("Examiner._load_from_json: Question_and_answers is None")
         return {(i['question'], i['location']): i['answer'] for i in question_and_answers}
 
     def provide_questions(self) -> list[tuple[str, int]]:  # type: ignore

From f96b2ad9233ec4e4aba86c2b318a888bff31aa9f Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Tue, 12 Dec 2023 14:28:30 +0300
Subject: [PATCH 75/81] corrections

---
 lab_4_fill_words_by_ngrams/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py
index 1a205279d..3f50c33c5 100644
--- a/lab_4_fill_words_by_ngrams/main.py
+++ b/lab_4_fill_words_by_ngrams/main.py
@@ -149,7 +149,7 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
             if not next_tokens:
                 break
 
-            sorted_dict = dict(sorted(list(next_tokens.items()),
+            sorted_dict = dict(sorted(next_tokens.items(),
                                       key=lambda x: (x[1], x[0]), reverse=True))
             probability = 0
             possible_tokens = tuple()

From 67eb3173c5fbbaba7edd7eccd7db0a0d32e48025 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Sat, 16 Dec 2023 23:37:04 +0300
Subject: [PATCH 76/81] changes for 10

---
 lab_4_fill_words_by_ngrams/main.py | 62 +++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 18 deletions(-)

diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py
index 3f50c33c5..daf4db8e8 100644
--- a/lab_4_fill_words_by_ngrams/main.py
+++ b/lab_4_fill_words_by_ngrams/main.py
@@ -33,11 +33,12 @@ def _tokenize(self, text: str) -> tuple[str, ...]:  # type: ignore
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
         if not isinstance(text, str) or not text:
-            raise ValueError('WordProcessor._tokenize: Incorrect input')
+            raise ValueError('Incorrect input')
 
         tokens = []
+        punctuation = '!?.'
         for word in text.lower().split():
-            if word[-1] in '!?.':
+            if word[-1] in punctuation:
                 tokens.extend([word[:len(word) - 1], self._end_of_word_token])
             else:
                 cleaned_word = [letter for letter in word if letter.isalpha()]
@@ -57,7 +58,7 @@ def _put(self, element: str) -> None:
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
         if not isinstance(element, str) or len(element) == 0:
-            raise ValueError('WordProcessor._put: Incorrect input')
+            raise ValueError('Incorrect input')
 
         if element not in self._storage:
             self._storage[element] = len(self._storage)
@@ -79,7 +80,7 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str:  #
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
         if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0:
-            raise ValueError('WordProcessor._postprocess_decoded_text: Incorrect input')
+            raise ValueError('Incorrect input')
 
         words_list = list(decoded_corpus)
         sentences = (' '.join(words_list)).split(self._end_of_word_token)
@@ -137,10 +138,10 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
         """
         if (not isinstance(seq_len, int) or not isinstance(prompt, str)
                 or seq_len <= 0):
-            raise ValueError("TopPGenerator.run: Incorrect input")
+            raise ValueError("Incorrect input")
         encoded = self._word_processor.encode(prompt)
         if not encoded:
-            raise ValueError("TopPGenerator.run: Encoded is None")
+            raise ValueError("Encoded is None")
 
         for i in range(seq_len):
             next_tokens = self._model.generate_next_token(encoded)
@@ -195,8 +196,12 @@ def get_conversion_generator_type(self, generator_type: int) -> str:  # type: ig
         Returns:
             (str): Name of the generator.
         """
-        generators = ['Greedy Generator', 'Top-P Generator', 'Beam Search Generator']
-        return generators[generator_type]
+        types = {
+            self.greedy: 'Greedy Generator',
+            self.top_p: 'Top-P Generator',
+            self.beam_search: 'Beam Search Generator'
+        }
+        return types[generator_type]
 
 
 class GenerationResultDTO:
@@ -305,11 +310,11 @@ def _calculate_perplexity(self, generated_text: str) -> float:  # type: ignore
                 or if nothing was generated.
         """
         if not isinstance(generated_text, str) or not generated_text:
-            raise ValueError("QualityChecker._calculate_perplexity: Incorrect input")
+            raise ValueError('Incorrect input')
 
         encoded = self._word_processor.encode(generated_text)
         if not encoded:
-            raise ValueError("QualityChecker._calculate_perplexity: Encoded is None")
+            raise ValueError('Encoded is None')
 
         ngram_size = self._language_model.get_n_gram_size()
         log_prob_sum = 0.0
@@ -317,13 +322,13 @@ def _calculate_perplexity(self, generated_text: str) -> float:  # type: ignore
             context = tuple(encoded[index - ngram_size + 1: index])
             next_tokens = self._language_model.generate_next_token(context)
             if not next_tokens:
-                raise ValueError("QualityChecker._calculate_perplexity: Next_tokens is None")
+                raise ValueError('Next_tokens is None')
 
             prob = next_tokens.get(encoded[index])
             if prob:
                 log_prob_sum += math.log(prob)
         if not log_prob_sum:
-            raise ValueError("QualityChecker._calculate_perplexity: Log_prob_sum is None")
+            raise ValueError('Log_prob_sum is None')
         return math.exp(-log_prob_sum / (len(encoded) - ngram_size))
 
     def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]:  # type: ignore
@@ -345,17 +350,17 @@ def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]:  # type:
                 or if methods used return None.
         """
         if not isinstance(seq_len, int) or seq_len < 0 or not isinstance(prompt, str) or not prompt:
-            raise ValueError("QualityChecker.run: Incorrect input")
+            raise ValueError('Incorrect input')
 
         results = []
         for num_type, generator in self._generators.items():
             text = generator.run(prompt=prompt, seq_len=seq_len)
             if not text:
-                raise ValueError("QualityChecker.run: Text is None")
+                raise ValueError('Text is None')
 
             perplexity = self._calculate_perplexity(text)
             if not perplexity:
-                raise ValueError("QualityChecker.run: Perplexity is None")
+                raise ValueError('Perplexity is None')
 
             results.append(GenerationResultDTO(text, perplexity, num_type))
         return sorted(results, key=lambda item: (perplexity, num_type))
@@ -397,12 +402,12 @@ def _load_from_json(self) -> dict[tuple[str, int], str]:  # type: ignore
         """
         if (not isinstance(self._json_path, str) or not self._json_path
                 or self._json_path[-5:] != ".json"):
-            raise ValueError("Examiner._load_from_json: Incorrect input")
+            raise ValueError('Incorrect input')
 
         with open(self._json_path, 'r', encoding='utf-8') as file:
             question_and_answers = json.load(file)
             if not isinstance(question_and_answers, list):
-                raise ValueError("Examiner._load_from_json: Question_and_answers is None")
+                raise ValueError('Question_and_answers is None')
         return {(i['question'], i['location']): i['answer'] for i in question_and_answers}
 
     def provide_questions(self) -> list[tuple[str, int]]:  # type: ignore
@@ -429,7 +434,7 @@ def assess_exam(self, answers: dict[str, str]) -> float:  # type: ignore
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
         if not isinstance(answers, dict) or not answers:
-            raise ValueError("Examiner._load_from_json: Incorrect input")
+            raise ValueError('Incorrect input')
 
         right_answers = ([key for key in self._questions_and_answers.keys()
                           if answers[key[0]] == self._questions_and_answers[key]])
@@ -457,6 +462,11 @@ def __init__(
                 NGramLanguageModel instance to use for text generation
             word_processor (WordProcessor): WordProcessor instance to handle text processing
         """
+        self._generator_type = generator_type
+        generators = (GreedyTextGenerator(language_model, word_processor),
+                      TopPGenerator(language_model, word_processor, 0.5),
+                      BeamSearchTextGenerator(language_model, word_processor, 5))
+        self._generator = generators[self._generator_type]
 
     def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]:  # type: ignore
         """
@@ -474,6 +484,20 @@ def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]:  # type: ig
                 or if input argument is empty,
                 or if methods used return None.
         """
+        if not isinstance(tasks, list) or not tasks:
+            raise ValueError('Incorrect input')
+
+        answers = {}
+        for (question, position) in tasks:
+            next_sequence = self._generator.run(seq_len=1, prompt=question[:position])
+            if not next_sequence:
+                raise ValueError
+
+            if next_sequence[-1] == '.':
+                next_sequence = next_sequence[:-1] + ' '
+            answers.update({question: next_sequence + question[position:]})
+
+        return answers
 
     def get_generator_type(self) -> str:  # type: ignore
         """
@@ -482,3 +506,5 @@ def get_generator_type(self) -> str:  # type: ignore
         Returns:
             str: Generator type
         """
+        generator = GeneratorTypes()
+        return generator.get_conversion_generator_type(self._generator_type)

From 8399714958ee3149284b4eb5860722d4d2ca7c25 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Sat, 16 Dec 2023 23:39:10 +0300
Subject: [PATCH 77/81] Raise lab 4 target score from 8 to 10

---
 lab_4_fill_words_by_ngrams/target_score.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_4_fill_words_by_ngrams/target_score.txt b/lab_4_fill_words_by_ngrams/target_score.txt
index 45a4fb75d..f599e28b8 100644
--- a/lab_4_fill_words_by_ngrams/target_score.txt
+++ b/lab_4_fill_words_by_ngrams/target_score.txt
@@ -1 +1 @@
-8
+10

From 165bf2677a625c0c05aa98b290dcf6b4401f0528 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Sun, 17 Dec 2023 11:23:31 +0300
Subject: [PATCH 78/81] Add quality check and exam demo to lab 4 start.py

---
 lab_4_fill_words_by_ngrams/start.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py
index cc77770a4..eb45aac37 100644
--- a/lab_4_fill_words_by_ngrams/start.py
+++ b/lab_4_fill_words_by_ngrams/start.py
@@ -2,7 +2,7 @@
 Filling word by ngrams starter
 """
 # pylint:disable=too-many-locals,unused-import
-from lab_4_fill_words_by_ngrams.main import NGramLanguageModel, TopPGenerator, WordProcessor
+import lab_4_fill_words_by_ngrams.main as main_py
 
 
 def main() -> None:
@@ -11,13 +11,30 @@ def main() -> None:
     """
     with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file:
         text = text_file.read()
-    word_processor = WordProcessor('<eow>')
+    word_processor = main_py.WordProcessor('<eow>')
     encoded_text = word_processor.encode(text)
-    model = NGramLanguageModel(encoded_text, 2)
+    model = main_py.NGramLanguageModel(encoded_text, 2)
     model.build()
-    top_p = TopPGenerator(model, word_processor, 0.5)
-    result = top_p.run(51, 'Vernon')
-    print(result)
+    top_p = main_py.TopPGenerator(model, word_processor, 0.5)
+    top_p_result = top_p.run(51, 'Vernon')
+    print(top_p_result)
+    generator_types = main_py.GeneratorTypes()
+    generators = {generator_types.top_p: main_py.TopPGenerator(model, word_processor, 0.5),
+                  generator_types.beam_search:
+                      main_py.BeamSearchTextGenerator(model, word_processor, 5)}
+    quality_check = main_py.QualityChecker(generators, model, word_processor)
+    quality_result = quality_check.run(100, 'The')
+    print(quality_result)
+    examiner = main_py.Examiner('./assets/question_and_answers.json')
+    questions = examiner.provide_questions()
+    students = [main_py.GeneratorRuleStudent(i, model, word_processor) for i in range(3)]
+    for student in students:
+        answers = student.take_exam(questions)
+        result = examiner.assess_exam(answers)
+        generator_type = student.get_generator_type()
+        print('Type of generator:', generator_type)
+        print('Answers:', ''.join(answers.values()))
+        print('Accuracy:', result)
     assert result
 
 

From b713d63d536e22f7cf820c89962769c3ac03e3c0 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Sun, 17 Dec 2023 12:08:26 +0300
Subject: [PATCH 79/81] Move generator name map to __init__, add take_exam error text

---
 lab_4_fill_words_by_ngrams/main.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py
index daf4db8e8..f98a347aa 100644
--- a/lab_4_fill_words_by_ngrams/main.py
+++ b/lab_4_fill_words_by_ngrams/main.py
@@ -185,6 +185,11 @@ def __init__(self) -> None:
         self.greedy = 0
         self.top_p = 1
         self.beam_search = 2
+        self._types = {
+            self.greedy: 'Greedy Generator',
+            self.top_p: 'Top-P Generator',
+            self.beam_search: 'Beam Search Generator'
+        }
 
     def get_conversion_generator_type(self, generator_type: int) -> str:  # type: ignore
         """
@@ -196,12 +201,7 @@ def get_conversion_generator_type(self, generator_type: int) -> str:  # type: ig
         Returns:
             (str): Name of the generator.
         """
-        types = {
-            self.greedy: 'Greedy Generator',
-            self.top_p: 'Top-P Generator',
-            self.beam_search: 'Beam Search Generator'
-        }
-        return types[generator_type]
+        return self._types[generator_type]
 
 
 class GenerationResultDTO:
@@ -491,7 +491,7 @@ def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]:  # type: ig
         for (question, position) in tasks:
             next_sequence = self._generator.run(seq_len=1, prompt=question[:position])
             if not next_sequence:
-                raise ValueError
+                raise ValueError('Next sequence is None')
 
             if next_sequence[-1] == '.':
                 next_sequence = next_sequence[:-1] + ' '

From 0e7f98f7e5ddb264160cbb2f079b8e203ff1901f Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Sun, 17 Dec 2023 12:10:12 +0300
Subject: [PATCH 80/81] Add message to ValueError for empty decoded text

---
 lab_4_fill_words_by_ngrams/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py
index f98a347aa..07763cca9 100644
--- a/lab_4_fill_words_by_ngrams/main.py
+++ b/lab_4_fill_words_by_ngrams/main.py
@@ -163,7 +163,7 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
 
         decoded = self._word_processor.decode(encoded)
         if not decoded:
-            raise ValueError
+            raise ValueError('Decoded is None')
 
         return decoded
 

From a48fe2620316e8ba0175fe2670cccbce8d4f0590 Mon Sep 17 00:00:00 2001
From: mmarina <m.marina.oc14@mail.ru>
Date: Mon, 18 Dec 2023 15:06:38 +0300
Subject: [PATCH 81/81] Use truthiness checks; add message for None next tokens

---
 lab_4_fill_words_by_ngrams/main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py
index 07763cca9..daa2d8d86 100644
--- a/lab_4_fill_words_by_ngrams/main.py
+++ b/lab_4_fill_words_by_ngrams/main.py
@@ -57,7 +57,7 @@ def _put(self, element: str) -> None:
         Raises:
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
-        if not isinstance(element, str) or len(element) == 0:
+        if not isinstance(element, str) or not element:
             raise ValueError('Incorrect input')
 
         if element not in self._storage:
@@ -79,7 +79,7 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str:  #
         Raises:
             ValueError: In case of inappropriate type input argument or if input argument is empty.
         """
-        if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0:
+        if not isinstance(decoded_corpus, tuple) or not decoded_corpus:
             raise ValueError('Incorrect input')
 
         words_list = list(decoded_corpus)
@@ -146,7 +146,7 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
         for i in range(seq_len):
             next_tokens = self._model.generate_next_token(encoded)
             if next_tokens is None:
-                raise ValueError
+                raise ValueError('Next tokens are None')
             if not next_tokens:
                 break