From c1d362bab23beebfbdf380372d8e05e51dd7bf60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 8 Sep 2023 18:56:35 +0300 Subject: [PATCH 01/68] my first commit --- main.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 main.py diff --git a/main.py b/main.py new file mode 100644 index 000000000..8ea4c734f --- /dev/null +++ b/main.py @@ -0,0 +1 @@ +print ('Hello world!') \ No newline at end of file From ac7d44be03634acd1bae3cd58c6008c800e54add Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 8 Sep 2023 18:59:49 +0300 Subject: [PATCH 02/68] my second commit --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 8ea4c734f..7175d0851 100644 --- a/main.py +++ b/main.py @@ -1 +1 @@ -print ('Hello world!') \ No newline at end of file +print ('Hello world!1') \ No newline at end of file From d4d0735fc90cd2d170dc5485b59dfc699a2e3791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sat, 9 Sep 2023 15:12:14 +0300 Subject: [PATCH 03/68] my second commit --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 7175d0851..1f8916570 100644 --- a/main.py +++ b/main.py @@ -1 +1 @@ -print ('Hello world!1') \ No newline at end of file +print ('Hello world!11') \ No newline at end of file From 7b609f5808b3ff82017a41253cafba9457715505 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 22 Sep 2023 11:05:35 +0300 Subject: [PATCH 04/68] file deleted --- main.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 main.py diff --git a/main.py b/main.py deleted file mode 100644 index 1f8916570..000000000 --- a/main.py +++ /dev/null @@ -1 +0,0 @@ -print ('Hello world!11') \ No newline at end of file From 833c00a32296383e2f644094e99aa9efe2e66078 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 26 Sep 2023 16:14:13 +0300 Subject: [PATCH 05/68] file deleted --- lab_1_classify_by_unigrams/main.py | 11 ++++++++++- lab_1_classify_by_unigrams/start.py | 6 ++++++ lab_1_classify_by_unigrams/target_score.txt | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 486b3d65c..113970407 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -11,7 +11,16 @@ def tokenize(text: str) -> list[str] | None: :param text: a text :return: a list of lower-cased tokens without punctuation """ - + if text is str == False: + return None + else: + list_of_tokens = "" + for token in text: + if token.isalpha(): + new_token = token.lower() + list_of_tokens += new_token + tokens = list(list_of_tokens) + return(tokens) def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index db7a1a904..ffb91de3d 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -2,6 +2,8 @@ Language detection starter """ +import lab_1_classify_by_unigrams.main + def main() -> None: """ @@ -9,10 +11,14 @@ def 
main() -> None: """ with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: en_text = file_to_read_en.read() + en_tokens = lab_1_classify_by_unigrams.main.tokenize(en_text) + print(en_tokens) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() + de_tokens = lab_1_classify_by_unigrams.main.tokenize(de_text) with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() + unknown_tokens = lab_1_classify_by_unigrams.main.tokenize(unknown_text) result = None assert result, "Detection result is None" diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt index 573541ac9..bf0d87ab1 100644 --- a/lab_1_classify_by_unigrams/target_score.txt +++ b/lab_1_classify_by_unigrams/target_score.txt @@ -1 +1 @@ -0 +4 \ No newline at end of file From 5d2bdbe300c6c8e52c8b88bd87a78b6eeff7d74f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 26 Sep 2023 17:35:31 +0300 Subject: [PATCH 06/68] file deleted --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 9b296ea64..87ff333ee 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -11,7 +11,7 @@ def tokenize(text: str) -> list[str] | None: :param text: a text :return: a list of lower-cased tokens without punctuation """ - if text is str == False: + if isinstance(text, str) == False: return None else: list_of_tokens = "" From 42feb0c0a60cfbf97e562276cb0b8bf7fe4788b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 28 Sep 2023 22:09:44 +0300 Subject: [PATCH 07/68] file deleted --- lab_1_classify_by_unigrams/main.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 87ff333ee..48d771c3c 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -13,14 +13,14 @@ def tokenize(text: str) -> list[str] | None: """ if isinstance(text, str) == False: return None - else: - list_of_tokens = "" - for token in text: - if token.isalpha(): - new_token = token.lower() - list_of_tokens += new_token - tokens = list(list_of_tokens) - return(tokens) + + list_of_tokens = "" + for token in text: + if token.isalpha(): + new_token = token.lower() + list_of_tokens += new_token + tokens = list(list_of_tokens) + return tokens def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ From 658e324e003ff299695cdb1fc20dffe0831dadb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 28 Sep 2023 22:18:21 +0300 Subject: [PATCH 08/68] file deleted --- lab_1_classify_by_unigrams/start.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index ffb91de3d..8fffde339 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -19,8 +19,8 @@ def main() -> None: with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = 
file_to_read_unk.read() unknown_tokens = lab_1_classify_by_unigrams.main.tokenize(unknown_text) - result = None - assert result, "Detection result is None" + #result = None + #assert result, "Detection result is None" if __name__ == "__main__": From 6e3a82b96178041eb0e7bcac45f37201d90a441d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 1 Oct 2023 19:16:46 +0300 Subject: [PATCH 09/68] calculated frequencies --- lab_1_classify_by_unigrams/main.py | 10 ++++++++++ lab_1_classify_by_unigrams/start.py | 2 ++ lab_1_classify_by_unigrams/target_score.txt | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 48d771c3c..a36d78581 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -28,6 +28,16 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: :param tokens: a list of tokens :return: a dictionary with frequencies """ + if not isinstance(tokens, list) or not all(isinstance(letter, str) for letter in tokens): + return None + list_of_tokens = "" + dict_of_frequencies = {} + for token in tokens: + if token not in list_of_tokens: + list_of_tokens += token + frequency = list_of_tokens.count(token)/len(tokens) + dict_of_frequencies[token] = frequency + return dict_of_frequencies def create_language_profile(language: str, text: str) -> dict[str, str | dict[str, float]] | None: diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 8fffde339..2a6c1a3ba 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -13,6 +13,8 @@ def main() -> None: en_text = file_to_read_en.read() en_tokens = lab_1_classify_by_unigrams.main.tokenize(en_text) print(en_tokens) + create_language_profile = lab_1_classify_by_unigrams.main.calculate_frequencies(en_tokens) + print(create_language_profile) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() de_tokens = lab_1_classify_by_unigrams.main.tokenize(de_text) diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt index bf0d87ab1..62f945751 100644 --- a/lab_1_classify_by_unigrams/target_score.txt +++ b/lab_1_classify_by_unigrams/target_score.txt @@ -1 +1 @@ -4 \ No newline at end of file +6 \ No newline at end of file From 58f3cd55c3a2d1959479f771dae40f90bed9578a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 1 Oct 2023 20:03:57 +0300 Subject: [PATCH 10/68] profiles created --- lab_1_classify_by_unigrams/main.py | 13 ++++++++++++- lab_1_classify_by_unigrams/start.py | 8 ++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index a36d78581..b8c03ef43 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -11,7 +11,7 @@ def tokenize(text: str) -> list[str] | None: :param text: a text :return: a list of lower-cased tokens without punctuation """ - if isinstance(text, str) == False: + if not isinstance(text, str): return None list_of_tokens = "" @@ -20,6 +20,7 @@ def tokenize(text: str) -> list[str] | None: new_token = token.lower() list_of_tokens += new_token tokens = list(list_of_tokens) + return tokens def 
calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: @@ -30,6 +31,7 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ if not isinstance(tokens, list) or not all(isinstance(letter, str) for letter in tokens): return None + list_of_tokens = "" dict_of_frequencies = {} for token in tokens: @@ -37,6 +39,7 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: list_of_tokens += token frequency = list_of_tokens.count(token)/len(tokens) dict_of_frequencies[token] = frequency + return dict_of_frequencies @@ -47,6 +50,14 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st :param text: a text :return: a dictionary with two keys – name, freq """ + if not isinstance(language, str) or not isinstance(text, str): + return None + + tokens = tokenize(text) + frequency = calculate_frequencies(tokens) + lang_profile = {'name': language, 'freq': frequency} + + return lang_profile def calculate_mse(predicted: list, actual: list) -> float | None: diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 2a6c1a3ba..2ea1ae4f7 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -13,14 +13,14 @@ def main() -> None: en_text = file_to_read_en.read() en_tokens = lab_1_classify_by_unigrams.main.tokenize(en_text) print(en_tokens) - create_language_profile = lab_1_classify_by_unigrams.main.calculate_frequencies(en_tokens) - print(create_language_profile) + language_profile = lab_1_classify_by_unigrams.main.create_language_profile('en', en_text) + print(language_profile) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() - de_tokens = lab_1_classify_by_unigrams.main.tokenize(de_text) + language_profile = lab_1_classify_by_unigrams.main.create_language_profile('de', de_text) with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() - unknown_tokens = lab_1_classify_by_unigrams.main.tokenize(unknown_text) + language_profile = lab_1_classify_by_unigrams.main.create_language_profile('unk', unknown_text) #result = None #assert result, "Detection result is None" From fc862ffbaac60d6a866b18a0c2dfc9bd298d4d19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 1 Oct 2023 22:11:10 +0300 Subject: [PATCH 11/68] calculate_frequencies fixed --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index b8c03ef43..4030dd084 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -37,7 +37,7 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: for token in tokens: if token not in list_of_tokens: list_of_tokens += token - frequency = list_of_tokens.count(token)/len(tokens) + frequency = tokens.count(token)/len(tokens) dict_of_frequencies[token] = frequency return dict_of_frequencies From b3a235928ea1ca87aa7529c9ecd99ccc12ca985b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Mon, 2 Oct 2023 23:34:58 +0300 Subject: [PATCH 12/68] mse calculated --- lab_1_classify_by_unigrams/main.py | 19 +++++++++++++++++++ lab_1_classify_by_unigrams/target_score.txt | 2 
+- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 4030dd084..47b97e1d4 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -67,6 +67,17 @@ def calculate_mse(predicted: list, actual: list) -> float | None: :param actual: a list of actual values :return: the score """ + if (len(predicted) != len(actual) or + not isinstance(predicted, list) or + not isinstance(actual, list)): + return None + + sum_mse = 0 + for i in range(len(actual)): + sum_mse += (actual[i]-predicted[i]) ** 2 + mse = sum_mse/len(actual) + + return mse def compare_profiles( @@ -79,6 +90,14 @@ def compare_profiles( :param profile_to_compare: a dictionary of a profile to compare the unknown profile to :return: the distance between the profiles """ + if (not isinstance(unknown_profile, dict) or + not isinstance(profile_to_compare, dict)): + return None + if ('name' not in unknown_profile or + 'freq' not in unknown_profile or + 'name' not in profile_to_compare or + 'freq' not in profile_to_compare): + return None def detect_language( diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt index 62f945751..301160a93 100644 --- a/lab_1_classify_by_unigrams/target_score.txt +++ b/lab_1_classify_by_unigrams/target_score.txt @@ -1 +1 @@ -6 \ No newline at end of file +8 \ No newline at end of file From c7fdd72747146e40ca9004de65dcb338f5e63423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Mon, 2 Oct 2023 23:44:50 +0300 Subject: [PATCH 13/68] corrected conditions in compare_profiles --- lab_1_classify_by_unigrams/main.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 47b97e1d4..b8dc1cef5 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -93,10 +93,8 @@ def compare_profiles( if (not isinstance(unknown_profile, dict) or not isinstance(profile_to_compare, dict)): return None - if ('name' not in unknown_profile or - 'freq' not in unknown_profile or - 'name' not in profile_to_compare or - 'freq' not in profile_to_compare): + if (('name' or 'freq') not in unknown_profile or + ('name' or 'freq') not in profile_to_compare): return None From 4e69cd793edb4c6ee2f3847ac34fa7d986781f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 3 Oct 2023 22:23:57 +0300 Subject: [PATCH 14/68] mentor's corrections fixed and some things changed --- lab_1_classify_by_unigrams/main.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index b8dc1cef5..d8011a4db 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -17,8 +17,7 @@ def tokenize(text: str) -> list[str] | None: list_of_tokens = "" for token in text: if token.isalpha(): - new_token = token.lower() - list_of_tokens += new_token + list_of_tokens += token.lower() tokens = list(list_of_tokens) return tokens @@ -29,7 +28,7 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: :param tokens: a list of tokens :return: a dictionary with frequencies """ - if not isinstance(tokens, list) or not all(isinstance(letter, str) for 
letter in tokens): + if not isinstance(tokens, list) or not all(isinstance(token, str) for token in tokens): return None list_of_tokens = "" @@ -53,9 +52,7 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st if not isinstance(language, str) or not isinstance(text, str): return None - tokens = tokenize(text) - frequency = calculate_frequencies(tokens) - lang_profile = {'name': language, 'freq': frequency} + lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} return lang_profile @@ -73,11 +70,10 @@ def calculate_mse(predicted: list, actual: list) -> float | None: return None sum_mse = 0 - for i in range(len(actual)): - sum_mse += (actual[i]-predicted[i]) ** 2 - mse = sum_mse/len(actual) + for i, act_value in enumerate(actual): + sum_mse += (act_value-predicted[i]) ** 2 - return mse + return sum_mse/len(actual) def compare_profiles( From 573c0875695e2fa5087ed6c1013aa2be408e734b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 3 Oct 2023 22:35:50 +0300 Subject: [PATCH 15/68] trying to fix tests --- lab_1_classify_by_unigrams/main.py | 7 ++----- lab_1_classify_by_unigrams/target_score.txt | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index d8011a4db..c66f96cff 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -18,9 +18,8 @@ def tokenize(text: str) -> list[str] | None: for token in text: if token.isalpha(): list_of_tokens += token.lower() - tokens = list(list_of_tokens) - return tokens + return list(list_of_tokens) def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ @@ -52,9 +51,7 @@ def create_language_profile(language: str, text: str) -> dict[str, str | dict[st if not isinstance(language, str) or not isinstance(text, str): return None - lang_profile = {'name': language, 'freq': calculate_frequencies(tokenize(text))} - - return lang_profile + return {'name': language, 'freq': calculate_frequencies(tokenize(text))} def calculate_mse(predicted: list, actual: list) -> float | None: diff --git a/lab_1_classify_by_unigrams/target_score.txt b/lab_1_classify_by_unigrams/target_score.txt index 301160a93..62f945751 100644 --- a/lab_1_classify_by_unigrams/target_score.txt +++ b/lab_1_classify_by_unigrams/target_score.txt @@ -1 +1 @@ -8 \ No newline at end of file +6 \ No newline at end of file From 953323bee557a0f505ab1badd9aa88bc7b5dbdfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 5 Oct 2023 18:16:00 +0300 Subject: [PATCH 16/68] tokenize simplified --- lab_1_classify_by_unigrams/main.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index c66f96cff..369cb658b 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -14,12 +14,9 @@ def tokenize(text: str) -> list[str] | None: if not isinstance(text, str): return None - list_of_tokens = "" - for token in text: - if token.isalpha(): - list_of_tokens += token.lower() + list_of_tokens = [token.lower() for token in text if token.isalpha()] - return list(list_of_tokens) + return list_of_tokens def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ From 
01ba88ea816547d0ceb79e96fc99de5aef1a0195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 5 Oct 2023 18:32:48 +0300 Subject: [PATCH 17/68] added spaces --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 369cb658b..96a9ef107 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -67,7 +67,7 @@ def calculate_mse(predicted: list, actual: list) -> float | None: for i, act_value in enumerate(actual): sum_mse += (act_value-predicted[i]) ** 2 - return sum_mse/len(actual) + return sum_mse / len(actual) def compare_profiles( From 965b5e0779525cc8e49d9682bfb06fe917d472ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 5 Oct 2023 18:35:10 +0300 Subject: [PATCH 18/68] added spaces --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 96a9ef107..369cb658b 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -67,7 +67,7 @@ def calculate_mse(predicted: list, actual: list) -> float | None: for i, act_value in enumerate(actual): sum_mse += (act_value-predicted[i]) ** 2 - return sum_mse / len(actual) + return sum_mse/len(actual) def compare_profiles( From 74636cd10489ffe55c2d2b6fdffc5e39b31da509 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 5 Oct 2023 18:35:24 +0300 Subject: [PATCH 19/68] added spaces --- lab_1_classify_by_unigrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 369cb658b..96a9ef107 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -67,7 +67,7 @@ def calculate_mse(predicted: list, actual: list) -> float | None: for i, act_value in enumerate(actual): sum_mse += (act_value-predicted[i]) ** 2 - return sum_mse/len(actual) + return sum_mse / len(actual) def compare_profiles( From 5a089b29d6d24a5cc961af76f167ad29dd49901c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 6 Oct 2023 12:23:27 +0300 Subject: [PATCH 20/68] FIXED CALCULATE_FREQUENCIES --- lab_1_classify_by_unigrams/main.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 96a9ef107..c4184b69c 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -27,11 +27,8 @@ def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: if not isinstance(tokens, list) or not all(isinstance(token, str) for token in tokens): return None - list_of_tokens = "" dict_of_frequencies = {} for token in tokens: - if token not in list_of_tokens: - list_of_tokens += token frequency = tokens.count(token)/len(tokens) dict_of_frequencies[token] = frequency From c049b1f78785d440846bc51da0b5ef51da55a620 Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Wed, 11 Oct 2023 11:01:19 +0300 Subject: [PATCH 21/68] checkout labs from the origin repository --- 
lab_1_classify_by_unigrams/main.py | 3 --- lab_1_classify_by_unigrams/start.py | 5 ----- 2 files changed, 8 deletions(-) diff --git a/lab_1_classify_by_unigrams/main.py b/lab_1_classify_by_unigrams/main.py index 20553c73b..a2d5744f9 100644 --- a/lab_1_classify_by_unigrams/main.py +++ b/lab_1_classify_by_unigrams/main.py @@ -17,9 +17,6 @@ def tokenize(text: str) -> list[str] | None: return [token.lower() for token in text if token.isalpha()] - list_of_tokens = [token.lower() for token in text if token.isalpha()] - - return list_of_tokens def calculate_frequencies(tokens: list[str] | None) -> dict[str, float] | None: """ diff --git a/lab_1_classify_by_unigrams/start.py b/lab_1_classify_by_unigrams/start.py index 70a5d5613..4a17442d0 100644 --- a/lab_1_classify_by_unigrams/start.py +++ b/lab_1_classify_by_unigrams/start.py @@ -12,13 +12,8 @@ def main() -> None: """ with open("assets/texts/en.txt", "r", encoding="utf-8") as file_to_read_en: en_text = file_to_read_en.read() - en_tokens = lab_1_classify_by_unigrams.main.tokenize(en_text) - print(en_tokens) - language_profile = lab_1_classify_by_unigrams.main.create_language_profile('en', en_text) - print(language_profile) with open("assets/texts/de.txt", "r", encoding="utf-8") as file_to_read_de: de_text = file_to_read_de.read() - language_profile = lab_1_classify_by_unigrams.main.create_language_profile('de', de_text) with open("assets/texts/unknown.txt", "r", encoding="utf-8") as file_to_read_unk: unknown_text = file_to_read_unk.read() From 16ed42b83f26e6f146b5cc6442918b76a52829ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 18 Oct 2023 15:18:55 +0300 Subject: [PATCH 22/68] code for 4 --- lab_2_tokenize_by_bpe/main.py | 28 +++++++++++++++++++++++++- lab_2_tokenize_by_bpe/start.py | 5 +++-- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 620a4d645..8068fce6e 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -14,7 +14,18 @@ def prepare_word( :param end_of_word: a token that signifies the end of word :return: preprocessed word """ - + if (not isinstance(raw_word, str) or + not (isinstance(start_of_word, str) or start_of_word is None) or + not (isinstance(end_of_word, str) or end_of_word is None)): + return None + + if not start_of_word and not end_of_word: + return tuple(list(raw_word)) + if not end_of_word: + return tuple([start_of_word] + list(raw_word)) + if not start_of_word: + return tuple(list(raw_word) + [end_of_word]) + return tuple([start_of_word] + list(raw_word) + [end_of_word]) def collect_frequencies( text: str, start_of_word: str | None, end_of_word: str @@ -26,6 +37,21 @@ def collect_frequencies( :param end_of_word: a token that signifies the end of word :return: dictionary in the form of """ + if (not isinstance(text, str) or + not (isinstance(start_of_word, str) or start_of_word is None) or + not isinstance(end_of_word, str)): + return None + + dict_of_freq = {} + words = text.split() + for word in words: + prepr_word = prepare_word(word, None, '') + if prepr_word is None: + return None + if prepr_word not in dict_of_freq: + dict_of_freq[prepr_word] = words.count(word) + + return dict_of_freq def count_tokens_pairs( diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 798e957e0..04df67fb7 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ 
b/lab_2_tokenize_by_bpe/start.py @@ -2,6 +2,7 @@ BPE Tokenizer starter """ from pathlib import Path +import lab_2_tokenize_by_bpe.main def main() -> None: @@ -12,8 +13,8 @@ def main() -> None: with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - result = None - assert result, "Encoding is not working" + result = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') + #assert result, "Encoding is not working" if __name__ == "__main__": diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index 573541ac9..b8626c4cf 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -0 +4 From 62fc719bb4113730683d92bf5fd334d9238e465b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 25 Oct 2023 14:26:22 +0300 Subject: [PATCH 23/68] fixed mentor's comments --- lab_2_tokenize_by_bpe/main.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 8068fce6e..166c43495 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -14,9 +14,11 @@ def prepare_word( :param end_of_word: a token that signifies the end of word :return: preprocessed word """ - if (not isinstance(raw_word, str) or - not (isinstance(start_of_word, str) or start_of_word is None) or - not (isinstance(end_of_word, str) or end_of_word is None)): + if not isinstance(raw_word, str): + return None + if not isinstance(start_of_word, str) and start_of_word is not None: + return None + if not isinstance(end_of_word, str) or end_of_word is not None: return None if not start_of_word and not end_of_word: @@ -44,12 +46,11 @@ def collect_frequencies( dict_of_freq = {} words = text.split() - for word in words: - prepr_word = prepare_word(word, None, '') + for word in set(words): + prepr_word = prepare_word(word, start_of_word, end_of_word) if prepr_word is None: return None - if prepr_word not in dict_of_freq: - dict_of_freq[prepr_word] = words.count(word) + dict_of_freq[prepr_word] = words.count(word) return dict_of_freq @@ -62,6 +63,10 @@ def count_tokens_pairs( :param word_frequencies: dictionary in the form of :return: dictionary in the form of """ + if not isinstance(word_frequencies, dict): + return None + + def merge_tokens( From aaa23f39804e85b1f3c4947c1304d8885834e09d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 26 Oct 2023 01:20:04 +0300 Subject: [PATCH 24/68] fixed unittests --- lab_2_tokenize_by_bpe/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 166c43495..e3ffe07c8 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -18,7 +18,7 @@ def prepare_word( return None if not isinstance(start_of_word, str) and start_of_word is not None: return None - if not isinstance(end_of_word, str) or end_of_word is not None: + if not isinstance(end_of_word, str) and end_of_word is not None: return None if not start_of_word and not end_of_word: From 2083fa54322b7dd96a8eb5d84b4d729e2d4b6c97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Mon, 30 Oct 2023 21:07:53 +0300 Subject: 
[PATCH 25/68] code for 6 --- lab_2_tokenize_by_bpe/main.py | 55 ++++++++++++++++++++++++++++++++++ lab_2_tokenize_by_bpe/start.py | 7 +++-- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index e3ffe07c8..ba4d17b63 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -66,6 +66,18 @@ def count_tokens_pairs( if not isinstance(word_frequencies, dict): return None + result_dic = {} + for pair in word_frequencies.items(): + word = pair[0] + count = pair[1] + for i in range(len(word) - 1): + token1 = word[i] + token2 = word[i + 1] + if not result_dic.get((token1, token2)): + result_dic[(token1, token2)] = 0 + result_dic[(token1, token2)] += count + + return result_dic @@ -78,6 +90,23 @@ def merge_tokens( :param pair: a pair of tokens to be merged :return: dictionary in the form of """ + if (not isinstance(word_frequencies, dict) or + not isinstance(pair, tuple)): + return None + + new_word_freq = {} + for pairs in word_frequencies.items(): + word = pairs[0] + count = pairs[1] + new_word = [] + for i in range(len(word) - 1): + if word[i] == pair[0] and word[i + 1] == pair[1]: + new_word.append((pair[0] + pair[1])) + else: + new_word.append(word[i]) + new_word_freq[tuple(new_word)] = count + + return new_word_freq def train( @@ -89,6 +118,32 @@ def train( :param num_merges: required number of new tokens :return: dictionary in the form of """ + if (not isinstance(word_frequencies, dict) or + not isinstance(num_merges, int)): + return None + + dict_pairs = count_tokens_pairs(word_frequencies) + if dict_pairs is None: + return None + num_merges = min(num_merges, len(dict_pairs)) + + for iteration in range(num_merges): + max_value = max(dict_pairs.values()) + value_list = [key for key, value in dict_pairs.items() if value == max_value] + + max_len = max(len(''.join(pair)) for pair in value_list) + len_list = [pair for pair in value_list if len(''.join(pair)) == max_len] + + word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) + if word_frequencies is None: + return None + + #dict_pairs.pop(sorted(len_list)[0]) + dict_pairs = count_tokens_pairs(word_frequencies) + if dict_pairs is None: + return None + + return word_frequencies def get_vocabulary( diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 04df67fb7..b8fe5f04a 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -13,8 +13,11 @@ def main() -> None: with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - result = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') - #assert result, "Encoding is not working" + word_freq = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') + print(lab_2_tokenize_by_bpe.main.train(word_freq, 100)) + + # result = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') + # assert result, "Encoding is not working" if __name__ == "__main__": From fab91f409c24d075e7dfdd495c9e999f38ec1104 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Mon, 30 Oct 2023 21:17:38 +0300 Subject: [PATCH 26/68] forgot to change the score meow --- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index b8626c4cf..1e8b31496 100644 --- 
a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -4 +6 From 4ff90c203165f41bdf02880184ebc412b256bd87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Mon, 30 Oct 2023 21:28:34 +0300 Subject: [PATCH 27/68] fixed style i hope --- lab_2_tokenize_by_bpe/start.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index b8fe5f04a..a7921e08d 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -2,7 +2,7 @@ BPE Tokenizer starter """ from pathlib import Path -import lab_2_tokenize_by_bpe.main +import lab_2_tokenize_by_bpe.main as main_file def main() -> None: @@ -13,8 +13,8 @@ def main() -> None: with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - word_freq = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') - print(lab_2_tokenize_by_bpe.main.train(word_freq, 100)) + word_freq = main_file.collect_frequencies(text, None, '') + print(main_file.train(word_freq, 100)) # result = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') # assert result, "Encoding is not working" From 9ecb29a766cc40c1120fa0468aa08c07db7931b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Mon, 30 Oct 2023 21:36:02 +0300 Subject: [PATCH 28/68] fixed style i hope [2] --- lab_2_tokenize_by_bpe/start.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index a7921e08d..fc080fa50 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -2,6 +2,7 @@ BPE Tokenizer starter """ from pathlib import Path + import lab_2_tokenize_by_bpe.main as main_file From 870ab1b8b35431039d0fe87b31d5547a25f6a9e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 31 Oct 2023 19:42:59 +0300 Subject: [PATCH 29/68] fixed part of comments --- lab_2_tokenize_by_bpe/main.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index ba4d17b63..cc3a8b067 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -21,7 +21,7 @@ def prepare_word( if not isinstance(end_of_word, str) and end_of_word is not None: return None - if not start_of_word and not end_of_word: + if not (start_of_word and end_of_word): return tuple(list(raw_word)) if not end_of_word: return tuple([start_of_word] + list(raw_word)) @@ -67,15 +67,13 @@ def count_tokens_pairs( return None result_dic = {} - for pair in word_frequencies.items(): - word = pair[0] - count = pair[1] + for word, freq in word_frequencies.items(): for i in range(len(word) - 1): token1 = word[i] token2 = word[i + 1] if not result_dic.get((token1, token2)): result_dic[(token1, token2)] = 0 - result_dic[(token1, token2)] += count + result_dic[(token1, token2)] += freq return result_dic @@ -95,16 +93,14 @@ def merge_tokens( return None new_word_freq = {} - for pairs in word_frequencies.items(): - word = pairs[0] - count = pairs[1] + for word, freq in word_frequencies.items(): new_word = [] for i in range(len(word) - 1): if word[i] == pair[0] and word[i + 1] == pair[1]: new_word.append((pair[0] + pair[1])) 
else: new_word.append(word[i]) - new_word_freq[tuple(new_word)] = count + new_word_freq[tuple(new_word)] = freq return new_word_freq From 272d46779c16ed2b4a32fb60ddc4de92113bca73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 1 Nov 2023 23:51:16 +0300 Subject: [PATCH 30/68] trying to fix all............. --- lab_2_tokenize_by_bpe/main.py | 69 ++++++++++++++++++---------------- lab_2_tokenize_by_bpe/start.py | 2 +- 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index cc3a8b067..09b8d0857 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -21,13 +21,13 @@ def prepare_word( if not isinstance(end_of_word, str) and end_of_word is not None: return None - if not (start_of_word and end_of_word): - return tuple(list(raw_word)) - if not end_of_word: - return tuple([start_of_word] + list(raw_word)) - if not start_of_word: - return tuple(list(raw_word) + [end_of_word]) - return tuple([start_of_word] + list(raw_word) + [end_of_word]) + tokens = [] + if start_of_word is not None: + tokens.append(start_of_word) + tokens.extend(list(raw_word)) + if end_of_word is not None: + tokens.append(end_of_word) + return tuple(tokens) def collect_frequencies( text: str, start_of_word: str | None, end_of_word: str @@ -46,11 +46,11 @@ def collect_frequencies( dict_of_freq = {} words = text.split() - for word in set(words): - prepr_word = prepare_word(word, start_of_word, end_of_word) - if prepr_word is None: + prepr_words = [prepare_word(word, start_of_word, end_of_word) for word in words] + for word in set(prepr_words): + if word is None: return None - dict_of_freq[prepr_word] = words.count(word) + dict_of_freq[word] = prepr_words.count(word) return dict_of_freq @@ -67,13 +67,13 @@ def count_tokens_pairs( return None result_dic = {} - for word, freq in word_frequencies.items(): + for word in word_frequencies: for i in range(len(word) - 1): token1 = word[i] token2 = word[i + 1] if not result_dic.get((token1, token2)): result_dic[(token1, token2)] = 0 - result_dic[(token1, token2)] += freq + result_dic[(token1, token2)] += word_frequencies[word] return result_dic @@ -92,15 +92,20 @@ def merge_tokens( not isinstance(pair, tuple)): return None - new_word_freq = {} + new_word_freq = word_frequencies.copy() for word, freq in word_frequencies.items(): - new_word = [] - for i in range(len(word) - 1): - if word[i] == pair[0] and word[i + 1] == pair[1]: - new_word.append((pair[0] + pair[1])) - else: - new_word.append(word[i]) - new_word_freq[tuple(new_word)] = freq + if pair[0] and pair[1] in word: + new_word = [] + for i in range(len(word) - 1): + if word[i] == pair[1] and word[i - 1] == pair[0]: + pass + elif word[i] == pair[0] and word[i + 1] == pair[1]: + new_word.append((pair[0] + pair[1])) + else: + new_word.append(word[i]) + + value = new_word_freq.pop(word) + new_word_freq[tuple(new_word)] = value return new_word_freq @@ -124,20 +129,20 @@ def train( num_merges = min(num_merges, len(dict_pairs)) for iteration in range(num_merges): - max_value = max(dict_pairs.values()) - value_list = [key for key, value in dict_pairs.items() if value == max_value] + if dict_pairs != {}: + max_value = max(dict_pairs.values()) + value_list = [key for key, value in dict_pairs.items() if value == max_value] - max_len = max(len(''.join(pair)) for pair in value_list) - len_list = [pair for pair in value_list if len(''.join(pair)) == 
max_len] + max_len = max(len(''.join(pair)) for pair in value_list) + len_list = [pair for pair in value_list if len(''.join(pair)) == max_len] - word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) - if word_frequencies is None: - return None + word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) + if word_frequencies is None: + return None - #dict_pairs.pop(sorted(len_list)[0]) - dict_pairs = count_tokens_pairs(word_frequencies) - if dict_pairs is None: - return None + dict_pairs = count_tokens_pairs(word_frequencies) + if dict_pairs is None: + return None return word_frequencies diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index fc080fa50..33299b934 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -17,7 +17,7 @@ def main() -> None: word_freq = main_file.collect_frequencies(text, None, '') print(main_file.train(word_freq, 100)) - # result = lab_2_tokenize_by_bpe.main.collect_frequencies(text, None, '') + # result = main_file.train(word_freq, 100) # assert result, "Encoding is not working" From 82363ad60dc6a4f238fabb2d4bddf42121983d5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 2 Nov 2023 12:31:02 +0300 Subject: [PATCH 31/68] trying to fix all............. --- lab_2_tokenize_by_bpe/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 09b8d0857..896520443 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -93,9 +93,9 @@ def merge_tokens( return None new_word_freq = word_frequencies.copy() - for word, freq in word_frequencies.items(): - if pair[0] and pair[1] in word: - new_word = [] + for word in word_frequencies: + new_word = [] + if pair[0] in word and pair[1] in word: for i in range(len(word) - 1): if word[i] == pair[1] and word[i - 1] == pair[0]: pass From 454a136f63eeedd6527979193e7baff7281c0536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 3 Nov 2023 00:43:45 +0300 Subject: [PATCH 32/68] fixed merge tokens --- lab_2_tokenize_by_bpe/main.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 896520443..bf22fffe9 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -69,11 +69,10 @@ def count_tokens_pairs( result_dic = {} for word in word_frequencies: for i in range(len(word) - 1): - token1 = word[i] - token2 = word[i + 1] - if not result_dic.get((token1, token2)): - result_dic[(token1, token2)] = 0 - result_dic[(token1, token2)] += word_frequencies[word] + pair = word[i:i + 2] + if not result_dic.get(pair): + result_dic[pair] = 0 + result_dic[pair] += word_frequencies[word] return result_dic @@ -92,20 +91,17 @@ def merge_tokens( not isinstance(pair, tuple)): return None - new_word_freq = word_frequencies.copy() + new_word_freq = {} for word in word_frequencies: - new_word = [] + new_word = list(word) if pair[0] in word and pair[1] in word: for i in range(len(word) - 1): - if word[i] == pair[1] and word[i - 1] == pair[0]: - pass - elif word[i] == pair[0] and word[i + 1] == pair[1]: - new_word.append((pair[0] + pair[1])) - else: - new_word.append(word[i]) - - value = new_word_freq.pop(word) - 
new_word_freq[tuple(new_word)] = value + current_pair = tuple([word[i], word[i+1]]) + if current_pair == pair: + new_word.pop(i+1) + new_word[i] = pair[0] + pair[1] + + new_word_freq[tuple(new_word)] = word_frequencies[word] return new_word_freq From 4dd99dad76d229796a767fb4d450f59cccf7fe65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 3 Nov 2023 00:49:28 +0300 Subject: [PATCH 33/68] Artem Mikhailovich do not worry please --- lab_2_tokenize_by_bpe/start.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 33299b934..77333bbf1 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -15,10 +15,9 @@ def main() -> None: text = text_file.read() word_freq = main_file.collect_frequencies(text, None, '') - print(main_file.train(word_freq, 100)) - # result = main_file.train(word_freq, 100) - # assert result, "Encoding is not working" + result = main_file.train(word_freq, 100) + assert result, "Encoding is not working" if __name__ == "__main__": From f6e49a98c5034dd6ac767a6f01ad18395621e099 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 3 Nov 2023 00:55:17 +0300 Subject: [PATCH 34/68] no more bad check --- lab_2_tokenize_by_bpe/main.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index bf22fffe9..47d23bc8a 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -125,20 +125,19 @@ def train( num_merges = min(num_merges, len(dict_pairs)) for iteration in range(num_merges): - if dict_pairs != {}: - max_value = max(dict_pairs.values()) - value_list = [key for key, value in dict_pairs.items() if value == max_value] + max_value = max(dict_pairs.values()) + value_list = [key for key, value in dict_pairs.items() if value == max_value] - max_len = max(len(''.join(pair)) for pair in value_list) - len_list = [pair for pair in value_list if len(''.join(pair)) == max_len] + max_len = max(len(''.join(pair)) for pair in value_list) + len_list = [pair for pair in value_list if len(''.join(pair)) == max_len] - word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) - if word_frequencies is None: - return None + word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) + if word_frequencies is None: + return None - dict_pairs = count_tokens_pairs(word_frequencies) - if dict_pairs is None: - return None + dict_pairs = count_tokens_pairs(word_frequencies) + if dict_pairs is None: + return None return word_frequencies From 1cfb086ec0ade7a64d552968c7955c5c1963fa04 Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Fri, 3 Nov 2023 17:12:59 +0300 Subject: [PATCH 35/68] checkout labs from the origin repository --- lab_2_tokenize_by_bpe/main.py | 253 +++++-------------------- lab_2_tokenize_by_bpe/start.py | 36 +--- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 3 files changed, 55 insertions(+), 236 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 213e455f6..47d23bc8a 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -2,8 +2,6 @@ Lab 2 BPE and machine translation evaluation """ -import json -import math def prepare_word( @@ -16,16 +14,12 @@ def prepare_word( :param 
end_of_word: a token that signifies the end of word :return: preprocessed word """ - if not isinstance(raw_word, str) or not (isinstance( - start_of_word, str) or start_of_word is None) or not ( - isinstance(end_of_word, str) or end_of_word is None): + if not isinstance(raw_word, str): + return None + if not isinstance(start_of_word, str) and start_of_word is not None: + return None + if not isinstance(end_of_word, str) and end_of_word is not None: return None - list_of_tokens = list(raw_word) - if end_of_word: - list_of_tokens.append(end_of_word) - if start_of_word: - list_of_tokens.insert(0, start_of_word) - return tuple(list_of_tokens) tokens = [] if start_of_word is not None: @@ -45,20 +39,20 @@ def collect_frequencies( :param end_of_word: a token that signifies the end of word :return: dictionary in the form of """ - if not isinstance(text, str) or not isinstance(end_of_word, str) or not ( - isinstance(start_of_word, str) or start_of_word is None): + if (not isinstance(text, str) or + not (isinstance(start_of_word, str) or start_of_word is None) or + not isinstance(end_of_word, str)): return None - dict_frequencies = {} - - splitted_text = text.split() - for i in set(splitted_text): - word = prepare_word(i, start_of_word, end_of_word) - if not word: + dict_of_freq = {} + words = text.split() + prepr_words = [prepare_word(word, start_of_word, end_of_word) for word in words] + for word in set(prepr_words): + if word is None: return None - dict_frequencies[word] = splitted_text.count(i) + dict_of_freq[word] = prepr_words.count(word) - return dict_frequencies + return dict_of_freq def count_tokens_pairs( @@ -72,16 +66,16 @@ def count_tokens_pairs( if not isinstance(word_frequencies, dict): return None - dict_with_pairs = {} - + result_dic = {} for word in word_frequencies: - for index in range(len(word) - 1): - pair = (word[index], word[index + 1]) - if pair not in dict_with_pairs: - dict_with_pairs[pair] = 0 - dict_with_pairs[pair] += word_frequencies[word] + for i in range(len(word) - 1): + pair = word[i:i + 2] + if not result_dic.get(pair): + result_dic[pair] = 0 + result_dic[pair] += word_frequencies[word] + + return result_dic - return dict_with_pairs def merge_tokens( @@ -93,24 +87,23 @@ def merge_tokens( :param pair: a pair of tokens to be merged :return: dictionary in the form of """ - if not isinstance(word_frequencies, dict) or not isinstance(pair, tuple): + if (not isinstance(word_frequencies, dict) or + not isinstance(pair, tuple)): return None - dict_merged_tokens = {} - for i in word_frequencies: - list_word = list(i) - for index in range(len(list_word) - 1): - if (i[index], i[index + 1]) == pair: - list_word[index + 1] = pair[0] + pair[1] - list_word[index] = '' + new_word_freq = {} + for word in word_frequencies: + new_word = list(word) + if pair[0] in word and pair[1] in word: + for i in range(len(word) - 1): + current_pair = tuple([word[i], word[i+1]]) + if current_pair == pair: + new_word.pop(i+1) + new_word[i] = pair[0] + pair[1] - if '' in list_word: - list_word.remove('') - dict_merged_tokens.update({tuple(list_word): word_frequencies[i]}) - else: - dict_merged_tokens.update({i: word_frequencies[i]}) + new_word_freq[tuple(new_word)] = word_frequencies[word] - return dict_merged_tokens + return new_word_freq def train( @@ -122,31 +115,28 @@ def train( :param num_merges: required number of new tokens :return: dictionary in the form of """ - if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int): + if (not isinstance(word_frequencies, dict) or + 
not isinstance(num_merges, int)): return None - dict_with_pairs = count_tokens_pairs(word_frequencies) - if not dict_with_pairs: + dict_pairs = count_tokens_pairs(word_frequencies) + if dict_pairs is None: return None - merges = min(num_merges, len(dict_with_pairs)) - - for i in range(merges): + num_merges = min(num_merges, len(dict_pairs)) - max_values = max(dict_with_pairs.values()) - pairs_max_values = [i for i in dict_with_pairs if dict_with_pairs[i] == max_values] + for iteration in range(num_merges): + max_value = max(dict_pairs.values()) + value_list = [key for key, value in dict_pairs.items() if value == max_value] - max_len = max(len(str(pair)) for pair in pairs_max_values) - pairs_max_len = [i for i in pairs_max_values if len(str(i)) == max_len] + max_len = max(len(''.join(pair)) for pair in value_list) + len_list = [pair for pair in value_list if len(''.join(pair)) == max_len] - sorted_pairs = sorted(pairs_max_len) - word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0]) - - if not word_frequencies: + word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) + if word_frequencies is None: return None - dict_with_pairs = count_tokens_pairs(word_frequencies) - - if not dict_with_pairs: + dict_pairs = count_tokens_pairs(word_frequencies) + if dict_pairs is None: return None return word_frequencies @@ -161,26 +151,6 @@ def get_vocabulary( :param unknown_token: a token to signify an unknown token :return: dictionary in the form of """ - if not isinstance(word_frequencies, dict) or not isinstance(unknown_token, str): - return None - - dict_ident = {} - unique_tokens = set() - - for tuple_tokens in word_frequencies.keys(): - for word in tuple_tokens: - unique_tokens.update(tuple_tokens, word) - - unique_tokens.add(unknown_token) - lex_sorted = sorted(unique_tokens) - len_sorted = sorted(lex_sorted, key=len, reverse=True) - index = 0 - - for token in len_sorted: - dict_ident[token] = index - index += 1 - - return dict_ident def decode( @@ -193,20 +163,6 @@ def decode( :param end_of_word_token: an end-of-word token :return: decoded sequence """ - if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict) or not (isinstance( - end_of_word_token, str) or end_of_word_token is None): - return None - decoded = '' - for identifier in encoded_text: - token_list = [key for key in vocabulary if vocabulary[key] == identifier] - - for token in token_list: - decoded += token - - if end_of_word_token: - decoded = decoded.replace(end_of_word_token, ' ') - - return decoded def tokenize_word( @@ -220,27 +176,6 @@ def tokenize_word( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ - if not isinstance(word, tuple) or not isinstance(vocabulary, dict) or not (isinstance( - end_of_word, str) or end_of_word is None) or not isinstance(unknown_token, str): - return None - - word_copy = ''.join(word) - sorted_vocabulary = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x)) - result = [] - - for key in sorted_vocabulary: - while key in word_copy: - index = word_copy.count(' ', 0, word_copy.find(key)) - result.insert(index, vocabulary[key]) - word_copy = word_copy.replace(key, ' ', 1) - - for unk in word_copy: - if unk != ' ': - index = word_copy.find(unk) - word_copy = word_copy.replace(unk, ' ') - result.insert(index, vocabulary[unknown_token]) - - return result def load_vocabulary(vocab_path: str) -> dict[str, int] | None: @@ -249,16 +184,6 @@ def load_vocabulary(vocab_path: str) -> dict[str, int] | None: :param 
vocab_path: path to the saved vocabulary :return: dictionary in the form of """ - if not isinstance(vocab_path, str): - return None - - with open(vocab_path, 'r', encoding='utf-8') as f: - vocab = json.load(f) - - if not isinstance(vocab, dict): - return None - - return vocab def encode( @@ -277,26 +202,6 @@ def encode( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ - if not isinstance(original_text, str) or not isinstance( - vocabulary, dict) or not (isinstance( - start_of_word_token, str) or start_of_word_token is None) or not (isinstance( - end_of_word_token, str) or end_of_word_token is None) or not isinstance( - unknown_token, str): - return None - - encoded = [] - split_text = original_text.split() - - for word in split_text: - prepared = prepare_word(word, start_of_word_token, end_of_word_token) - if not prepared: - return None - result = tokenize_word(prepared, vocabulary, end_of_word_token, unknown_token) - if not result: - return None - encoded.extend(result) - - return encoded def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: @@ -306,14 +211,6 @@ def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: :param order: required number of elements in a single n-gram :return: sequence of n-grams """ - if not isinstance(text, str) or not isinstance(order, int): - return None - - n_grams = [] - for index in range(len(text) + 1 - order): - n_grams.append(tuple(text[index: index + order])) - - return n_grams def calculate_precision( @@ -325,17 +222,6 @@ def calculate_precision( :param reference: expected sequence of n-grams :return: value of Precision metric """ - if not isinstance(actual, list) or not isinstance(reference, list): - return None - - unique_ngrams = set(reference) - matches = 0 - - for n_gram in unique_ngrams: - if n_gram in actual: - matches += 1 - - return matches / len(unique_ngrams) def geo_mean(precisions: list[float], max_order: int) -> float | None: @@ -345,17 +231,6 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None: :param max_order: maximum length of n-gram considered :return: value of geometric mean of Precision metric """ - if not isinstance(precisions, list) or not isinstance(max_order, int): - return None - - summation = float(0) - - for order in range(max_order): - if precisions[order] < 0: - return 0 - summation += math.log(precisions[order]) - - return math.exp(1 / max_order * summation) def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None: @@ -366,31 +241,3 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl :param max_order: max length of n-gram to consider for comparison :return: value of BLEU metric """ - if not isinstance(actual, str) or not isinstance( - reference, str) or max_order != 3: - return None - - actual_ngrams = [] - reference_ngrams = [] - - for order in range(max_order): - actual_ngram = collect_ngrams(actual, order + 1) - reference_ngram = collect_ngrams(reference, order + 1) - if actual_ngram is None or reference_ngram is None: - return None - actual_ngrams.append(actual_ngram) - reference_ngrams.append(reference_ngram) - - precisions = [] - - for i, j in zip(actual_ngrams, reference_ngrams): - precision = calculate_precision(i, j) - if precision is None: - return None - precisions.append(precision) - - average = geo_mean(precisions, max_order) - if average is None: - return None - - return average * 100 diff --git a/lab_2_tokenize_by_bpe/start.py 
b/lab_2_tokenize_by_bpe/start.py index d71b1c9c4..77333bbf1 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -1,11 +1,9 @@ """ BPE Tokenizer starter """ -import json from pathlib import Path -from lab_2_tokenize_by_bpe.main import (calculate_bleu, collect_frequencies, decode, encode, - get_vocabulary, train) +import lab_2_tokenize_by_bpe.main as main_file def main() -> None: @@ -15,37 +13,11 @@ def main() -> None: assets_path = Path(__file__).parent / 'assets' with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() - with open(assets_path / 'secrets/secret_2.txt', 'r', encoding='utf-8') as text_file: - encoded_secret = text_file.read() - dict_frequencies = collect_frequencies(text, None, '') - merged_tokens = train(dict_frequencies, 100) - if merged_tokens: - vocabulary = get_vocabulary(merged_tokens, '') - secret = [int(num) for num in encoded_secret.split()] - result = decode(secret, vocabulary, '') - print(result) - assert result, "Encoding is not working" - with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file: - predicted = file.read() - with open(assets_path / 'vocab.json', 'r', encoding='utf-8') as file: - vocabulary = json.load(file) - with open(assets_path / 'for_translation_ru_encoded.txt', 'r', encoding='utf-8') as file: - actual = file.read() + word_freq = main_file.collect_frequencies(text, None, '') - if [int(token) for token in actual.split()] == encode( - predicted, vocabulary, '\u2581', None, ''): - print("Encoding is successful!") - - with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file: - encoded_en = file.read() - with open(assets_path / 'for_translation_en_raw.txt', 'r', encoding='utf-8') as file: - decoded_en = file.read() - - decoded = decode([int(num) for num in encoded_en.split()], vocabulary, None) - decoded = decoded.replace('\u2581', ' ') - - print(calculate_bleu(decoded, decoded_en)) + result = main_file.train(word_freq, 100) + assert result, "Encoding is not working" if __name__ == "__main__": diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index f599e28b8..1e8b31496 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -10 +6 From accf69ee5a029fa274ebb3412e2c982f585b26ca Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Fri, 3 Nov 2023 17:23:12 +0300 Subject: [PATCH 36/68] checkout labs from the origin repository --- lab_2_tokenize_by_bpe/main.py | 260 +++++++++++++++++++------ lab_2_tokenize_by_bpe/start.py | 36 +++- lab_2_tokenize_by_bpe/target_score.txt | 2 +- 3 files changed, 236 insertions(+), 62 deletions(-) diff --git a/lab_2_tokenize_by_bpe/main.py b/lab_2_tokenize_by_bpe/main.py index 47d23bc8a..19a72913f 100644 --- a/lab_2_tokenize_by_bpe/main.py +++ b/lab_2_tokenize_by_bpe/main.py @@ -2,6 +2,8 @@ Lab 2 BPE and machine translation evaluation """ +import json +import math def prepare_word( @@ -14,20 +16,17 @@ def prepare_word( :param end_of_word: a token that signifies the end of word :return: preprocessed word """ - if not isinstance(raw_word, str): - return None - if not isinstance(start_of_word, str) and start_of_word is not None: - return None - if not isinstance(end_of_word, str) and end_of_word is not None: + if not isinstance(raw_word, str) or not (isinstance( + start_of_word, str) or start_of_word is None) or not ( + isinstance(end_of_word, str) or end_of_word is None): return None + list_of_tokens = 
list(raw_word) + if end_of_word: + list_of_tokens.append(end_of_word) + if start_of_word: + list_of_tokens.insert(0, start_of_word) + return tuple(list_of_tokens) - tokens = [] - if start_of_word is not None: - tokens.append(start_of_word) - tokens.extend(list(raw_word)) - if end_of_word is not None: - tokens.append(end_of_word) - return tuple(tokens) def collect_frequencies( text: str, start_of_word: str | None, end_of_word: str @@ -39,20 +38,20 @@ def collect_frequencies( :param end_of_word: a token that signifies the end of word :return: dictionary in the form of """ - if (not isinstance(text, str) or - not (isinstance(start_of_word, str) or start_of_word is None) or - not isinstance(end_of_word, str)): + if not isinstance(text, str) or not isinstance(end_of_word, str) or not ( + isinstance(start_of_word, str) or start_of_word is None): return None - dict_of_freq = {} - words = text.split() - prepr_words = [prepare_word(word, start_of_word, end_of_word) for word in words] - for word in set(prepr_words): - if word is None: + dict_frequencies = {} + + splitted_text = text.split() + for i in set(splitted_text): + word = prepare_word(i, start_of_word, end_of_word) + if not word: return None - dict_of_freq[word] = prepr_words.count(word) + dict_frequencies[word] = splitted_text.count(i) - return dict_of_freq + return dict_frequencies def count_tokens_pairs( @@ -66,16 +65,16 @@ def count_tokens_pairs( if not isinstance(word_frequencies, dict): return None - result_dic = {} - for word in word_frequencies: - for i in range(len(word) - 1): - pair = word[i:i + 2] - if not result_dic.get(pair): - result_dic[pair] = 0 - result_dic[pair] += word_frequencies[word] + dict_with_pairs = {} - return result_dic + for word in word_frequencies: + for index in range(len(word) - 1): + pair = (word[index], word[index + 1]) + if pair not in dict_with_pairs: + dict_with_pairs[pair] = 0 + dict_with_pairs[pair] += word_frequencies[word] + return dict_with_pairs def merge_tokens( @@ -87,23 +86,24 @@ def merge_tokens( :param pair: a pair of tokens to be merged :return: dictionary in the form of """ - if (not isinstance(word_frequencies, dict) or - not isinstance(pair, tuple)): + if not isinstance(word_frequencies, dict) or not isinstance(pair, tuple): return None + dict_merged_tokens = {} + for i in word_frequencies: + list_word = list(i) - new_word_freq = {} - for word in word_frequencies: - new_word = list(word) - if pair[0] in word and pair[1] in word: - for i in range(len(word) - 1): - current_pair = tuple([word[i], word[i+1]]) - if current_pair == pair: - new_word.pop(i+1) - new_word[i] = pair[0] + pair[1] + for index in range(len(list_word) - 1): + if (i[index], i[index + 1]) == pair: + list_word[index + 1] = pair[0] + pair[1] + list_word[index] = '' - new_word_freq[tuple(new_word)] = word_frequencies[word] + if '' in list_word: + list_word.remove('') + dict_merged_tokens.update({tuple(list_word): word_frequencies[i]}) + else: + dict_merged_tokens.update({i: word_frequencies[i]}) - return new_word_freq + return dict_merged_tokens def train( @@ -115,28 +115,31 @@ def train( :param num_merges: required number of new tokens :return: dictionary in the form of """ - if (not isinstance(word_frequencies, dict) or - not isinstance(num_merges, int)): + if not isinstance(word_frequencies, dict) or not isinstance(num_merges, int): return None + dict_with_pairs = count_tokens_pairs(word_frequencies) - dict_pairs = count_tokens_pairs(word_frequencies) - if dict_pairs is None: + if not dict_with_pairs: return None - 
num_merges = min(num_merges, len(dict_pairs)) + merges = min(num_merges, len(dict_with_pairs)) + + for i in range(merges): - for iteration in range(num_merges): - max_value = max(dict_pairs.values()) - value_list = [key for key, value in dict_pairs.items() if value == max_value] + max_values = max(dict_with_pairs.values()) + pairs_max_values = [i for i in dict_with_pairs if dict_with_pairs[i] == max_values] - max_len = max(len(''.join(pair)) for pair in value_list) - len_list = [pair for pair in value_list if len(''.join(pair)) == max_len] + max_len = max(len(str(pair)) for pair in pairs_max_values) + pairs_max_len = [i for i in pairs_max_values if len(str(i)) == max_len] - word_frequencies = merge_tokens(word_frequencies, sorted(len_list)[0]) - if word_frequencies is None: + sorted_pairs = sorted(pairs_max_len) + word_frequencies = merge_tokens(word_frequencies, sorted_pairs[0]) + + if not word_frequencies: return None - dict_pairs = count_tokens_pairs(word_frequencies) - if dict_pairs is None: + dict_with_pairs = count_tokens_pairs(word_frequencies) + + if not dict_with_pairs: return None return word_frequencies @@ -151,6 +154,26 @@ def get_vocabulary( :param unknown_token: a token to signify an unknown token :return: dictionary in the form of """ + if not isinstance(word_frequencies, dict) or not isinstance(unknown_token, str): + return None + + dict_ident = {} + unique_tokens = set() + + for tuple_tokens in word_frequencies.keys(): + for word in tuple_tokens: + unique_tokens.update(tuple_tokens, word) + + unique_tokens.add(unknown_token) + lex_sorted = sorted(unique_tokens) + len_sorted = sorted(lex_sorted, key=len, reverse=True) + index = 0 + + for token in len_sorted: + dict_ident[token] = index + index += 1 + + return dict_ident def decode( @@ -163,6 +186,20 @@ def decode( :param end_of_word_token: an end-of-word token :return: decoded sequence """ + if not isinstance(encoded_text, list) or not isinstance(vocabulary, dict) or not (isinstance( + end_of_word_token, str) or end_of_word_token is None): + return None + decoded = '' + for identifier in encoded_text: + token_list = [key for key in vocabulary if vocabulary[key] == identifier] + + for token in token_list: + decoded += token + + if end_of_word_token: + decoded = decoded.replace(end_of_word_token, ' ') + + return decoded def tokenize_word( @@ -176,6 +213,27 @@ def tokenize_word( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ + if not isinstance(word, tuple) or not isinstance(vocabulary, dict) or not (isinstance( + end_of_word, str) or end_of_word is None) or not isinstance(unknown_token, str): + return None + + word_copy = ''.join(word) + sorted_vocabulary = sorted(list(vocabulary.keys()), key=lambda x: (-len(x), x)) + result = [] + + for key in sorted_vocabulary: + while key in word_copy: + index = word_copy.count(' ', 0, word_copy.find(key)) + result.insert(index, vocabulary[key]) + word_copy = word_copy.replace(key, ' ', 1) + + for unk in word_copy: + if unk != ' ': + index = word_copy.find(unk) + word_copy = word_copy.replace(unk, ' ') + result.insert(index, vocabulary[unknown_token]) + + return result def load_vocabulary(vocab_path: str) -> dict[str, int] | None: @@ -184,6 +242,16 @@ def load_vocabulary(vocab_path: str) -> dict[str, int] | None: :param vocab_path: path to the saved vocabulary :return: dictionary in the form of """ + if not isinstance(vocab_path, str): + return None + + with open(vocab_path, 'r', encoding='utf-8') as f: + vocab = json.load(f) + + if not 
isinstance(vocab, dict): + return None + + return vocab def encode( @@ -202,6 +270,26 @@ def encode( :param unknown_token: token that signifies unknown sequence :return: list of token identifiers """ + if not isinstance(original_text, str) or not isinstance( + vocabulary, dict) or not (isinstance( + start_of_word_token, str) or start_of_word_token is None) or not (isinstance( + end_of_word_token, str) or end_of_word_token is None) or not isinstance( + unknown_token, str): + return None + + encoded = [] + split_text = original_text.split() + + for word in split_text: + prepared = prepare_word(word, start_of_word_token, end_of_word_token) + if not prepared: + return None + result = tokenize_word(prepared, vocabulary, end_of_word_token, unknown_token) + if not result: + return None + encoded.extend(result) + + return encoded def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: @@ -211,6 +299,14 @@ def collect_ngrams(text: str, order: int) -> list[tuple[str, ...]] | None: :param order: required number of elements in a single n-gram :return: sequence of n-grams """ + if not isinstance(text, str) or not isinstance(order, int): + return None + + n_grams = [] + for index in range(len(text) + 1 - order): + n_grams.append(tuple(text[index: index + order])) + + return n_grams def calculate_precision( @@ -222,6 +318,17 @@ def calculate_precision( :param reference: expected sequence of n-grams :return: value of Precision metric """ + if not isinstance(actual, list) or not isinstance(reference, list): + return None + + unique_ngrams = set(reference) + matches = 0 + + for n_gram in unique_ngrams: + if n_gram in actual: + matches += 1 + + return matches / len(unique_ngrams) def geo_mean(precisions: list[float], max_order: int) -> float | None: @@ -231,6 +338,17 @@ def geo_mean(precisions: list[float], max_order: int) -> float | None: :param max_order: maximum length of n-gram considered :return: value of geometric mean of Precision metric """ + if not isinstance(precisions, list) or not isinstance(max_order, int): + return None + + summation = float(0) + + for order in range(max_order): + if precisions[order] < 0: + return 0 + summation += math.log(precisions[order]) + + return math.exp(1 / max_order * summation) def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> float | None: @@ -241,3 +359,31 @@ def calculate_bleu(actual: str | None, reference: str, max_order: int = 3) -> fl :param max_order: max length of n-gram to consider for comparison :return: value of BLEU metric """ + if not isinstance(actual, str) or not isinstance( + reference, str) or max_order != 3: + return None + + actual_ngrams = [] + reference_ngrams = [] + + for order in range(max_order): + actual_ngram = collect_ngrams(actual, order + 1) + reference_ngram = collect_ngrams(reference, order + 1) + if actual_ngram is None or reference_ngram is None: + return None + actual_ngrams.append(actual_ngram) + reference_ngrams.append(reference_ngram) + + precisions = [] + + for i, j in zip(actual_ngrams, reference_ngrams): + precision = calculate_precision(i, j) + if precision is None: + return None + precisions.append(precision) + + average = geo_mean(precisions, max_order) + if average is None: + return None + + return average * 100 diff --git a/lab_2_tokenize_by_bpe/start.py b/lab_2_tokenize_by_bpe/start.py index 77333bbf1..d71b1c9c4 100644 --- a/lab_2_tokenize_by_bpe/start.py +++ b/lab_2_tokenize_by_bpe/start.py @@ -1,9 +1,11 @@ """ BPE Tokenizer starter """ +import json from pathlib import Path 
-import lab_2_tokenize_by_bpe.main as main_file +from lab_2_tokenize_by_bpe.main import (calculate_bleu, collect_frequencies, decode, encode, + get_vocabulary, train) def main() -> None: @@ -13,11 +15,37 @@ def main() -> None: assets_path = Path(__file__).parent / 'assets' with open(assets_path / 'text.txt', 'r', encoding='utf-8') as text_file: text = text_file.read() + with open(assets_path / 'secrets/secret_2.txt', 'r', encoding='utf-8') as text_file: + encoded_secret = text_file.read() + dict_frequencies = collect_frequencies(text, None, '') + merged_tokens = train(dict_frequencies, 100) + if merged_tokens: + vocabulary = get_vocabulary(merged_tokens, '') + secret = [int(num) for num in encoded_secret.split()] + result = decode(secret, vocabulary, '') + print(result) + assert result, "Encoding is not working" - word_freq = main_file.collect_frequencies(text, None, '') + with open(assets_path / 'for_translation_ru_raw.txt', 'r', encoding='utf-8') as file: + predicted = file.read() + with open(assets_path / 'vocab.json', 'r', encoding='utf-8') as file: + vocabulary = json.load(file) + with open(assets_path / 'for_translation_ru_encoded.txt', 'r', encoding='utf-8') as file: + actual = file.read() - result = main_file.train(word_freq, 100) - assert result, "Encoding is not working" + if [int(token) for token in actual.split()] == encode( + predicted, vocabulary, '\u2581', None, ''): + print("Encoding is successful!") + + with open(assets_path / 'for_translation_en_encoded.txt', 'r', encoding='utf-8') as file: + encoded_en = file.read() + with open(assets_path / 'for_translation_en_raw.txt', 'r', encoding='utf-8') as file: + decoded_en = file.read() + + decoded = decode([int(num) for num in encoded_en.split()], vocabulary, None) + decoded = decoded.replace('\u2581', ' ') + + print(calculate_bleu(decoded, decoded_en)) if __name__ == "__main__": diff --git a/lab_2_tokenize_by_bpe/target_score.txt b/lab_2_tokenize_by_bpe/target_score.txt index 1e8b31496..f599e28b8 100644 --- a/lab_2_tokenize_by_bpe/target_score.txt +++ b/lab_2_tokenize_by_bpe/target_score.txt @@ -1 +1 @@ -6 +10 From 5cd71b6e404f0e679c4f5214bc83b57e0c228488 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 15 Nov 2023 21:16:07 +0300 Subject: [PATCH 37/68] code for 4 --- lab_3_generate_by_ngrams/main.py | 96 +++++++++++++++++++++++ lab_3_generate_by_ngrams/start.py | 7 +- lab_3_generate_by_ngrams/target_score.txt | 2 +- 3 files changed, 103 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index dcf4e8af9..e909e7039 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -23,6 +23,8 @@ def __init__(self, end_of_word_token: str) -> None: Args: end_of_word_token (str): A token denoting word boundary """ + self._end_of_word_token = end_of_word_token + self._storage = {self._end_of_word_token: 0} def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: """ @@ -41,6 +43,22 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. 
""" + if not isinstance(text, str) or len(text) == 0: + return None + + tokenized_text = [] + for element in text.lower(): + if element.isalpha(): + tokenized_text.append(element) + elif element.isspace() and tokenized_text[-1] != self._end_of_word_token: + tokenized_text.append(self._end_of_word_token) + if not tokenized_text[-1].isalnum(): + tokenized_text.append(self._end_of_word_token) + + if len(tokenized_text) == 0: + return None + + return tuple(tokenized_text) def get_id(self, element: str) -> Optional[int]: """ @@ -55,6 +73,10 @@ def get_id(self, element: str) -> Optional[int]: In case of corrupt input arguments or arguments not included in storage, None is returned """ + if not isinstance(element, str) or element not in self._storage: + return None + + return self._storage[element] def get_end_of_word_token(self) -> str: """ @@ -63,6 +85,7 @@ def get_end_of_word_token(self) -> str: Returns: str: EoW token """ + return self._end_of_word_token def get_token(self, element_id: int) -> Optional[str]: """ @@ -76,6 +99,12 @@ def get_token(self, element_id: int) -> Optional[str]: In case of corrupt input arguments or arguments not included in storage, None is returned """ + if not isinstance(element_id, str) or element_id not in self._storage.values(): + return None + + for token, ident in self._storage.items(): + if element_id == ident: + return token def encode(self, text: str) -> Optional[tuple[int, ...]]: """ @@ -93,6 +122,26 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ + if not isinstance(text, str) or len(text) == 0: + return None + + tokenized_text = self._tokenize(text) + if tokenized_text is None: + return None + + for token in tokenized_text: + self._put(token) + if self._put(token) is None: + return None + + encoded_corpus = [] + for token in tokenized_text: + if self.get_id(token) is None: + return None + else: + encoded_corpus.append(self.get_id(token)) + + return tuple(encoded_corpus) def _put(self, element: str) -> None: """ @@ -104,6 +153,14 @@ def _put(self, element: str) -> None: In case of corrupt input arguments or invalid argument length, an element is not added to storage """ + if not isinstance(element, str) or len(element) != 1: + return None + + if element not in self._storage: + self._storage[element] = len(self._storage) + + return None + def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: """ @@ -121,6 +178,18 @@ def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. """ + if not isinstance(encoded_corpus, tuple) or len(encoded_corpus) == 0: + return None + + decoded_corpus = self._decode(encoded_corpus) + if decoded_corpus is None: + return None + + resulting_text = self._postprocess_decoded_text(decoded_corpus) + if resulting_text is None: + return None + + return resulting_text def fill_from_ngrams(self, content: dict) -> None: """ @@ -143,6 +212,18 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: In case of corrupt input arguments, None is returned. In case any of methods used return None, None is returned. 
""" + if not isinstance(corpus, tuple) or len(corpus) == 0: + return None + + decoded_corpus = [] + for ident in corpus: + if not isinstance(ident, int): + return None + if self.get_token(ident) is None: + return None + decoded_corpus.append(self.get_token(ident)) + + return tuple(decoded_corpus) def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional[str]: """ @@ -159,6 +240,21 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional In case of corrupt input arguments, None is returned """ + if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0: + return None + + resulting_text = "" + for token in decoded_corpus: + if decoded_corpus[0]: + resulting_text += token.upper() + elif token == self._end_of_word_token: + resulting_text += " " + else: + resulting_text += token + resulting_text.replace(resulting_text[-1], ".") + + return resulting_text + class NGramLanguageModel: diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index b9bcbd999..d51d8fb59 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -2,6 +2,8 @@ Generation by NGrams starter """ +import lab_3_generate_by_ngrams.main as main_py + def main() -> None: """ @@ -11,7 +13,10 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - result = None + text_processor = main_py.TextProcessor('_') + encoded_text = text_processor.encode(text) + decoded_text = text_processor.decode(encoded_text) + result = decoded_text assert result diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt index 573541ac9..b8626c4cf 100644 --- a/lab_3_generate_by_ngrams/target_score.txt +++ b/lab_3_generate_by_ngrams/target_score.txt @@ -1 +1 @@ -0 +4 From 79ff708844074417429ec0f747b7a7e04e01857c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sat, 18 Nov 2023 18:35:05 +0300 Subject: [PATCH 38/68] fixed tests --- lab_3_generate_by_ngrams/main.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index e909e7039..6e9cdadbe 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -23,7 +23,7 @@ def __init__(self, end_of_word_token: str) -> None: Args: end_of_word_token (str): A token denoting word boundary """ - self._end_of_word_token = end_of_word_token + self._end_of_word_token = end_of_word_token self._storage = {self._end_of_word_token: 0} def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: @@ -52,7 +52,7 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: tokenized_text.append(element) elif element.isspace() and tokenized_text[-1] != self._end_of_word_token: tokenized_text.append(self._end_of_word_token) - if not tokenized_text[-1].isalnum(): + if not text[-1].isalnum(): tokenized_text.append(self._end_of_word_token) if len(tokenized_text) == 0: @@ -99,7 +99,7 @@ def get_token(self, element_id: int) -> Optional[str]: In case of corrupt input arguments or arguments not included in storage, None is returned """ - if not isinstance(element_id, str) or element_id not in self._storage.values(): + if not isinstance(element_id, int) or element_id not in self._storage.values(): return None for token, ident in self._storage.items(): @@ -131,8 +131,6 @@ def encode(self, 
text: str) -> Optional[tuple[int, ...]]: for token in tokenized_text: self._put(token) - if self._put(token) is None: - return None encoded_corpus = [] for token in tokenized_text: @@ -244,14 +242,17 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional return None resulting_text = "" - for token in decoded_corpus: - if decoded_corpus[0]: + for index, token in enumerate(decoded_corpus): + if index == 0: resulting_text += token.upper() elif token == self._end_of_word_token: - resulting_text += " " + if index == len(decoded_corpus) - 1: + resulting_text += "." + else: + resulting_text += " " else: resulting_text += token - resulting_text.replace(resulting_text[-1], ".") + # resulting_text.replace(resulting_text[-1], ".") return resulting_text From 7908ca505032eb79d3c8ca7ad74aaf712f0cbbc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sat, 18 Nov 2023 21:24:21 +0300 Subject: [PATCH 39/68] code dor 6 --- lab_3_generate_by_ngrams/main.py | 71 +++++++++++++++++++++++ lab_3_generate_by_ngrams/start.py | 5 +- lab_3_generate_by_ngrams/target_score.txt | 2 +- 3 files changed, 76 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 6e9cdadbe..b7b5e99e6 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -276,6 +276,9 @@ def __init__(self, encoded_corpus: tuple | None, n_gram_size: int) -> None: encoded_corpus (tuple): Encoded text n_gram_size (int): A size of n-grams to use for language modelling """ + self._encoded_corpus = encoded_corpus + self._n_gram_size = n_gram_size + self._n_gram_frequencies = {} def get_n_gram_size(self) -> int: """ @@ -284,6 +287,7 @@ def get_n_gram_size(self) -> int: Returns: int: Size of stored n_grams """ + return self._n_gram_size def set_n_grams(self, frequencies: dict) -> None: """ @@ -305,6 +309,21 @@ def build(self) -> int: In case of corrupt input arguments or methods used return None, 1 is returned """ + if not isinstance(self._encoded_corpus, tuple) or len(self._encoded_corpus) == 0: + return 1 + + n_grams = self._extract_n_grams(self._encoded_corpus) + if not isinstance(n_grams, tuple) or n_grams is None: + return 1 + + for ngram in set(n_grams): + if not isinstance(ngram, tuple): + return 1 + p_w_1_2 = n_grams.count(ngram) + p_w_1 = len([context for context in n_grams if context[:-1] == ngram[:-1]]) + self._n_gram_frequencies[ngram] = p_w_1_2/p_w_1 + + return 0 def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: """ @@ -318,6 +337,17 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: In case of corrupt input arguments, None is returned """ + if not isinstance(sequence, tuple) or len(sequence) == 0 or len(sequence) < self._n_gram_size - 1: + return None + + possible_tokens = {} + + context = sequence[-(self._n_gram_size - 1)::] + for ngram in self._n_gram_frequencies: + if ngram[:self._n_gram_size - 1] == context: + possible_tokens[ngram[-1]] = self._n_gram_frequencies[ngram] + + return possible_tokens def _extract_n_grams( self, encoded_corpus: tuple[int, ...] 
@@ -333,6 +363,15 @@ def _extract_n_grams( In case of corrupt input arguments, None is returned """ + if not isinstance(encoded_corpus, tuple) or len(encoded_corpus) == 0: + return None + + n_grams = [] + for i in range(len(encoded_corpus) - self._n_gram_size + 1): + ngram = tuple(encoded_corpus[i: i + self._n_gram_size]) + n_grams.append(ngram) + + return tuple(n_grams) class GreedyTextGenerator: @@ -352,6 +391,8 @@ def __init__(self, language_model: NGramLanguageModel, text_processor: TextProce language_model (NGramLanguageModel): A language model to use for text generation text_processor (TextProcessor): A TextProcessor instance to handle text processing """ + self._model = language_model + self._text_processor = text_processor def run(self, seq_len: int, prompt: str) -> Optional[str]: """ @@ -367,6 +408,36 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ + if not isinstance(seq_len, int) or not isinstance(prompt, str) or len(prompt) == 0: + return None + + encoded_prompt = self._text_processor.encode(prompt) + if encoded_prompt is None: + return None + ngram_size = self._model.get_n_gram_size() + + text = prompt + + for i in range(seq_len): + tokens = self._model.generate_next_token(encoded_prompt[-ngram_size+1:]) + if tokens is None: + break + max_freq = max(tokens.values()) + max_candidates = [] + for candidate, freq in tokens.items(): + if freq == max_freq: + max_candidates.append(candidate) + encoded_prompt = encoded_prompt + (sorted(max_candidates)[0]) + best_candidate =self._text_processor.get_token(encoded_prompt[-1]) + if best_candidate is None: + return None + text += best_candidate + + decoded_prompt = self._text_processor.decode(encoded_prompt) + if decoded_prompt is None: + return None + + return decoded_prompt class BeamSearcher: diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index d51d8fb59..7c0f5dedd 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -16,7 +16,10 @@ def main() -> None: text_processor = main_py.TextProcessor('_') encoded_text = text_processor.encode(text) decoded_text = text_processor.decode(encoded_text) - result = decoded_text + language_model = main_py.NGramLanguageModel(encoded_text, 7) + greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) + generated_text = greedy_generator.run(51, 'Vernon') + result = generated_text assert result diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt index b8626c4cf..1e8b31496 100644 --- a/lab_3_generate_by_ngrams/target_score.txt +++ b/lab_3_generate_by_ngrams/target_score.txt @@ -1 +1 @@ -4 +6 From f47f868cbea42392c510e0920c915917b75ad5b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 19 Nov 2023 18:37:25 +0300 Subject: [PATCH 40/68] trying to fix and fix and fix....... --- lab_3_generate_by_ngrams/main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index b7b5e99e6..400b81136 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -421,6 +421,8 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: for i in range(seq_len): tokens = self._model.generate_next_token(encoded_prompt[-ngram_size+1:]) if tokens is None: + return prompt + "." 
+ if len(tokens) == 0: break max_freq = max(tokens.values()) max_candidates = [] @@ -428,12 +430,12 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: if freq == max_freq: max_candidates.append(candidate) encoded_prompt = encoded_prompt + (sorted(max_candidates)[0]) - best_candidate =self._text_processor.get_token(encoded_prompt[-1]) + best_candidate = self._text_processor.get_token(encoded_prompt[-1]) if best_candidate is None: return None text += best_candidate - decoded_prompt = self._text_processor.decode(encoded_prompt) + decoded_prompt = self._text_processor.decode(encoded_prompt) + "." if decoded_prompt is None: return None From aded7e1926f93da605d0e89e3b1ed786b108a383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 21 Nov 2023 22:53:25 +0300 Subject: [PATCH 41/68] fixed comments and tests (except filter im trying to deal with it.....) --- lab_3_generate_by_ngrams/main.py | 26 ++++++++++++-------------- lab_3_generate_by_ngrams/start.py | 14 ++++++++------ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 400b81136..8d6a22f2b 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -136,8 +136,7 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: for token in tokenized_text: if self.get_id(token) is None: return None - else: - encoded_corpus.append(self.get_id(token)) + encoded_corpus.append(self.get_id(token)) return tuple(encoded_corpus) @@ -151,11 +150,10 @@ def _put(self, element: str) -> None: In case of corrupt input arguments or invalid argument length, an element is not added to storage """ - if not isinstance(element, str) or len(element) != 1: + if not isinstance(element, str) or len(element) != 1 or element in self._storage: return None - if element not in self._storage: - self._storage[element] = len(self._storage) + self._storage[element] = len(self._storage) return None @@ -217,9 +215,10 @@ def _decode(self, corpus: tuple[int, ...]) -> Optional[tuple[str, ...]]: for ident in corpus: if not isinstance(ident, int): return None - if self.get_token(ident) is None: + new_token = self.get_token(ident) + if new_token is None: return None - decoded_corpus.append(self.get_token(ident)) + decoded_corpus.append(new_token) return tuple(decoded_corpus) @@ -247,12 +246,11 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional resulting_text += token.upper() elif token == self._end_of_word_token: if index == len(decoded_corpus) - 1: - resulting_text += "." + resulting_text = f"{resulting_text}." else: - resulting_text += " " + resulting_text = f"{resulting_text} " else: resulting_text += token - # resulting_text.replace(resulting_text[-1], ".") return resulting_text @@ -320,7 +318,7 @@ def build(self) -> int: if not isinstance(ngram, tuple): return 1 p_w_1_2 = n_grams.count(ngram) - p_w_1 = len([context for context in n_grams if context[:-1] == ngram[:-1]]) + p_w_1 = [context[:-1] for context in n_grams].count(ngram[:-1]) self._n_gram_frequencies[ngram] = p_w_1_2/p_w_1 return 0 @@ -421,7 +419,7 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: for i in range(seq_len): tokens = self._model.generate_next_token(encoded_prompt[-ngram_size+1:]) if tokens is None: - return prompt + "." + return f"{prompt}." 
if len(tokens) == 0: break max_freq = max(tokens.values()) @@ -429,13 +427,13 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: for candidate, freq in tokens.items(): if freq == max_freq: max_candidates.append(candidate) - encoded_prompt = encoded_prompt + (sorted(max_candidates)[0]) + encoded_prompt = encoded_prompt + (sorted(max_candidates)[0],) best_candidate = self._text_processor.get_token(encoded_prompt[-1]) if best_candidate is None: return None text += best_candidate - decoded_prompt = self._text_processor.decode(encoded_prompt) + "." + decoded_prompt = f"{self._text_processor.decode(encoded_prompt)}." if decoded_prompt is None: return None diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index 7c0f5dedd..f7498dc7f 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -13,12 +13,14 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - text_processor = main_py.TextProcessor('_') - encoded_text = text_processor.encode(text) - decoded_text = text_processor.decode(encoded_text) - language_model = main_py.NGramLanguageModel(encoded_text, 7) - greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) - generated_text = greedy_generator.run(51, 'Vernon') + text_processor = main_py.TextProcessor('_') + encoded_text = text_processor.encode(text) + decoded_text = text_processor.decode(encoded_text) + + language_model = main_py.NGramLanguageModel(encoded_text, 7) + greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) + generated_text = greedy_generator.run(51, 'Vernon') + result = generated_text assert result From 455e3fdba2adcd047d7a8ff6a4d5c6b03fd172ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 21 Nov 2023 23:53:20 +0300 Subject: [PATCH 42/68] filter (finally!!) (im not sure but..) --- lab_3_generate_by_ngrams/main.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 8d6a22f2b..138e3886d 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -102,9 +102,11 @@ def get_token(self, element_id: int) -> Optional[str]: if not isinstance(element_id, int) or element_id not in self._storage.values(): return None - for token, ident in self._storage.items(): - if element_id == ident: - return token + items = list(filter(lambda x: x[1] == element_id, self._storage.items())) + return items[0][0] + # for token, ident in self._storage.items(): + # if element_id == ident: + # return token def encode(self, text: str) -> Optional[tuple[int, ...]]: """ @@ -457,6 +459,8 @@ def __init__(self, beam_width: int, language_model: NGramLanguageModel) -> None: beam_width (int): Number of candidates to consider at each step language_model (NGramLanguageModel): A language model to use for next token prediction """ + self._beam_width = beam_width + self._model = language_model def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, float]]]: """ @@ -477,6 +481,16 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, In case of corrupt input arguments or methods used return None. 
""" + if not isinstance(sequence, tuple) or len(sequence) == 0: + return None + + generated_dict = self._model.generate_next_token(sequence) + if generated_dict is None: + return None + if not generated_dict: + return [] + + def continue_sequence( self, From ca0b2789cd0018193a175ea3f2715a8e42e42c53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Tue, 21 Nov 2023 23:53:31 +0300 Subject: [PATCH 43/68] filter (finally!!) (im not sure but..) --- lab_3_generate_by_ngrams/main.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 138e3886d..164f9d776 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -104,9 +104,6 @@ def get_token(self, element_id: int) -> Optional[str]: items = list(filter(lambda x: x[1] == element_id, self._storage.items())) return items[0][0] - # for token, ident in self._storage.items(): - # if element_id == ident: - # return token def encode(self, text: str) -> Optional[tuple[int, ...]]: """ From 36ea7e86ed39bd12f5ac804b72f8a768958dadf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 18:47:00 +0300 Subject: [PATCH 44/68] code for 8 --- lab_3_generate_by_ngrams/main.py | 68 +++++++++++++++++++++++ lab_3_generate_by_ngrams/start.py | 5 +- lab_3_generate_by_ngrams/target_score.txt | 2 +- 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 164f9d776..b2b918038 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -5,6 +5,7 @@ """ # pylint:disable=too-few-public-methods from typing import Optional +import math class TextProcessor: @@ -487,6 +488,8 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, if not generated_dict: return [] + return sorted([(token, freq) for token, freq in generated_dict.items()], + key=lambda pair: pair[1], reverse=True)[:self._beam_width] def continue_sequence( @@ -510,6 +513,19 @@ def continue_sequence( In case of corrupt input arguments or unexpected behaviour of methods used return None. """ + if (not isinstance(sequence,tuple) or len(sequence) == 0 or + not isinstance(next_tokens, list) or len(next_tokens) == 0 or + not isinstance(sequence_candidates, dict) or not sequence_candidates or + len(next_tokens) >= self._beam_width or + sequence not in sequence_candidates): + return None + + result_dict_cand = sequence_candidates.copy() + for token, freq in next_tokens: + result_dict_cand[sequence + (token,)] = freq - math.log(freq) + + return result_dict_cand + def prune_sequence_candidates( self, sequence_candidates: dict[tuple[int, ...], float] @@ -525,6 +541,13 @@ def prune_sequence_candidates( In case of corrupt input arguments return None. 
""" + if not isinstance(sequence_candidates, dict) or not sequence_candidates: + return None + + sorted_sequences = sorted(sequence_candidates.items(), key=lambda item: item[1], reverse=True) + + return dict(sorted_sequences[:self._beam_width]) + class BeamSearchTextGenerator: @@ -552,6 +575,10 @@ def __init__( text_processor (TextProcessor): A TextProcessor instance to handle text processing beam_width (int): Beam width parameter for generation """ + self._language_model = language_model + self._text_processor = text_processor + self._beam_width = beam_width + self._beam_searchers = BeamSearcher(self._beam_width, self._language_model) def run(self, prompt: str, seq_len: int) -> Optional[str]: """ @@ -567,6 +594,39 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ + if (not isinstance(prompt, str) or len(prompt) == 0 or + not isinstance(seq_len, int) or seq_len < 0): + return None + + encoded_prompt = self._text_processor.encode(prompt) + if encoded_prompt is None: + return None + candidates = {encoded_prompt: 0.0} + + for i in range(seq_len): + new_candidates = candidates.copy() + for sequence in candidates: + next_tokens = self._get_next_token(sequence) + if next_tokens is None: + return None + + continued_sentence = ( + self._beam_searchers.continue_sequence(sequence, next_tokens, new_candidates) + ) + if continued_sentence is None: + break + + best_sequence = self._beam_searchers.prune_sequence_candidates(new_candidates) + if best_sequence is None: + return None + candidates = best_sequence + + decoded_result = self._text_processor.decode(sorted(tuple(candidates), key=lambda item: item[1])[0]) + + return decoded_result + + + def _get_next_token( self, sequence_to_continue: tuple[int, ...] @@ -583,6 +643,14 @@ def _get_next_token( In case of corrupt input arguments return None. 
""" + if not isinstance(sequence_to_continue, tuple) or len(sequence_to_continue) == 0: + return None + + next_token = self._beam_searchers.get_next_token(sequence_to_continue) + if next_token is None: + return None + + return next_token class NGramLanguageModelReader: diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index f7498dc7f..f9973a58f 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -21,7 +21,10 @@ def main() -> None: greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) generated_text = greedy_generator.run(51, 'Vernon') - result = generated_text + beam_search_generator = main_py.BeamSearchTextGenerator(language_model, text_processor, 7) + resulted_text = beam_search_generator.run('Vernon', 56) + + result = resulted_text assert result diff --git a/lab_3_generate_by_ngrams/target_score.txt b/lab_3_generate_by_ngrams/target_score.txt index 1e8b31496..45a4fb75d 100644 --- a/lab_3_generate_by_ngrams/target_score.txt +++ b/lab_3_generate_by_ngrams/target_score.txt @@ -1 +1 @@ -6 +8 From c8f555386eaf65b9d5473f26be6e3d0ab191004b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 18:58:15 +0300 Subject: [PATCH 45/68] code style fixing --- lab_3_generate_by_ngrams/main.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index b2b918038..15103e705 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -319,7 +319,7 @@ def build(self) -> int: return 1 p_w_1_2 = n_grams.count(ngram) p_w_1 = [context[:-1] for context in n_grams].count(ngram[:-1]) - self._n_gram_frequencies[ngram] = p_w_1_2/p_w_1 + self._n_gram_frequencies[ngram] = p_w_1_2 / p_w_1 return 0 @@ -335,15 +335,16 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: In case of corrupt input arguments, None is returned """ - if not isinstance(sequence, tuple) or len(sequence) == 0 or len(sequence) < self._n_gram_size - 1: + if (not isinstance(sequence, tuple) or len(sequence) == 0 + or len(sequence) < self._n_gram_size - 1): return None possible_tokens = {} context = sequence[-(self._n_gram_size - 1)::] - for ngram in self._n_gram_frequencies: + for ngram, freq in self._n_gram_frequencies.items(): if ngram[:self._n_gram_size - 1] == context: - possible_tokens[ngram[-1]] = self._n_gram_frequencies[ngram] + possible_tokens[ngram[-1]] = freq return possible_tokens @@ -417,7 +418,7 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: text = prompt for i in range(seq_len): - tokens = self._model.generate_next_token(encoded_prompt[-ngram_size+1:]) + tokens = self._model.generate_next_token(encoded_prompt[-ngram_size + 1:]) if tokens is None: return f"{prompt}." if len(tokens) == 0: @@ -488,7 +489,7 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, if not generated_dict: return [] - return sorted([(token, freq) for token, freq in generated_dict.items()], + return sorted(list((token, freq) for token, freq in generated_dict.items()), key=lambda pair: pair[1], reverse=True)[:self._beam_width] @@ -513,11 +514,12 @@ def continue_sequence( In case of corrupt input arguments or unexpected behaviour of methods used return None. 
""" - if (not isinstance(sequence,tuple) or len(sequence) == 0 or + if (not isinstance(sequence, tuple) or len(sequence) == 0 or not isinstance(next_tokens, list) or len(next_tokens) == 0 or - not isinstance(sequence_candidates, dict) or not sequence_candidates or - len(next_tokens) >= self._beam_width or - sequence not in sequence_candidates): + not isinstance(sequence_candidates, dict) or not sequence_candidates): + return None + if (len(next_tokens) >= self._beam_width or + sequence not in sequence_candidates): return None result_dict_cand = sequence_candidates.copy() @@ -544,7 +546,8 @@ def prune_sequence_candidates( if not isinstance(sequence_candidates, dict) or not sequence_candidates: return None - sorted_sequences = sorted(sequence_candidates.items(), key=lambda item: item[1], reverse=True) + sorted_sequences = sorted(sequence_candidates.items(), + key=lambda item: item[1], reverse=True) return dict(sorted_sequences[:self._beam_width]) @@ -621,7 +624,8 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: return None candidates = best_sequence - decoded_result = self._text_processor.decode(sorted(tuple(candidates), key=lambda item: item[1])[0]) + decoded_result = self._text_processor.decode(sorted(tuple(candidates), + key=lambda item: item[1])[0]) return decoded_result From 8e61901253a3ce562f4954b2e28f66c9b38a22ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 19:03:38 +0300 Subject: [PATCH 46/68] code style and import fixing --- lab_3_generate_by_ngrams/main.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 15103e705..452471597 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -5,6 +5,7 @@ """ # pylint:disable=too-few-public-methods from typing import Optional + import math @@ -515,8 +516,10 @@ def continue_sequence( In case of corrupt input arguments or unexpected behaviour of methods used return None. 
""" if (not isinstance(sequence, tuple) or len(sequence) == 0 or - not isinstance(next_tokens, list) or len(next_tokens) == 0 or - not isinstance(sequence_candidates, dict) or not sequence_candidates): + not isinstance(next_tokens, list)): + return None + if (len(next_tokens) == 0 or not isinstance(sequence_candidates, dict) or + not sequence_candidates): return None if (len(next_tokens) >= self._beam_width or sequence not in sequence_candidates): From a4fd670ef1832dc684ae48f3f56453e838cc7ddc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 19:06:31 +0300 Subject: [PATCH 47/68] import style fixing --- lab_3_generate_by_ngrams/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 452471597..43786dd4a 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -4,9 +4,8 @@ Beam-search and natural language generation evaluation """ # pylint:disable=too-few-public-methods -from typing import Optional - import math +from typing import Optional class TextProcessor: From 79520f1ce28580125fa89ca3d18d60f91e3ca7cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 22:48:40 +0300 Subject: [PATCH 48/68] mypy fixing --- lab_3_generate_by_ngrams/main.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 43786dd4a..da0db7d5c 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -520,15 +520,16 @@ def continue_sequence( if (len(next_tokens) == 0 or not isinstance(sequence_candidates, dict) or not sequence_candidates): return None - if (len(next_tokens) >= self._beam_width or + if (len(next_tokens) > self._beam_width or sequence not in sequence_candidates): return None - result_dict_cand = sequence_candidates.copy() - for token, freq in next_tokens: - result_dict_cand[sequence + (token,)] = freq - math.log(freq) + for (token, freq) in next_tokens: + sequence_candidates[sequence + (token,)] = \ + sequence_candidates[sequence] - math.log(freq) + sequence_candidates.pop(sequence) - return result_dict_cand + return sequence_candidates def prune_sequence_candidates( @@ -585,6 +586,8 @@ def __init__( self._beam_width = beam_width self._beam_searchers = BeamSearcher(self._beam_width, self._language_model) + return None + def run(self, prompt: str, seq_len: int) -> Optional[str]: """ Generate sequence based on NGram language model and prompt provided. 
@@ -600,7 +603,7 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: None is returned """ if (not isinstance(prompt, str) or len(prompt) == 0 or - not isinstance(seq_len, int) or seq_len < 0): + not isinstance(seq_len, int) or seq_len < 0): return None encoded_prompt = self._text_processor.encode(prompt) @@ -609,7 +612,7 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: candidates = {encoded_prompt: 0.0} for i in range(seq_len): - new_candidates = candidates.copy() + new_candidates = dict(candidates) for sequence in candidates: next_tokens = self._get_next_token(sequence) if next_tokens is None: From 6fe7fdcd1474369e410037c769aa8132335ccb71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 22:52:20 +0300 Subject: [PATCH 49/68] i dont understand.... --- lab_3_generate_by_ngrams/main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index da0db7d5c..41159d086 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -586,8 +586,6 @@ def __init__( self._beam_width = beam_width self._beam_searchers = BeamSearcher(self._beam_width, self._language_model) - return None - def run(self, prompt: str, seq_len: int) -> Optional[str]: """ Generate sequence based on NGram language model and prompt provided. From 950d83b000a9d38220a72f5c929e611aedf12040 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 23:07:43 +0300 Subject: [PATCH 50/68] mypy fixing --- lab_3_generate_by_ngrams/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 41159d086..53619e38b 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -134,9 +134,10 @@ def encode(self, text: str) -> Optional[tuple[int, ...]]: encoded_corpus = [] for token in tokenized_text: - if self.get_id(token) is None: + ident = self.get_id(token) + if ident is None: return None - encoded_corpus.append(self.get_id(token)) + encoded_corpus.append(ident) return tuple(encoded_corpus) From 51d425fa62712d9c71199471a792dac2ad4bef70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 23:13:58 +0300 Subject: [PATCH 51/68] mypy fixing --- lab_3_generate_by_ngrams/main.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 53619e38b..27ea6ddd8 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -490,8 +490,13 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, if not generated_dict: return [] - return sorted(list((token, freq) for token, freq in generated_dict.items()), - key=lambda pair: pair[1], reverse=True)[:self._beam_width] + list_of_token_pairs = [] + for token, freq in generated_dict.items(): + token_pair = (token, float(freq)) + list_of_token_pairs.append(token_pair) + best = sorted(list_of_token_pairs, key=lambda x: x[1], reverse=True)[:self._beam_width] + + return best def continue_sequence( From a114cdc50680ba8fd9e937dbe82266ab639f2d5f Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Wed, 22 Nov 2023 23:20:09 +0300 Subject: [PATCH 52/68] mypy fixing --- lab_3_generate_by_ngrams/start.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index f9973a58f..d4c1b7945 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -15,17 +15,18 @@ def main() -> None: text = text_file.read() text_processor = main_py.TextProcessor('_') encoded_text = text_processor.encode(text) - decoded_text = text_processor.decode(encoded_text) + if isinstance(encoded_text, tuple) and encoded_text is not None: + decoded_text = text_processor.decode(encoded_text) - language_model = main_py.NGramLanguageModel(encoded_text, 7) - greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) - generated_text = greedy_generator.run(51, 'Vernon') + language_model = main_py.NGramLanguageModel(encoded_text, 7) + greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) + generated_text = greedy_generator.run(51, 'Vernon') - beam_search_generator = main_py.BeamSearchTextGenerator(language_model, text_processor, 7) - resulted_text = beam_search_generator.run('Vernon', 56) + beam_search_generator = main_py.BeamSearchTextGenerator(language_model, text_processor, 7) + resulted_text = beam_search_generator.run('Vernon', 56) - result = resulted_text - assert result + result = resulted_text + assert result if __name__ == "__main__": From 6948e044d254c4c6fe6f49face858ed1ce7c05cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 23 Nov 2023 17:07:46 +0300 Subject: [PATCH 53/68] mypy fixing --- lab_3_generate_by_ngrams/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 27ea6ddd8..f7542eb78 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -626,7 +626,8 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: self._beam_searchers.continue_sequence(sequence, next_tokens, new_candidates) ) if continued_sentence is None: - break + return self._text_processor.decode(sorted(tuple(candidates), + key=lambda pair: pair[1])[0]) best_sequence = self._beam_searchers.prune_sequence_candidates(new_candidates) if best_sequence is None: From 908cacbe8e922776588d8b2ba3fdcef617286b98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 23 Nov 2023 23:09:21 +0300 Subject: [PATCH 54/68] fixing all --- lab_3_generate_by_ngrams/main.py | 76 ++++++++++++------------------- lab_3_generate_by_ngrams/start.py | 3 ++ 2 files changed, 33 insertions(+), 46 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index f7542eb78..d9fcf6dbb 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -56,7 +56,7 @@ def _tokenize(self, text: str) -> Optional[tuple[str, ...]]: if not text[-1].isalnum(): tokenized_text.append(self._end_of_word_token) - if len(tokenized_text) == 0: + if not tokenized_text: return None return tuple(tokenized_text) @@ -158,7 +158,6 @@ def _put(self, element: str) -> None: return None - def decode(self, encoded_corpus: tuple[int, ...]) -> 
Optional[str]: """ Decode and postprocess encoded corpus by converting integer identifiers to string. @@ -241,22 +240,13 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> Optional if not isinstance(decoded_corpus, tuple) or len(decoded_corpus) == 0: return None - resulting_text = "" - for index, token in enumerate(decoded_corpus): - if index == 0: - resulting_text += token.upper() - elif token == self._end_of_word_token: - if index == len(decoded_corpus) - 1: - resulting_text = f"{resulting_text}." - else: - resulting_text = f"{resulting_text} " - else: - resulting_text += token + text_string = ''.join(list(decoded_corpus)) + resulting_text = text_string.replace(self._end_of_word_token, ' ') + resulting_text = f"{resulting_text.capitalize().strip()}." return resulting_text - class NGramLanguageModel: """ Store language model by n_grams, predict the next token. @@ -336,15 +326,15 @@ def generate_next_token(self, sequence: tuple[int, ...]) -> Optional[dict]: In case of corrupt input arguments, None is returned """ - if (not isinstance(sequence, tuple) or len(sequence) == 0 - or len(sequence) < self._n_gram_size - 1): + if not (isinstance(sequence, tuple) and sequence + and len(sequence) >= self._n_gram_size - 1): return None possible_tokens = {} - context = sequence[-(self._n_gram_size - 1)::] + context = sequence[-(self._n_gram_size - 1):] for ngram, freq in self._n_gram_frequencies.items(): - if ngram[:self._n_gram_size - 1] == context: + if ngram[:- 1] == context: possible_tokens[ngram[-1]] = freq return possible_tokens @@ -435,7 +425,7 @@ def run(self, seq_len: int, prompt: str) -> Optional[str]: return None text += best_candidate - decoded_prompt = f"{self._text_processor.decode(encoded_prompt)}." + decoded_prompt = f"{self._text_processor.decode(encoded_prompt)}" if decoded_prompt is None: return None @@ -498,7 +488,6 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, return best - def continue_sequence( self, sequence: tuple[int, ...], @@ -537,7 +526,6 @@ def continue_sequence( return sequence_candidates - def prune_sequence_candidates( self, sequence_candidates: dict[tuple[int, ...], float] ) -> Optional[dict[tuple[int, ...], float]]: @@ -555,13 +543,12 @@ def prune_sequence_candidates( if not isinstance(sequence_candidates, dict) or not sequence_candidates: return None - sorted_sequences = sorted(sequence_candidates.items(), - key=lambda item: item[1], reverse=True) + sorted_sequences = sorted(list(sequence_candidates.items()), + key=lambda item: item[1]) return dict(sorted_sequences[:self._beam_width]) - class BeamSearchTextGenerator: """ Class for text generation with BeamSearch. 
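The ascending sort in prune_sequence_candidates above is the pruning step of beam search: after every expansion, the candidate pool is trimmed back to the beam_width best sequences, which keeps the search tractable while still tracking several continuations in parallel. A minimal sketch of the idea, separate from the lab classes and assuming smaller-is-better scores (accumulated negative log-probabilities are the usual choice, and dropping reverse=True in the hunk above is consistent with that convention); the name prune is illustrative, not the lab's API:

    def prune(candidates: dict[tuple[int, ...], float],
              beam_width: int) -> dict[tuple[int, ...], float]:
        # Keep the beam_width lowest-cost sequences;
        # cost is assumed to be accumulated -log(probability).
        best = sorted(candidates.items(), key=lambda item: item[1])
        return dict(best[:beam_width])

    pool = {(1, 2): 0.5, (1, 3): 2.0, (1, 4): 1.1}
    assert prune(pool, 2) == {(1, 2): 0.5, (1, 4): 1.1}
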
@@ -590,7 +577,7 @@ def __init__( self._language_model = language_model self._text_processor = text_processor self._beam_width = beam_width - self._beam_searchers = BeamSearcher(self._beam_width, self._language_model) + self.beam_searcher = BeamSearcher(self._beam_width, self._language_model) def run(self, prompt: str, seq_len: int) -> Optional[str]: """ @@ -606,12 +593,12 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: In case of corrupt input arguments or methods used return None, None is returned """ - if (not isinstance(prompt, str) or len(prompt) == 0 or - not isinstance(seq_len, int) or seq_len < 0): + if not (isinstance(prompt, str) and prompt + and isinstance(seq_len, int) and seq_len): return None encoded_prompt = self._text_processor.encode(prompt) - if encoded_prompt is None: + if not encoded_prompt: return None candidates = {encoded_prompt: 0.0} @@ -619,28 +606,24 @@ def run(self, prompt: str, seq_len: int) -> Optional[str]: new_candidates = dict(candidates) for sequence in candidates: next_tokens = self._get_next_token(sequence) - if next_tokens is None: + if not next_tokens: return None - continued_sentence = ( - self._beam_searchers.continue_sequence(sequence, next_tokens, new_candidates) + continued_sequence = ( + self.beam_searcher.continue_sequence(sequence, next_tokens, new_candidates) ) - if continued_sentence is None: - return self._text_processor.decode(sorted(tuple(candidates), - key=lambda pair: pair[1])[0]) - - best_sequence = self._beam_searchers.prune_sequence_candidates(new_candidates) - if best_sequence is None: - return None - candidates = best_sequence - - decoded_result = self._text_processor.decode(sorted(tuple(candidates), - key=lambda item: item[1])[0]) - - return decoded_result + if not continued_sequence: + break + best_sequence = self.beam_searcher.prune_sequence_candidates(new_candidates) + if best_sequence is None: + return None + candidates = best_sequence + decoded_result = self._text_processor.decode(sorted(tuple(candidates), + key=lambda item: item[1])[0]) + return decoded_result def _get_next_token( self, sequence_to_continue: tuple[int, ...] @@ -657,10 +640,11 @@ def _get_next_token( In case of corrupt input arguments return None. 
""" - if not isinstance(sequence_to_continue, tuple) or len(sequence_to_continue) == 0: + if not(isinstance(sequence_to_continue, tuple) + and sequence_to_continue): return None - next_token = self._beam_searchers.get_next_token(sequence_to_continue) + next_token = self.beam_searcher.get_next_token(sequence_to_continue) if next_token is None: return None diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index d4c1b7945..54f944a3a 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -17,13 +17,16 @@ def main() -> None: encoded_text = text_processor.encode(text) if isinstance(encoded_text, tuple) and encoded_text is not None: decoded_text = text_processor.decode(encoded_text) + print(decoded_text) language_model = main_py.NGramLanguageModel(encoded_text, 7) greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) generated_text = greedy_generator.run(51, 'Vernon') + print(generated_text) beam_search_generator = main_py.BeamSearchTextGenerator(language_model, text_processor, 7) resulted_text = beam_search_generator.run('Vernon', 56) + print(resulted_text) result = resulted_text assert result From 4be43b7423fa9cbf361b6c887332e0cb75b77b25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Thu, 23 Nov 2023 23:38:39 +0300 Subject: [PATCH 55/68] fixing start --- lab_3_generate_by_ngrams/main.py | 21 +++++++++------------ lab_3_generate_by_ngrams/start.py | 14 +++++++++----- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index d9fcf6dbb..4e58402b2 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -306,10 +306,9 @@ def build(self) -> int: return 1 for ngram in set(n_grams): - if not isinstance(ngram, tuple): - return 1 p_w_1_2 = n_grams.count(ngram) - p_w_1 = [context[:-1] for context in n_grams].count(ngram[:-1]) + p_w_1 = len([context for context in n_grams + if context[:-1] == ngram[:-1]]) self._n_gram_frequencies[ngram] = p_w_1_2 / p_w_1 return 0 @@ -471,7 +470,7 @@ def get_next_token(self, sequence: tuple[int, ...]) -> Optional[list[tuple[int, In case of corrupt input arguments or methods used return None. """ - if not isinstance(sequence, tuple) or len(sequence) == 0: + if not (isinstance(sequence, tuple) and sequence): return None generated_dict = self._model.generate_next_token(sequence) @@ -509,14 +508,12 @@ def continue_sequence( In case of corrupt input arguments or unexpected behaviour of methods used return None. 
""" - if (not isinstance(sequence, tuple) or len(sequence) == 0 or - not isinstance(next_tokens, list)): - return None - if (len(next_tokens) == 0 or not isinstance(sequence_candidates, dict) or - not sequence_candidates): - return None - if (len(next_tokens) > self._beam_width or - sequence not in sequence_candidates): + if not (isinstance(sequence, tuple) and sequence + and isinstance(next_tokens, list) and next_tokens + and isinstance(sequence_candidates, dict) + and sequence_candidates + and len(next_tokens) <= self._beam_width + and sequence in sequence_candidates): return None for (token, freq) in next_tokens: diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index 54f944a3a..4bb5684c9 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -15,20 +15,24 @@ def main() -> None: text = text_file.read() text_processor = main_py.TextProcessor('_') encoded_text = text_processor.encode(text) - if isinstance(encoded_text, tuple) and encoded_text is not None: + if isinstance(encoded_text, tuple) and encoded_text: decoded_text = text_processor.decode(encoded_text) print(decoded_text) - language_model = main_py.NGramLanguageModel(encoded_text, 7) - greedy_generator = main_py.GreedyTextGenerator(language_model, text_processor) + language_model = main_py.NGramLanguageModel(encoded_text[:100], 3) + n_grams = language_model.build() + print(n_grams) + + lang_model2 = main_py.NGramLanguageModel(encoded_text, 7) + greedy_generator = main_py.GreedyTextGenerator(lang_model2, text_processor) generated_text = greedy_generator.run(51, 'Vernon') print(generated_text) - beam_search_generator = main_py.BeamSearchTextGenerator(language_model, text_processor, 7) + beam_search_generator = main_py.BeamSearchTextGenerator(lang_model2, text_processor, 7) resulted_text = beam_search_generator.run('Vernon', 56) print(resulted_text) - result = resulted_text + result = decoded_text assert result From d1bc24d6696a2afea91d079e57fe5e6b8b8d1f47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 24 Nov 2023 11:44:49 +0300 Subject: [PATCH 56/68] fixing return --- lab_3_generate_by_ngrams/main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 4e58402b2..71f2fc76b 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -156,8 +156,6 @@ def _put(self, element: str) -> None: self._storage[element] = len(self._storage) - return None - def decode(self, encoded_corpus: tuple[int, ...]) -> Optional[str]: """ Decode and postprocess encoded corpus by converting integer identifiers to string. 
From 43e0cea47ffb5c28e2ed263cd1b3d798f6d47d58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Fri, 24 Nov 2023 11:49:00 +0300 Subject: [PATCH 57/68] fixing return --- lab_3_generate_by_ngrams/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_3_generate_by_ngrams/main.py b/lab_3_generate_by_ngrams/main.py index 71f2fc76b..501cd6d3e 100644 --- a/lab_3_generate_by_ngrams/main.py +++ b/lab_3_generate_by_ngrams/main.py @@ -152,7 +152,7 @@ def _put(self, element: str) -> None: an element is not added to storage """ if not isinstance(element, str) or len(element) != 1 or element in self._storage: - return None + return self._storage[element] = len(self._storage) From 26c5222e1308fe87d7fc918e05d3a04058ce178b Mon Sep 17 00:00:00 2001 From: artyomtugaryov Date: Fri, 24 Nov 2023 18:37:37 +0300 Subject: [PATCH 58/68] checkout labs from the origin repository --- lab_3_generate_by_ngrams/start.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lab_3_generate_by_ngrams/start.py b/lab_3_generate_by_ngrams/start.py index b7612718d..a4ec25e0f 100644 --- a/lab_3_generate_by_ngrams/start.py +++ b/lab_3_generate_by_ngrams/start.py @@ -4,8 +4,6 @@ from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) -import lab_3_generate_by_ngrams.main as main_py - def main() -> None: """ From 2c8daca46a97ffd3cfb37b473878f5b534912bdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 21:42:04 +0300 Subject: [PATCH 59/68] code for 6 --- lab_4_fill_words_by_ngrams/main.py | 65 +++++++++++++++++++++ lab_4_fill_words_by_ngrams/start.py | 13 ++++- lab_4_fill_words_by_ngrams/target_score.txt | 2 +- 3 files changed, 78 insertions(+), 2 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index b739ae182..2873b8731 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -6,6 +6,7 @@ # pylint:disable=too-few-public-methods, too-many-arguments from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) +import random class WordProcessor(TextProcessor): @@ -28,6 +29,17 @@ def _tokenize(self, text: str) -> tuple[str, ...]: # type: ignore Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ + if not isinstance(text, str) or not text: + raise ValueError('Type input is inappropriate or input argument is empty.') + + preprocessed_text = "" + for element in text.lower(): + if element in "?!.": + preprocessed_text += f" {self.get_end_of_word_token()}" + elif element.isalpha() or element.isspace(): + preprocessed_text += element + + return tuple(preprocessed_text.split(" ")) def _put(self, element: str) -> None: """ @@ -39,6 +51,11 @@ def _put(self, element: str) -> None: Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. 
""" + if not isinstance(element, str) or not element: + raise ValueError('Type input is inappropriate or input argument is empty.') + + if element not in self._storage: + self._storage[element] = len(self._storage) def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # type: ignore """ @@ -56,6 +73,16 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # Raises: ValueError: In case of inappropriate type input argument or if input argument is empty. """ + if not isinstance(decoded_corpus, tuple) or not decoded_corpus: + raise ValueError('Type input is inappropriate or input argument is empty.') + + words = "".join(decoded_corpus) + sentences = words.split(self._end_of_word_token) + resulted_text = ". ".join([sentence.strip().capitalize() for sentence in sentences]) + + if resulted_text[-1] == ' ': + return resulted_text[:-1] + return f"{resulted_text}." class TopPGenerator: @@ -80,6 +107,9 @@ def __init__( word_processor (WordProcessor): WordProcessor instance to handle text processing p_value (float): Collective probability mass threshold """ + self._model = language_model + self._word_processor = word_processor + self._p_value = p_value def run(self, seq_len: int, prompt: str) -> str: # type: ignore """ @@ -98,6 +128,41 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore or if sequence has inappropriate length, or if methods used return None. """ + if not (isinstance(seq_len, int) and isinstance(prompt, str) and + seq_len > 0 and prompt): + raise ValueError('Type input is inappropriate or input argument is empty.') + + encoded_prompt = self._word_processor.encode(prompt) + if encoded_prompt is None: + raise ValueError('None is returned') + + encoded_list = list(encoded_prompt) + for i in range(seq_len): + candidates = self._model.generate_next_token(encoded_prompt) + if candidates is None: + raise ValueError('None is returned.') + if not candidates: + break + sorted_candidates = sorted(list(candidates.items()), + key=lambda pair: pair[1], reverse=True) + sum_freq = 0 + num_candidates = 0 + for candidate in sorted_candidates: + if sum_freq >= self._p_value: + break + sum_freq += candidate[1] + num_candidates += 1 + + random_token = random.choice(sorted_candidates[:num_candidates])[0] + encoded_list.append(random_token) + encoded_prompt = tuple(encoded_list) + + decoded = self._word_processor.decode(encoded_prompt) + if decoded is None: + raise ValueError('None is returned') + + return decoded + class GeneratorTypes: diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index c41386377..811ebcf84 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,6 +2,9 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import +from lab_4_fill_words_by_ngrams.main import (GeneratorTypes, BeamSearchTextGenerator, + NGramLanguageModel, TopPGenerator, + QualityChecker, WordProcessor) def main() -> None: @@ -10,7 +13,15 @@ def main() -> None: """ with open("./assets/Harry_Potter.txt", "r", encoding="utf-8") as text_file: text = text_file.read() - result = None + word_processor = WordProcessor("") + encoded_text = word_processor.encode(text) + lang_model = NGramLanguageModel(encoded_text, 2) + lang_model.build() + + top_p_generator = TopPGenerator(lang_model, word_processor, 0.5) + result = top_p_generator.run(51, "Vernon") + print(result) + assert result diff --git a/lab_4_fill_words_by_ngrams/target_score.txt 
b/lab_4_fill_words_by_ngrams/target_score.txt index 573541ac9..1e8b31496 100644 --- a/lab_4_fill_words_by_ngrams/target_score.txt +++ b/lab_4_fill_words_by_ngrams/target_score.txt @@ -1 +1 @@ -0 +6 From 7b1f90d40811eb82fb749ef498d7129fc7749a07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 21:45:35 +0300 Subject: [PATCH 60/68] import style fixing --- lab_4_fill_words_by_ngrams/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index 2873b8731..a2d010b2a 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -6,7 +6,7 @@ # pylint:disable=too-few-public-methods, too-many-arguments from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) -import random +from random import choice class WordProcessor(TextProcessor): @@ -153,7 +153,7 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore sum_freq += candidate[1] num_candidates += 1 - random_token = random.choice(sorted_candidates[:num_candidates])[0] + random_token = choice(sorted_candidates[:num_candidates])[0] encoded_list.append(random_token) encoded_prompt = tuple(encoded_list) From 342556df79ad39d4c92a02bcd99eaedca3c9e3b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 21:48:03 +0300 Subject: [PATCH 61/68] import style fixing --- lab_4_fill_words_by_ngrams/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index a2d010b2a..a0537d1f6 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -4,9 +4,10 @@ Top-p sampling generation and filling gaps with ngrams """ # pylint:disable=too-few-public-methods, too-many-arguments +from random import choice + from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator, NGramLanguageModel, TextProcessor) -from random import choice class WordProcessor(TextProcessor): From a421f39ca52371981ec37115e5c3d3947a869806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 21:51:19 +0300 Subject: [PATCH 62/68] import style fixing --- lab_4_fill_words_by_ngrams/start.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index 811ebcf84..1be68cd31 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,6 +2,7 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import + from lab_4_fill_words_by_ngrams.main import (GeneratorTypes, BeamSearchTextGenerator, NGramLanguageModel, TopPGenerator, QualityChecker, WordProcessor) From 621dcf54f606cc1313af6f521e05009b3881318f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 21:54:19 +0300 Subject: [PATCH 63/68] import style fixing --- lab_4_fill_words_by_ngrams/start.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index 
1be68cd31..ee9a97ddf 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,10 +2,7 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import - -from lab_4_fill_words_by_ngrams.main import (GeneratorTypes, BeamSearchTextGenerator, - NGramLanguageModel, TopPGenerator, - QualityChecker, WordProcessor) +from lab_4_fill_words_by_ngrams.main import (NGramLanguageModel, TopPGenerator, WordProcessor) def main() -> None: From a9a085decb10b73fd6dff827b2cf878b5ed53955 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 21:58:50 +0300 Subject: [PATCH 64/68] import style fixing --- lab_4_fill_words_by_ngrams/start.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index ee9a97ddf..f3f0d7721 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,6 +2,7 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import + from lab_4_fill_words_by_ngrams.main import (NGramLanguageModel, TopPGenerator, WordProcessor) From 3000112b2540336f8e6ee5448232bb41a66cfa27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sun, 10 Dec 2023 22:07:29 +0300 Subject: [PATCH 65/68] import style and tests fixing --- lab_4_fill_words_by_ngrams/main.py | 4 ++-- lab_4_fill_words_by_ngrams/start.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index a0537d1f6..ee8de63e6 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -77,7 +77,7 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # if not isinstance(decoded_corpus, tuple) or not decoded_corpus: raise ValueError('Type input is inappropriate or input argument is empty.') - words = "".join(decoded_corpus) + words = " ".join(decoded_corpus) sentences = words.split(self._end_of_word_token) resulted_text = ". 
".join([sentence.strip().capitalize() for sentence in sentences]) @@ -145,7 +145,7 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore if not candidates: break sorted_candidates = sorted(list(candidates.items()), - key=lambda pair: pair[1], reverse=True) + key=lambda pair: (float(pair[1]), pair[0]), reverse=True) sum_freq = 0 num_candidates = 0 for candidate in sorted_candidates: diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index f3f0d7721..ee9a97ddf 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,7 +2,6 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import - from lab_4_fill_words_by_ngrams.main import (NGramLanguageModel, TopPGenerator, WordProcessor) From 1aecdd0ef2bb86da5de70d2a7754913545f84aa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sat, 16 Dec 2023 14:22:37 +0300 Subject: [PATCH 66/68] import style fixing --- lab_4_fill_words_by_ngrams/start.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_4_fill_words_by_ngrams/start.py b/lab_4_fill_words_by_ngrams/start.py index ee9a97ddf..a9d6d93ad 100644 --- a/lab_4_fill_words_by_ngrams/start.py +++ b/lab_4_fill_words_by_ngrams/start.py @@ -2,7 +2,7 @@ Filling word by ngrams starter """ # pylint:disable=too-many-locals,unused-import -from lab_4_fill_words_by_ngrams.main import (NGramLanguageModel, TopPGenerator, WordProcessor) +from lab_4_fill_words_by_ngrams.main import NGramLanguageModel, TopPGenerator, WordProcessor def main() -> None: From 7daa0b5d1690211100873dc84721cbf55ecec460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?= =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?= Date: Sat, 16 Dec 2023 15:39:58 +0300 Subject: [PATCH 67/68] tests fixing --- lab_4_fill_words_by_ngrams/main.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py index ee8de63e6..d0397e8c1 100644 --- a/lab_4_fill_words_by_ngrams/main.py +++ b/lab_4_fill_words_by_ngrams/main.py @@ -33,14 +33,16 @@ def _tokenize(self, text: str) -> tuple[str, ...]: # type: ignore if not isinstance(text, str) or not text: raise ValueError('Type input is inappropriate or input argument is empty.') - preprocessed_text = "" - for element in text.lower(): - if element in "?!.": - preprocessed_text += f" {self.get_end_of_word_token()}" - elif element.isalpha() or element.isspace(): - preprocessed_text += element - - return tuple(preprocessed_text.split(" ")) + tokens = [] + punctuation_signs = '?!.' 
+        for word in text.lower().split():
+            cleaned_word = [letter for letter in word if letter.isalpha()]
+            if not cleaned_word:
+                continue
+            tokens.append(''.join(cleaned_word))
+            if word[-1] in punctuation_signs:
+                tokens.append(self._end_of_word_token)
+        return tuple(tokens)
 
     def _put(self, element: str) -> None:
         """
@@ -145,7 +147,7 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
             if not candidates:
                 break
             sorted_candidates = sorted(list(candidates.items()),
-                                       key=lambda pair: (float(pair[1]), pair[0]), reverse=True)
+                                       key=lambda pair: (pair[1], pair[0]), reverse=True)
             sum_freq = 0
             num_candidates = 0
             for candidate in sorted_candidates:

From ba55b5afb02a0e37592c586fc2a9d7264a1e4ed5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D0=BD=D0=BD=D0=B0=20=D0=92=D0=BE=D1=80=D0=BE=D0=BD?=
 =?UTF-8?q?=D1=86=D0=BE=D0=B2=D0=B0?=
Date: Mon, 18 Dec 2023 13:45:10 +0300
Subject: [PATCH 68/68] unpack candidate pairs in the top-p cutoff loop

---
 lab_4_fill_words_by_ngrams/main.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lab_4_fill_words_by_ngrams/main.py b/lab_4_fill_words_by_ngrams/main.py
index d0397e8c1..1fe19ee9c 100644
--- a/lab_4_fill_words_by_ngrams/main.py
+++ b/lab_4_fill_words_by_ngrams/main.py
@@ -150,10 +150,10 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
                                        key=lambda pair: (pair[1], pair[0]), reverse=True)
             sum_freq = 0
             num_candidates = 0
-            for candidate in sorted_candidates:
+            for _, freq in sorted_candidates:
                 if sum_freq >= self._p_value:
                     break
-                sum_freq += candidate[1]
+                sum_freq += freq
                 num_candidates += 1
 
             random_token = choice(sorted_candidates[:num_candidates])[0]
@@ -167,7 +167,6 @@ def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
 
         return decoded
 
-
 class GeneratorTypes:
     """
     A class that represents types of generators.
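Read together, PATCHES 59, 67 and 68 implement nucleus (top-p) sampling: candidates are ranked by probability, the smallest prefix of that ranking whose cumulative mass reaches the p threshold is kept, and the next token is drawn uniformly from that set. A worked sketch of the cutoff loop outside the lab classes; top_p_choice is an illustrative name and the probabilities are invented for the example:

    from random import choice

    def top_p_choice(candidates: dict[str, float], p_value: float) -> str:
        # Rank by probability, breaking ties by token for a reproducible order,
        # mirroring the (pair[1], pair[0]) sort key used in the lab.
        ranked = sorted(candidates.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
        cumulative = 0.0
        kept = 0
        for _, prob in ranked:
            if cumulative >= p_value:
                break
            cumulative += prob
            kept += 1
        # Draw uniformly from the surviving nucleus.
        return choice(ranked[:kept])[0]

    # With probabilities 0.5 / 0.3 / 0.2 and p_value = 0.5, the threshold check
    # runs before each addition: 'a' is admitted (cumulative mass 0.5) and the
    # loop stops, so only the single most probable token survives.
    assert top_p_choice({'a': 0.5, 'b': 0.3, 'c': 0.2}, 0.5) == 'a'

Because the check precedes each addition, at least one token is always kept; a smaller p_value makes generation greedier, while a larger one admits more of the probability tail.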