Skip to content
This repository has been archived by the owner on Dec 25, 2023. It is now read-only.

Laboratory work #4, Mashkovtseva Alesia - 22FPL2 #196

Closed
wants to merge 122 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
122 commits
Select commit Hold shift + click to select a range
9a9950f
created a new file
alesiamashkovtseva Sep 8, 2023
594b1f8
Merge branch 'fipl-hse:main' into main
alesiamashkovtseva Sep 15, 2023
e69286a
created a new file
alesiamashkovtseva Sep 20, 2023
d308b37
created a new file
alesiamashkovtseva Sep 22, 2023
d1d283b
steps 4 and 5 done
alesiamashkovtseva Sep 28, 2023
7529864
Merge branch 'fipl-hse:main' into main
marina-kaz Sep 28, 2023
b00f1ee
tokenize() early return done
alesiamashkovtseva Sep 29, 2023
ff6464d
early return done in other functions
alesiamashkovtseva Sep 29, 2023
1b6aa1d
steps 5-6 done
alesiamashkovtseva Oct 2, 2023
b05ca58
some small changes done
alesiamashkovtseva Oct 2, 2023
9769284
some small changes done
alesiamashkovtseva Oct 2, 2023
a2e3529
some review points fixed and changes in practice_2_string.py reverted
alesiamashkovtseva Oct 4, 2023
a1d1944
code style check fixed
alesiamashkovtseva Oct 4, 2023
bc922b4
import style checks fixed
alesiamashkovtseva Oct 4, 2023
400355b
import style checks fixed
alesiamashkovtseva Oct 4, 2023
f89e2f4
a few changes done in calculate_mse and compare_profiles
alesiamashkovtseva Oct 4, 2023
4082ce5
fixing mypy checks
alesiamashkovtseva Oct 4, 2023
fbce250
fixing mypy checks
alesiamashkovtseva Oct 4, 2023
ce18afe
fixing mypy checks
alesiamashkovtseva Oct 4, 2023
8cbc6af
fixing mypy checks
alesiamashkovtseva Oct 4, 2023
a566adc
fixing import style checks
alesiamashkovtseva Oct 5, 2023
5753e05
directly returning in detect_language, no rounding MSE, unused code r…
alesiamashkovtseva Oct 5, 2023
32b11bd
code style fixed
alesiamashkovtseva Oct 5, 2023
4e3dd4c
code style fixed
alesiamashkovtseva Oct 5, 2023
15ab62a
code style and mypy checks fixed
alesiamashkovtseva Oct 5, 2023
4732515
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
a30590f
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
c777c8a
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
1cd496f
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
4f3943a
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
950adb6
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
4280fbe
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
0c74f93
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
e3ad151
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
52a3768
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
9cda9f6
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
1a9e5e9
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
a4aa4dc
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
2ef3614
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
f7b17c3
fixing mypy checks
alesiamashkovtseva Oct 5, 2023
8940480
Merge branch 'fipl-hse:main' into main
artyomtugaryov Oct 6, 2023
f02ec2e
Merge branch 'fipl-hse:main' into main
alesiamashkovtseva Oct 10, 2023
1bf2df1
prepare_word and collect_frequencies functions done
alesiamashkovtseva Oct 10, 2023
0cd0fb3
fixing check start.py
alesiamashkovtseva Oct 10, 2023
e7285ce
fixing check start.py
alesiamashkovtseva Oct 10, 2023
32b301a
fixing check start.py
alesiamashkovtseva Oct 10, 2023
c7d50b5
Merge remote-tracking branch 'origin/main' into HEAD
artyomtugaryov Oct 11, 2023
8cb26a7
checkout labs from the origin repository
artyomtugaryov Oct 11, 2023
204f73d
Merge branch 'fipl-hse:main' into main
artyomtugaryov Oct 13, 2023
581905e
steps 3-5 done
alesiamashkovtseva Oct 19, 2023
ae18055
import style check fixed
alesiamashkovtseva Oct 19, 2023
4a226cb
import style check fixed
alesiamashkovtseva Oct 19, 2023
b6b9e8f
Merge branch 'fipl-hse:main' into main
artyomtugaryov Oct 20, 2023
618d59d
rewrote train function
alesiamashkovtseva Oct 25, 2023
1c92e5a
Merge branch 'fipl-hse:main' into main
marina-kaz Oct 26, 2023
a524c36
steps 6 and 7 done, some changes done in train function
alesiamashkovtseva Nov 1, 2023
9828edd
start.py fixed
alesiamashkovtseva Nov 1, 2023
554696c
start.py fixed
alesiamashkovtseva Nov 1, 2023
d19294b
start.py fixed
alesiamashkovtseva Nov 1, 2023
840ef98
start.py, mypy and code style fixed
alesiamashkovtseva Nov 1, 2023
1179deb
fixing unittests
alesiamashkovtseva Nov 2, 2023
d841924
start.py fixing
alesiamashkovtseva Nov 2, 2023
728c257
start.py fixing
alesiamashkovtseva Nov 2, 2023
c692474
code style fixing
alesiamashkovtseva Nov 2, 2023
98daf2c
code style fixing
alesiamashkovtseva Nov 2, 2023
5923e51
code style fixing
alesiamashkovtseva Nov 2, 2023
da4b6fd
mypy fixing
alesiamashkovtseva Nov 2, 2023
a7715e0
mypy fixing
alesiamashkovtseva Nov 2, 2023
30fa70f
start.py fixing
alesiamashkovtseva Nov 2, 2023
c2c2aaf
start.py fixing
alesiamashkovtseva Nov 2, 2023
4ee29c1
merge_tokens fixing
alesiamashkovtseva Nov 3, 2023
99f7bbc
Merge remote-tracking branch 'origin/main' into HEAD
artyomtugaryov Nov 3, 2023
9a8118d
checkout labs from the origin repository
artyomtugaryov Nov 3, 2023
4dcbcb1
checkout labs from the origin repository
artyomtugaryov Nov 3, 2023
c75223c
step 1 done
alesiamashkovtseva Nov 8, 2023
85e30a1
step 2 done
alesiamashkovtseva Nov 9, 2023
fe10f81
step 3 done
alesiamashkovtseva Nov 9, 2023
60f0207
TextProcessor arg _end_of_word_token fixed
alesiamashkovtseva Nov 9, 2023
fe72367
TextProcessor arg _end_of_word_token fixed
alesiamashkovtseva Nov 9, 2023
2df7560
TextProcessor arg _storage fixed
alesiamashkovtseva Nov 9, 2023
194d4b1
Merge branch 'fipl-hse:main' into main
artyomtugaryov Nov 10, 2023
e985357
Merge branch 'fipl-hse:main' into main
artyomtugaryov Nov 17, 2023
6d3833f
commit
alesiamashkovtseva Nov 17, 2023
7ef974a
step 4.3 done
alesiamashkovtseva Nov 21, 2023
8def004
step 5.2 done
alesiamashkovtseva Nov 22, 2023
55816fc
beamsearchtextgenerator run()
alesiamashkovtseva Nov 23, 2023
35ff145
BeamSearchTextGenerator fixed
alesiamashkovtseva Nov 29, 2023
50c9f16
fill_from_ngrams, set_n_grams, reader init and load
alesiamashkovtseva Nov 30, 2023
661dd90
reader load fixed
alesiamashkovtseva Nov 30, 2023
72facdd
Merge remote-tracking branch 'origin/main' into HEAD
artyomtugaryov Nov 30, 2023
ad2890c
checkout labs from the origin repository
artyomtugaryov Nov 30, 2023
c612256
GreedyTextGenerator run() fixed, BackOffGenerator
alesiamashkovtseva Nov 30, 2023
3d6af13
import fixing
alesiamashkovtseva Nov 30, 2023
7210c65
import fixing
alesiamashkovtseva Nov 30, 2023
a7a14e5
start.py fixing
alesiamashkovtseva Nov 30, 2023
362c9e3
code style fixing
alesiamashkovtseva Nov 30, 2023
0ac0542
code style fixing
alesiamashkovtseva Nov 30, 2023
007e542
MyPy check fixing
alesiamashkovtseva Nov 30, 2023
266740d
MyPy check fixing
alesiamashkovtseva Nov 30, 2023
daa3a94
MyPy check fixing
alesiamashkovtseva Nov 30, 2023
f82708b
MyPy check fixing
alesiamashkovtseva Nov 30, 2023
198855b
Merge branch 'fipl-hse:main' into main
artyomtugaryov Dec 1, 2023
c8b445c
fixed
alesiamashkovtseva Dec 1, 2023
ff7301d
fixed
alesiamashkovtseva Dec 1, 2023
a42431b
fixed
alesiamashkovtseva Dec 1, 2023
72c0ae3
fixed
alesiamashkovtseva Dec 1, 2023
6559d6b
Merge with main
artyomtugaryov Dec 2, 2023
f5c6690
Merge branch 'fipl-hse:main' into main
artyomtugaryov Dec 4, 2023
aa45f59
Merge branch 'fipl-hse:main' into main
alesiamashkovtseva Dec 8, 2023
bdb05b4
steps 1-3 done
alesiamashkovtseva Dec 5, 2023
a491f5b
step 4 done
alesiamashkovtseva Dec 8, 2023
b0d5366
step 5 done
alesiamashkovtseva Dec 12, 2023
b136631
fixing
alesiamashkovtseva Dec 12, 2023
8d5b3b2
fixing
alesiamashkovtseva Dec 12, 2023
965f941
fixing
alesiamashkovtseva Dec 12, 2023
5e034c7
fixing
alesiamashkovtseva Dec 12, 2023
f07fcd4
fixing
alesiamashkovtseva Dec 12, 2023
dc81d08
fixing
alesiamashkovtseva Dec 18, 2023
38a30a0
fixing
alesiamashkovtseva Dec 18, 2023
e5bfd33
fixing
alesiamashkovtseva Dec 18, 2023
4b624f8
fixing
alesiamashkovtseva Dec 18, 2023
a722d58
fixing
alesiamashkovtseva Dec 18, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
216 changes: 215 additions & 1 deletion lab_4_fill_words_by_ngrams/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@

Top-p sampling generation and filling gaps with ngrams
"""
import json
import math
from random import choice

# pylint:disable=too-few-public-methods, too-many-arguments
from lab_3_generate_by_ngrams.main import (BeamSearchTextGenerator, GreedyTextGenerator,
NGramLanguageModel, TextProcessor)
Expand All @@ -28,6 +32,26 @@ def _tokenize(self, text: str) -> tuple[str, ...]: # type: ignore
Raises:
ValueError: In case of inappropriate type input argument or if input argument is empty.
"""
if not isinstance(text, str) or not text:
raise ValueError('Type input is inappropriate or input argument is empty.')

for digit in ('.', '!', '?'):
text = text.replace(digit, f" {self._end_of_word_token} ")

tokenized_word = []
for word in text.lower().split():
if word == self._end_of_word_token or word.isalpha() or word.isspace():
tokenized_word.append(word)
continue

clean_word = []
for alpha in list(word):
if alpha.isalpha():
clean_word.append(alpha)
if clean_word:
tokenized_word.append("".join(clean_word))

return tuple(tokenized_word)

def _put(self, element: str) -> None:
"""
Expand All @@ -39,6 +63,12 @@ def _put(self, element: str) -> None:
Raises:
ValueError: In case of inappropriate type input argument or if input argument is empty.
"""
if not isinstance(element, str):
raise ValueError('Type input is inappropriate.')
if not element:
raise ValueError('Input argument is empty.')
if element not in self._storage:
self._storage[element] = len(self._storage)

def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: # type: ignore
"""
Expand All @@ -56,6 +86,22 @@ def _postprocess_decoded_text(self, decoded_corpus: tuple[str, ...]) -> str: #
Raises:
ValueError: In case of inappropriate type input argument or if input argument is empty.
"""
if not isinstance(decoded_corpus, tuple) or not decoded_corpus:
raise ValueError('Type input is inappropriate or input argument is empty.')
result = ''
for word in decoded_corpus:
if word == self.get_end_of_word_token():
result += '.'
elif not result:
result += word.capitalize()
elif result[-1] == '.':
result += ' ' + word.capitalize()
else:
result += ' ' + word

if result[-1] != '.':
result += '.'
return result


class TopPGenerator:
Expand All @@ -69,7 +115,7 @@ class TopPGenerator:
"""

def __init__(
self, language_model: NGramLanguageModel, word_processor: WordProcessor, p_value: float
self, language_model: NGramLanguageModel, word_processor: WordProcessor, p_value: float
) -> None:
"""
Initialize an instance of TopPGenerator.
Expand All @@ -80,6 +126,9 @@ def __init__(
word_processor (WordProcessor): WordProcessor instance to handle text processing
p_value (float): Collective probability mass threshold
"""
self._model = language_model
self._word_processor = word_processor
self._p_value = p_value

def run(self, seq_len: int, prompt: str) -> str: # type: ignore
"""
Expand All @@ -98,6 +147,35 @@ def run(self, seq_len: int, prompt: str) -> str: # type: ignore
or if sequence has inappropriate length,
or if methods used return None.
"""
def run(self, seq_len: int, prompt: str) -> str:  # type: ignore
    """
    Generate a continuation of *prompt* using top-p (nucleus) sampling.

    At each step candidates are ranked by descending probability (ties broken
    by descending token id); the smallest prefix whose cumulative probability
    reaches the p-value is kept and one token is sampled from it uniformly.

    Args:
        seq_len (int): Number of tokens to generate
        prompt (str): Beginning of the sequence

    Returns:
        str: Decoded generated sequence

    Raises:
        ValueError: In case of inappropriate type input arguments,
            or if input argument is empty,
            or if sequence has inappropriate length,
            or if methods used return None.
    """
    if not isinstance(seq_len, int) or seq_len < 0 \
            or not isinstance(prompt, str) or not prompt:
        raise ValueError('Type input is inappropriate or input argument is empty.')
    encoded = self._word_processor.encode(prompt)
    if encoded is None:
        raise ValueError('None is returned')
    sequence = list(encoded)
    for _ in range(seq_len):
        candidates = self._model.generate_next_token(tuple(sequence))
        if candidates is None:
            raise ValueError('None is returned')
        if not candidates:
            break
        ranked = sorted(candidates.items(), key=lambda item: (-item[1], -item[0]))
        # Collect the nucleus: stop once the mass already taken reaches p.
        nucleus = []
        cumulative = 0.0
        for token, probability in ranked:
            if cumulative >= self._p_value:
                break
            nucleus.append(token)
            cumulative += probability
        sequence.append(choice(nucleus))
    decoded = self._word_processor.decode(tuple(sequence))
    if decoded is None:
        raise ValueError('None is returned')
    return decoded


class GeneratorTypes:
Expand All @@ -114,6 +192,9 @@ def __init__(self) -> None:
"""
Initialize an instance of GeneratorTypes.
"""
self.greedy = 0
self.top_p = 1
self.beam_search = 2

def get_conversion_generator_type(self, generator_type: int) -> str: # type: ignore
"""
Expand All @@ -125,6 +206,13 @@ def get_conversion_generator_type(self, generator_type: int) -> str: # type: ig
Returns:
(str): Name of the generator.
"""
def get_conversion_generator_type(self, generator_type: int) -> str:  # type: ignore
    """
    Map a numeric generator code to its display name.

    Args:
        generator_type (int): Numeric type of the generator

    Returns:
        (str): Name of the generator, or an empty string for unknown codes.
    """
    names = {
        self.greedy: 'Greedy Generator',
        self.top_p: 'Top-P Generator',
        self.beam_search: 'Beam Search Generator',
    }
    return names.get(generator_type, '')


class GenerationResultDTO:
Expand All @@ -147,6 +235,9 @@ def __init__(self, text: str, perplexity: float, generation_type: int):
generation_type (int):
Numeric type of the generator for which perplexity was calculated
"""
self.__text = text
self.__perplexity = perplexity
self.__type = generation_type

def get_perplexity(self) -> float: # type: ignore
"""
Expand All @@ -155,6 +246,7 @@ def get_perplexity(self) -> float: # type: ignore
Returns:
(float): Perplexity value
"""
return self.__perplexity

def get_text(self) -> str: # type: ignore
"""
Expand All @@ -163,6 +255,7 @@ def get_text(self) -> str: # type: ignore
Returns:
(str): Text for which the perplexity was count
"""
return self.__text

def get_type(self) -> int: # type: ignore
"""
Expand All @@ -171,6 +264,7 @@ def get_type(self) -> int: # type: ignore
Returns:
(int): Numeric type of the generator
"""
return self.__type

def __str__(self) -> str: # type: ignore
"""
Expand All @@ -179,6 +273,10 @@ def __str__(self) -> str: # type: ignore
Returns:
(str): String with report
"""
def __str__(self) -> str:  # type: ignore
    """
    Build a human-readable report for this result.

    Returns:
        (str): Report with perplexity score, generator name and text
    """
    type_name = GeneratorTypes().get_conversion_generator_type(self.__type)
    return f'Perplexity score: {self.__perplexity}\n{type_name}\nText: {self.__text}\n'


class QualityChecker:
Expand All @@ -203,6 +301,9 @@ def __init__(
NGramLanguageModel instance to use for text generation
word_processor (WordProcessor): WordProcessor instance to handle text processing
"""
self._generators = generators
self._language_model = language_model
self._word_processor = word_processor

def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore
"""
Expand All @@ -220,6 +321,34 @@ def _calculate_perplexity(self, generated_text: str) -> float: # type: ignore
or if methods used return None,
or if nothing was generated.
"""
if not generated_text:
raise ValueError('Input argument is empty')
if not isinstance(generated_text, str):
raise ValueError('Inappropriate type argument')
encoded_text = self._word_processor.encode(generated_text)
if not encoded_text:
raise ValueError('self._word_processor.encode() returned None')
ngram_size = self._language_model.get_n_gram_size()
l_sum = 0.0

for index in range(ngram_size - 1, len(encoded_text)):
context = tuple(encoded_text[index - ngram_size + 1: index])
token = encoded_text[index]
tokens = self._language_model.generate_next_token(context)

if tokens is None:
raise ValueError('self._language_model.generate_next_token() returned None')

probability = tokens.get(token)
if probability is None:
continue

l_sum += math.log(probability)
if not l_sum:
raise ValueError("Probability sum is 0")

result = math.exp(-l_sum / (len(encoded_text) - ngram_size))
return result

def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type: ignore
"""
Expand All @@ -239,6 +368,31 @@ def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]: # type:
or if sequence has inappropriate length,
or if methods used return None.
"""
def run(self, seq_len: int, prompt: str) -> list[GenerationResultDTO]:  # type: ignore
    """
    Generate text with every registered generator and rank the results.

    Args:
        seq_len (int): Number of tokens to generate
        prompt (str): Beginning of the sequence

    Returns:
        list[GenerationResultDTO]: Results sorted by generator type, then perplexity

    Raises:
        ValueError: In case of inappropriate type input arguments,
            or if input argument is empty,
            or if sequence has inappropriate length,
            or if methods used return None.
    """
    if not seq_len:
        raise ValueError('Input argument seq_len is empty')
    if not prompt:
        raise ValueError('Input argument prompt is empty')
    if not isinstance(seq_len, int):
        raise ValueError('Inappropriate type argument seq_len')
    if not isinstance(prompt, str):
        raise ValueError('Inappropriate type argument prompt')

    results = []
    # self._generators maps numeric type -> generator instance.
    for num_type, generator in self._generators.items():
        text = generator.run(seq_len=seq_len, prompt=prompt)
        if text is None:
            raise ValueError(f'{generator} methode run() returned None')
        perplexity = self._calculate_perplexity(text)
        if perplexity is None:
            raise ValueError(f'{generator} perplexity is None')
        results.append(GenerationResultDTO(text, perplexity, num_type))

    return sorted(results, key=lambda dto: (dto.get_type(), dto.get_perplexity()))


class Examiner:
Expand All @@ -258,6 +412,10 @@ def __init__(self, json_path: str) -> None:
Args:
json_path (str): Local path to assets file
"""
if not isinstance(json_path, str) or not json_path:
raise ValueError
self._json_path = json_path
self._questions_and_answers = self._load_from_json()

def _load_from_json(self) -> dict[tuple[str, int], str]: # type: ignore
"""
Expand All @@ -273,6 +431,25 @@ def _load_from_json(self) -> dict[tuple[str, int], str]: # type: ignore
or if attribute _json_path has inappropriate extension,
or if inappropriate type loaded data.
"""
if not isinstance(self._json_path, str):
raise ValueError('Inappropriate type of attribute _json_path')
if not self._json_path:
raise ValueError('Attribute _json_path is empty')
if not self._json_path.endswith('json'):
raise ValueError('Attribute _json_path has inappropriate extension')

with open(self._json_path, 'r', encoding="utf-8") as file:
questions = json.load(file)

if not isinstance(questions, list):
raise ValueError('Inappropriate type loaded data')

self._questions_and_answers = {
(dictionary['question'], dictionary['location']): dictionary['answer']
for dictionary in questions
}

return self._questions_and_answers

def provide_questions(self) -> list[tuple[str, int]]: # type: ignore
"""
Expand All @@ -282,6 +459,8 @@ def provide_questions(self) -> list[tuple[str, int]]: # type: ignore
list[tuple[str, int]]:
List in the form of [(question, position of the word to be filled)]
"""
def provide_questions(self) -> list[tuple[str, int]]:  # type: ignore
    """
    List all loaded questions.

    Returns:
        list[tuple[str, int]]:
            List in the form of [(question, position of the word to be filled)]
    """
    # Iterating a dict yields its keys directly.
    return list(self._questions_and_answers)

def assess_exam(self, answers: dict[str, str]) -> float: # type: ignore
"""
Expand All @@ -296,6 +475,19 @@ def assess_exam(self, answers: dict[str, str]) -> float: # type: ignore
Raises:
ValueError: In case of inappropriate type input argument or if input argument is empty.
"""
def assess_exam(self, answers: dict[str, str]) -> float:  # type: ignore
    """
    Score submitted answers against the stored correct ones.

    Args:
        answers (dict[str, str]): Mapping of question text to submitted answer

    Returns:
        float: Fraction of correct answers in [0, 1]

    Raises:
        ValueError: In case of inappropriate type input argument or if input argument is empty.
    """
    if not isinstance(answers, dict):
        raise ValueError('Inappropriate type input argument')
    if not answers:
        raise ValueError('Input argument is empty')
    # Re-key the answer sheet by question text alone (drop the gap position).
    correct = {question: answer
               for (question, _), answer in self._questions_and_answers.items()}
    score = sum(1 for question, answer in answers.items()
                if correct[question] == answer)
    return score / len(answers)


class GeneratorRuleStudent:
Expand All @@ -318,6 +510,11 @@ def __init__(
NGramLanguageModel instance to use for text generation
word_processor (WordProcessor): WordProcessor instance to handle text processing
"""
def __init__(
        self, generator_type: int, language_model: NGramLanguageModel, word_processor: WordProcessor
) -> None:
    """
    Initialize an instance of GeneratorRuleStudent.

    Args:
        generator_type (int): Numeric type of the generator (0 greedy, 1 top-p, 2 beam search)
        language_model (NGramLanguageModel): Model used for text generation
        word_processor (WordProcessor): WordProcessor instance to handle text processing
    """
    self._generator_type = generator_type
    # Build only the requested generator instead of instantiating all three.
    factories = (
        lambda: GreedyTextGenerator(language_model, word_processor),
        lambda: TopPGenerator(language_model, word_processor, 0.5),
        lambda: BeamSearchTextGenerator(language_model, word_processor, 5),
    )
    self._generator = factories[generator_type]()

def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]: # type: ignore
"""
Expand All @@ -335,6 +532,21 @@ def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]: # type: ig
or if input argument is empty,
or if methods used return None.
"""
def take_exam(self, tasks: list[tuple[str, int]]) -> dict[str, str]:  # type: ignore
    """
    Answer fill-the-gap tasks using the student's generator.

    Args:
        tasks (list[tuple[str, int]]): [(question, position of the word to be filled)]

    Returns:
        dict[str, str]: Mapping of question to the completed text

    Raises:
        ValueError: In case of inappropriate type input argument,
            or if input argument is empty,
            or if methods used return None.
    """
    if not isinstance(tasks, list):
        raise ValueError('Inappropriate type input argument')
    if not tasks:
        raise ValueError('Input argument is empty')
    answers = {}
    for question, position in tasks:
        # Generate one token continuing the text before the gap.
        generated = self._generator.run(seq_len=1, prompt=question[:position])
        if generated is None:
            raise ValueError('self._generator.run() returned None')
        # A trailing period from the generator becomes a separating space.
        if generated.endswith('.'):
            generated = generated[:-1] + ' '
        answers[question] = generated + question[position:]
    return answers

def get_generator_type(self) -> str: # type: ignore
"""
Expand All @@ -343,3 +555,5 @@ def get_generator_type(self) -> str: # type: ignore
Returns:
str: Generator type
"""
def get_generator_type(self) -> str:  # type: ignore
    """
    Retrieve the human-readable name of this student's generator.

    Returns:
        str: Generator type name
    """
    return GeneratorTypes().get_conversion_generator_type(self._generator_type)
Loading
Loading