diff --git a/lab_5_scrapper/scrapper.py b/lab_5_scrapper/scrapper.py index c060232b..df9e1db3 100644 --- a/lab_5_scrapper/scrapper.py +++ b/lab_5_scrapper/scrapper.py @@ -2,9 +2,50 @@ Crawler implementation. """ # pylint: disable=too-many-arguments, too-many-instance-attributes, unused-import, undefined-variable +import datetime +import json import pathlib +import random +import re +import time from typing import Pattern, Union +import requests +from bs4 import BeautifulSoup + +from core_utils.article.article import Article +from core_utils.article.io import to_meta, to_raw +from core_utils.config_dto import ConfigDTO +from core_utils.constants import ASSETS_PATH, CRAWLER_CONFIG_PATH + + +class IncorrectSeedURLError(Exception): + pass + + +class NumberOfArticlesOutOfRangeError(Exception): + pass + + +class IncorrectNumberOfArticlesError(Exception): + pass + + +class IncorrectHeadersError(Exception): + pass + + +class IncorrectEncodingError(Exception): + pass + + +class IncorrectTimeoutError(Exception): + pass + + +class IncorrectVerifyError(Exception): + pass + class Config: """ @@ -19,6 +60,18 @@ def __init__(self, path_to_config: pathlib.Path) -> None: path_to_config (pathlib.Path): Path to configuration. """ + self.path_to_config = path_to_config + self._validate_config_content() + self.config_dto = self._extract_config_content() + + self._seed_urls = self.config_dto.seed_urls + self._num_articles = self.config_dto.total_articles + self._headers = self.config_dto.headers + self._encoding = self.config_dto.encoding + self._timeout = self.config_dto.timeout + self._should_verify_certificate = self.config_dto.should_verify_certificate + self._headless_mode = self.config_dto.headless_mode + def _extract_config_content(self) -> ConfigDTO: """ Get config values. 
@@ -27,10 +80,52 @@ def _extract_config_content(self) -> ConfigDTO: ConfigDTO: Config values """ + with open(self.path_to_config, 'r', encoding='utf-8') as f: + config = json.load(f) + + return ConfigDTO( + config["seed_urls"], + config["total_articles_to_find_and_parse"], + config["headers"], + config["encoding"], + config["timeout"], + config["should_verify_certificate"], + config["headless_mode"], + ) + def _validate_config_content(self) -> None: """ Ensure configuration parameters are not corrupt. """ + with open(self.path_to_config, 'r', encoding='utf-8') as f: + config = json.load(f) + + if not isinstance(config['seed_urls'], list): + raise IncorrectSeedURLError + + if not (isinstance(config['seed_urls'], list) + and all(re.match(r'https?://(www\.)?', seed_url) for seed_url in config['seed_urls'])): + raise IncorrectSeedURLError + + if (not isinstance(config['total_articles_to_find_and_parse'], int) or + config['total_articles_to_find_and_parse'] <= 0): + raise IncorrectNumberOfArticlesError + + if not 1 < config['total_articles_to_find_and_parse'] <= 150: + raise NumberOfArticlesOutOfRangeError + + if not isinstance(config['headers'], dict): + raise IncorrectHeadersError + + if not isinstance(config['encoding'], str): + raise IncorrectEncodingError + + if not isinstance(config['timeout'], int) or not 0 < config['timeout'] < 60: + raise IncorrectTimeoutError + + if (not isinstance(config['should_verify_certificate'], bool) or + not isinstance(config['headless_mode'], bool)): + raise IncorrectVerifyError def get_seed_urls(self) -> list[str]: """ @@ -39,6 +134,7 @@ def get_seed_urls(self) -> list[str]: Returns: list[str]: Seed urls """ + return self._seed_urls def get_num_articles(self) -> int: """ @@ -47,6 +143,7 @@ def get_num_articles(self) -> int: Returns: int: Total number of articles to scrape """ + return self._num_articles def get_headers(self) -> dict[str, str]: """ @@ -55,6 +152,7 @@ def get_headers(self) -> dict[str, str]: Returns: dict[str, str]: 
Headers """ + return self._headers def get_encoding(self) -> str: """ @@ -63,6 +161,7 @@ def get_encoding(self) -> str: Returns: str: Encoding """ + return self._encoding def get_timeout(self) -> int: """ @@ -71,6 +170,7 @@ def get_timeout(self) -> int: Returns: int: Number of seconds to wait for response """ + return self._timeout def get_verify_certificate(self) -> bool: """ @@ -79,6 +179,7 @@ def get_verify_certificate(self) -> bool: Returns: bool: Whether to verify certificate or not """ + return self._should_verify_certificate def get_headless_mode(self) -> bool: """ @@ -87,6 +188,7 @@ def get_headless_mode(self) -> bool: Returns: bool: Whether to use headless mode or not """ + return self._headless_mode def make_request(url: str, config: Config) -> requests.models.Response: @@ -100,6 +202,13 @@ def make_request(url: str, config: Config) -> requests.models.Response: Returns: requests.models.Response: A response from a request """ + periods = random.randrange(3) + time.sleep(periods) + + response = requests.get(url=url, headers=config.get_headers(), + timeout=config.get_timeout(), + verify=config.get_verify_certificate()) + return response class Crawler: @@ -116,6 +225,9 @@ def __init__(self, config: Config) -> None: Args: config (Config): Configuration """ + self.urls = [] + self.config = config + self.url_pattern = 'https://www.comnews.ru' def _extract_url(self, article_bs: BeautifulSoup) -> str: """ @@ -128,10 +240,37 @@ def _extract_url(self, article_bs: BeautifulSoup) -> str: str: Url from HTML """ + links = article_bs.find(name='div', class_='region region-content') + for link in (links.find_all('a') if links else []): + if link.get('href', '').startswith("/content/"): + url = self.url_pattern + link.get('href') + if url not in self.urls: + self.urls.append(url) + return url + def find_articles(self) -> None: """ Find articles. 
""" + seed_urls = self.get_search_urls() + n_len = self.config.get_num_articles() + + while len(self.urls) < n_len: + + for seed_url in seed_urls: + response = make_request(seed_url, self.config) + if not response.ok: + continue + + soup = BeautifulSoup(response.text, features='html.parser') + + self._extract_url(soup) + # _extract_url already appends new urls to self.urls + if len(self.urls) >= n_len: + break + + if len(self.urls) >= n_len: + break def get_search_urls(self) -> list: """ @@ -140,6 +279,7 @@ def get_search_urls(self) -> list: Returns: list: seed_urls param """ + return self.config.get_seed_urls() # 10 @@ -160,6 +300,10 @@ def __init__(self, full_url: str, article_id: int, config: Config) -> None: article_id (int): Article id config (Config): Configuration """ + self.full_url = full_url + self.article_id = article_id + self.config = config + self.article = Article(self.full_url, self.article_id) def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None: """ @@ -168,6 +312,11 @@ def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None: Args: article_soup (bs4.BeautifulSoup): BeautifulSoup instance """ + texts = [] + text_paragraphs = article_soup.find_all(class_="field field-text full-html field-name-body") + for paragraph in text_paragraphs: + texts.append(paragraph.text) + self.article.text = ''.join(texts) def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None: """ @@ -176,6 +325,28 @@ def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> No Args: article_soup (bs4.BeautifulSoup): BeautifulSoup instance """ + self.article.title = article_soup.find('h1').text + + date_str = article_soup.find(class_='field field-text field-name-date') + if date_str: + self.article.date = self.unify_date_format(date_str.text) + + topics = article_soup.find_all(class_='tags') + if topics: + for topic in topics: + tag = topic.find('a').text + self.article.topics.append(tag) + else: + 
self.article.topics.append("NOT FOUND") + + self.article.author = [] + authors = article_soup.find_all(class_="field field-text field-multiple person field-name-authors") + if authors: + for author in authors: + tmp = author.find('span').text.split(' ')[-1] + self.article.author.append(tmp) + else: + self.article.author.append("NOT FOUND") def unify_date_format(self, date_str: str) -> datetime.datetime: """ @@ -187,6 +358,7 @@ def unify_date_format(self, date_str: str) -> datetime.datetime: Returns: datetime.datetime: Datetime object """ + return datetime.datetime.strptime(date_str, '%d.%m.%Y') def parse(self) -> Union[Article, bool, list]: """ @@ -195,6 +367,13 @@ def parse(self) -> Union[Article, bool, list]: Returns: Union[Article, bool, list]: Article instance """ + response = make_request(self.full_url, self.config) + if response.ok: + article_bs = BeautifulSoup(response.text, 'html.parser') + self._fill_article_with_text(article_bs) + self._fill_article_with_meta_information(article_bs) + + return self.article def prepare_environment(base_path: Union[pathlib.Path, str]) -> None: @@ -204,12 +383,27 @@ def prepare_environment(base_path: Union[pathlib.Path, str]) -> None: Args: base_path (Union[pathlib.Path, str]): Path where articles stores """ + if not base_path.exists(): + base_path.mkdir(parents=True, exist_ok=True) + else: + for file in base_path.iterdir(): + file.unlink() def main() -> None: """ Entrypoint for scrapper module. 
""" + conf = Config(CRAWLER_CONFIG_PATH) + crawler = Crawler(conf) + crawler.find_articles() + prepare_environment(ASSETS_PATH) + + for i, url in enumerate(crawler.urls, 1): + parser = HTMLParser(url, i, conf) + article = parser.parse() + to_raw(article) + to_meta(article) if __name__ == "__main__": diff --git a/lab_5_scrapper/scrapper_config.json b/lab_5_scrapper/scrapper_config.json index 771fe42b..5a9ff48f 100644 --- a/lab_5_scrapper/scrapper_config.json +++ b/lab_5_scrapper/scrapper_config.json @@ -1,9 +1,21 @@ { - "seed_urls": [], - "headers": {}, - "total_articles_to_find_and_parse": 0, - "encoding": "", - "timeout": 0, + "seed_urls": [ + "https://www.comnews.ru/news?page=1", + "https://www.comnews.ru/news?page=2", + "https://www.comnews.ru/news?page=3", + "https://www.comnews.ru/news?page=4" + ], + "headers": { + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "Cookie": "_ym_uid=171221515114257002; _ym_d=1712215151; _gid=GA1.2.228582562.1713528706; _ym_isad=2; _ym_visorc=w; _ga_FCML9SH5CC=GS1.1.1713528709.18.1.1713528921.59.0.0; _ga=GA1.1.1446361375.1712215151; _ga_3HVRJM3KS4=GS1.1.1713528713.18.1.1713528922.58.0.0", + "Accept-Language": "ru,en;q=0.9", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 YaBrowser/24.1.0.0 Safari/537.36", + "Sec-Fetch-Dest": "document", + "Cache-Control": "max-age=0" + }, + "total_articles_to_find_and_parse": 10, + "encoding": "utf-8", + "timeout": 10, "should_verify_certificate": true, "headless_mode": true -} +} \ No newline at end of file diff --git a/lab_5_scrapper/settings.json b/lab_5_scrapper/settings.json index ee7a97c3..a44d6454 100644 --- a/lab_5_scrapper/settings.json +++ b/lab_5_scrapper/settings.json @@ -1,3 +1,3 @@ { - "target_score": 0 + "target_score": 6 } diff --git a/lab_6_pipeline/pipeline.py b/lab_6_pipeline/pipeline.py index 
f70f27de..6d038c27 100644 --- a/lab_6_pipeline/pipeline.py +++ b/lab_6_pipeline/pipeline.py @@ -4,17 +4,28 @@ # pylint: disable=too-few-public-methods, unused-import, undefined-variable, too-many-nested-blocks import pathlib -try: - from networkx import DiGraph -except ImportError: # pragma: no cover - DiGraph = None # type: ignore - print('No libraries installed. Failed to import.') +import spacy_udpipe +from networkx import DiGraph -from core_utils.article.article import Article +from core_utils.article.article import (Article, ArtifactType, get_article_id_from_filepath, + split_by_sentence) +from core_utils.article.io import from_raw, to_cleaned +from core_utils.constants import ASSETS_PATH, UDPIPE_MODEL_PATH from core_utils.pipeline import (AbstractCoNLLUAnalyzer, CoNLLUDocument, LibraryWrapper, PipelineProtocol, StanzaDocument, TreeNode) +class InconsistentDatasetError(Exception): + """IDs contain slips, number of meta and raw files is not equal, files are empty""" + + +class EmptyDirectoryError(Exception): + """directory is empty""" + + +class EmptyFileError(Exception): + """file is empty""" + class CorpusManager: """ Work with articles and store them. @@ -27,16 +38,45 @@ def __init__(self, path_to_raw_txt_data: pathlib.Path) -> None: Args: path_to_raw_txt_data (pathlib.Path): Path to raw txt data """ + self.path_to_raw_txt_data = path_to_raw_txt_data + self._storage = {} + self._validate_dataset() + self._scan_dataset() def _validate_dataset(self) -> None: """ Validate folder with assets. 
""" + if not self.path_to_raw_txt_data.exists(): + raise FileNotFoundError + + if not self.path_to_raw_txt_data.is_dir(): + raise NotADirectoryError + + if not list(self.path_to_raw_txt_data.iterdir()): + raise EmptyDirectoryError + + raw_files = list(self.path_to_raw_txt_data.glob('*_raw.txt')) + meta_files = list(self.path_to_raw_txt_data.glob('*_meta.json')) + if len(raw_files) != len(meta_files): + raise InconsistentDatasetError + + sorted_raw_files = sorted(raw_files, key=lambda file: get_article_id_from_filepath(file)) + sorted_meta_files = sorted(meta_files, key=lambda file: get_article_id_from_filepath(file)) + + for index, (raw_file, meta_file) in enumerate(zip(sorted_raw_files, sorted_meta_files), start=1): + if (index != get_article_id_from_filepath(raw_file) + or index != get_article_id_from_filepath(meta_file) + or raw_file.stat().st_size == 0 or meta_file.stat().st_size == 0): + raise InconsistentDatasetError def _scan_dataset(self) -> None: """ Register each dataset entry. """ + for file in list(self.path_to_raw_txt_data.glob("*_raw.txt")): + art_id = get_article_id_from_filepath(file) + self._storage[art_id] = from_raw(file, Article(None, art_id)) def get_articles(self) -> dict: """ @@ -45,6 +85,7 @@ def get_articles(self) -> dict: Returns: dict: Storage params """ + return self._storage class TextProcessingPipeline(PipelineProtocol): @@ -62,11 +103,21 @@ def __init__( corpus_manager (CorpusManager): CorpusManager instance analyzer (LibraryWrapper | None): Analyzer instance """ + self._corpus_manager = corpus_manager + self._analyzer = analyzer def run(self) -> None: """ Perform basic preprocessing and write processed text to files. 
""" + articles = self._corpus_manager.get_articles().values() + for article in articles: + to_cleaned(article) + if self._analyzer: + texts = split_by_sentence(article.text) + text_analyze = self._analyzer.analyze(texts) + article.set_conllu_info(text_analyze) + self._analyzer.to_conllu(article) class UDPipeAnalyzer(LibraryWrapper): @@ -80,6 +131,7 @@ def __init__(self) -> None: """ Initialize an instance of the UDPipeAnalyzer class. """ + self._analyzer = self._bootstrap() def _bootstrap(self) -> AbstractCoNLLUAnalyzer: """ @@ -88,6 +140,16 @@ def _bootstrap(self) -> AbstractCoNLLUAnalyzer: Returns: AbstractCoNLLUAnalyzer: Analyzer instance """ + model = spacy_udpipe.load_from_path( + lang="ru", + path=str(UDPIPE_MODEL_PATH) + ) + model.add_pipe( + "conll_formatter", + last=True, + config={"conversion_maps": {"XPOS": {"": "_"}}, "include_headers": True}, + ) + return model def analyze(self, texts: list[str]) -> list[StanzaDocument | str]: """ @@ -99,6 +161,12 @@ def analyze(self, texts: list[str]) -> list[StanzaDocument | str]: Returns: list[StanzaDocument | str]: List of documents """ + texts_docs = [] + for text in texts: + analyzed_text = self._analyzer(text) + conllu_annotation = analyzed_text._.conll_str + texts_docs.append(conllu_annotation) + return texts_docs def to_conllu(self, article: Article) -> None: """ @@ -107,6 +175,9 @@ def to_conllu(self, article: Article) -> None: Args: article (Article): Article containing information to save """ + with open(article.get_file_path(ArtifactType.UDPIPE_CONLLU), 'w', encoding='utf-8') as annotation_file: + annotation_file.writelines(article.get_conllu_info()) + annotation_file.write("\n") class StanzaAnalyzer(LibraryWrapper): @@ -253,6 +324,10 @@ def main() -> None: """ Entrypoint for pipeline module. 
""" + corpus_manager = CorpusManager(path_to_raw_txt_data=ASSETS_PATH) + analyzer = UDPipeAnalyzer() + pipeline = TextProcessingPipeline(corpus_manager, analyzer) + pipeline.run() if __name__ == "__main__": diff --git a/lab_6_pipeline/settings.json b/lab_6_pipeline/settings.json index ee7a97c3..a44d6454 100644 --- a/lab_6_pipeline/settings.json +++ b/lab_6_pipeline/settings.json @@ -1,3 +1,3 @@ { - "target_score": 0 + "target_score": 6 } diff --git a/requirements.txt b/requirements.txt index 8b137891..b4aa1f7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,7 @@ - +beautifulsoup4==4.12.0 +networkx==3.2.1 +requests==2.31.0 +spacy-conll==3.4.0 +spacy-udpipe==1.0.0 +spacy==3.7.4 +stanza==1.8.2 \ No newline at end of file diff --git a/seminars/seminar_05_20_2024/try_networkx.py b/seminars/seminar_05_20_2024/try_networkx.py index 45309397..f48d550f 100644 --- a/seminars/seminar_05_20_2024/try_networkx.py +++ b/seminars/seminar_05_20_2024/try_networkx.py @@ -118,14 +118,14 @@ def family_graph_task() -> nx.DiGraph: Returns: nx.DiGraph: graph as in [/images/task_2_family_graph.png] """ - # relatives = { - # "Настя": {"age": 66, "hair": "blonde"}, - # "Дима": {"age": 70, "hair": "ginger"}, - # "Степа": {"age": 41, "hair": "black"}, - # "Вика": {"age": 40, "hair": "ginger"}, - # "Лида": {"age": 15, "hair": "black"}, - # } - # YOUR CODE GOES HERE + # relatives = { + # "Настя": {"age": 66, "hair": "blonde"}, + # "Дима": {"age": 70, "hair": "ginger"}, + # "Степа": {"age": 41, "hair": "black"}, + # "Вика": {"age": 40, "hair": "ginger"}, + # "Лида": {"age": 15, "hair": "black"}, + # } + def match_subgraph_example() -> list[dict[str, str]]: