Pipeline, Ekaterina Nushkina - 22FPL2 #82

Closed
wants to merge 37 commits

Commits (37)
f346f05
my first commit
eanushkina Apr 1, 2024
5efb1c4
update
Vasilisa-Blyudova Apr 7, 2024
481f87f
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 15, 2024
ee2202c
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 16, 2024
d505ab8
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 22, 2024
0913594
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 22, 2024
342c3d4
stage 1 is completed
eanushkina Apr 23, 2024
6899753
some changes
eanushkina Apr 23, 2024
e83e111
changes
eanushkina Apr 24, 2024
c76b354
some changes
eanushkina Apr 25, 2024
19f5508
some changes
eanushkina Apr 25, 2024
dcbe2a3
fix imports order
eanushkina Apr 26, 2024
6f0c829
changes
eanushkina Apr 26, 2024
5e0c110
please work
eanushkina Apr 26, 2024
fc5dab1
changes
eanushkina Apr 27, 2024
80d4013
for 4
eanushkina Apr 28, 2024
04cc91e
for 4
eanushkina Apr 28, 2024
879c84c
change
eanushkina Apr 28, 2024
7353476
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 29, 2024
6412bf8
some changes
eanushkina Apr 29, 2024
78bf9d1
changes
eanushkina Apr 29, 2024
1ac0768
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 30, 2024
f2c44c0
for 6
eanushkina May 6, 2024
375f8e1
for 6
eanushkina May 6, 2024
8e90690
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova May 8, 2024
c023cc2
for 6
eanushkina May 10, 2024
20fe3f8
for 6
eanushkina May 10, 2024
224c79f
for 8
eanushkina May 11, 2024
8e582ba
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova May 12, 2024
29dfb7e
for 8?
eanushkina May 13, 2024
68ea6fe
for 6
eanushkina May 19, 2024
977519e
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova May 20, 2024
3657e37
changes
eanushkina May 22, 2024
ff41928
changes
eanushkina May 23, 2024
0ed5baf
changes
eanushkina May 24, 2024
40f0843
fixing
eanushkina May 25, 2024
65cac7e
Update
Vasilisa-Blyudova Jun 2, 2024
194 changes: 194 additions & 0 deletions lab_5_scrapper/scrapper.py
@@ -2,9 +2,50 @@
Crawler implementation.
"""
# pylint: disable=too-many-arguments, too-many-instance-attributes, unused-import, undefined-variable
import datetime
import json
import pathlib
import random
import re
import time
from typing import Pattern, Union

import requests
from bs4 import BeautifulSoup

from core_utils.article.article import Article
from core_utils.article.io import to_meta, to_raw
from core_utils.config_dto import ConfigDTO
from core_utils.constants import ASSETS_PATH, CRAWLER_CONFIG_PATH


class IncorrectSeedURLError(Exception):
pass


class NumberOfArticlesOutOfRangeError(Exception):
pass


class IncorrectNumberOfArticlesError(Exception):
pass


class IncorrectHeadersError(Exception):
pass


class IncorrectEncodingError(Exception):
pass


class IncorrectTimeoutError(Exception):
pass


class IncorrectVerifyError(Exception):
pass


class Config:
"""
@@ -19,6 +60,18 @@ def __init__(self, path_to_config: pathlib.Path) -> None:
path_to_config (pathlib.Path): Path to configuration.
"""

self.path_to_config = path_to_config
self._validate_config_content()
self.config_dto = self._extract_config_content()

self._seed_urls = self.config_dto.seed_urls
self._num_articles = self.config_dto.total_articles
self._headers = self.config_dto.headers
self._encoding = self.config_dto.encoding
self._timeout = self.config_dto.timeout
self._should_verify_certificate = self.config_dto.should_verify_certificate
self._headless_mode = self.config_dto.headless_mode

def _extract_config_content(self) -> ConfigDTO:
"""
Get config values.
@@ -27,10 +80,52 @@ def _extract_config_content(self) -> ConfigDTO:
ConfigDTO: Config values
"""

with open(self.path_to_config, 'r', encoding='utf-8') as f:
config = json.load(f)

return ConfigDTO(
config["seed_urls"],
config["total_articles_to_find_and_parse"],
config["headers"],
config["encoding"],
config["timeout"],
config["should_verify_certificate"],
config["headless_mode"],
)

def _validate_config_content(self) -> None:
"""
Ensure configuration parameters are not corrupt.
"""
with open(self.path_to_config, 'r', encoding='utf-8') as f:
config = json.load(f)

        if not isinstance(config['seed_urls'], list):
            raise IncorrectSeedURLError

        # the dot must be escaped so that "www." is matched literally
        if not all(re.match(r'https?://(www\.)?', seed_url) for seed_url in config['seed_urls']):
            raise IncorrectSeedURLError

if (not isinstance(config['total_articles_to_find_and_parse'], int) or
config['total_articles_to_find_and_parse'] <= 0):
raise IncorrectNumberOfArticlesError

if not 1 < config['total_articles_to_find_and_parse'] <= 150:
raise NumberOfArticlesOutOfRangeError

if not isinstance(config['headers'], dict):
raise IncorrectHeadersError

if not isinstance(config['encoding'], str):
raise IncorrectEncodingError

if not isinstance(config['timeout'], int) or not 0 < config['timeout'] < 60:
raise IncorrectTimeoutError

if (not isinstance(config['should_verify_certificate'], bool) or
not isinstance(config['headless_mode'], bool)):
raise IncorrectVerifyError

def get_seed_urls(self) -> list[str]:
"""
@@ -39,6 +134,7 @@ def get_seed_urls(self) -> list[str]:
Returns:
list[str]: Seed urls
"""
return self._seed_urls

def get_num_articles(self) -> int:
"""
@@ -47,6 +143,7 @@ def get_num_articles(self) -> int:
Returns:
int: Total number of articles to scrape
"""
return self._num_articles

def get_headers(self) -> dict[str, str]:
"""
@@ -55,6 +152,7 @@ def get_headers(self) -> dict[str, str]:
Returns:
dict[str, str]: Headers
"""
return self._headers

def get_encoding(self) -> str:
"""
@@ -63,6 +161,7 @@ def get_encoding(self) -> str:
Returns:
str: Encoding
"""
return self._encoding

def get_timeout(self) -> int:
"""
@@ -71,6 +170,7 @@ def get_timeout(self) -> int:
Returns:
int: Number of seconds to wait for response
"""
return self._timeout

def get_verify_certificate(self) -> bool:
"""
@@ -79,6 +179,7 @@ def get_verify_certificate(self) -> bool:
Returns:
bool: Whether to verify certificate or not
"""
return self._should_verify_certificate

def get_headless_mode(self) -> bool:
"""
@@ -87,6 +188,7 @@ def get_headless_mode(self) -> bool:
Returns:
bool: Whether to use headless mode or not
"""
return self._headless_mode


def make_request(url: str, config: Config) -> requests.models.Response:
@@ -100,6 +202,13 @@ def make_request(url: str, config: Config) -> requests.models.Response:
Returns:
requests.models.Response: A response from a request
"""
periods = random.randrange(3)
time.sleep(periods)

response = requests.get(url=url, headers=config.get_headers(),
timeout=config.get_timeout(),
verify=config.get_verify_certificate())
return response


class Crawler:
@@ -116,6 +225,9 @@ def __init__(self, config: Config) -> None:
Args:
config (Config): Configuration
"""
self.urls = []
self.config = config
self.url_pattern = 'https://www.comnews.ru/'

def _extract_url(self, article_bs: BeautifulSoup) -> str:
"""
@@ -128,10 +240,37 @@ def _extract_url(self, article_bs: BeautifulSoup) -> str:
str: Url from HTML
"""

        links = article_bs.find(name='div', class_='region region-content')
        for link in links.find_all('a'):
            href = link.get('href')
            # guard against <a> tags without an href attribute
            if href and href.startswith("/content/"):
                url = self.url_pattern + href
                if url not in self.urls:
                    self.urls.append(url)
                    return url
        return ''

def find_articles(self) -> None:
"""
Find articles.
"""
seed_urls = self.get_search_urls()
n_len = self.config.get_num_articles()

while len(self.urls) < n_len:

for seed_url in seed_urls:
response = make_request(seed_url, self.config)
if not response.ok:
continue

soup = BeautifulSoup(response.text, features='html.parser')

                # _extract_url already records the url in self.urls,
                # so it must not be appended here a second time
                new_url = self._extract_url(soup)
                if not new_url:
                    continue
                if len(self.urls) >= n_len:
                    break

if len(self.urls) >= n_len:
break

def get_search_urls(self) -> list:
"""
@@ -140,6 +279,7 @@ def get_search_urls(self) -> list:
Returns:
list: seed_urls param
"""
return self.config.get_seed_urls()


# 10
@@ -160,6 +300,10 @@ def __init__(self, full_url: str, article_id: int, config: Config) -> None:
article_id (int): Article id
config (Config): Configuration
"""
self.full_url = full_url
self.article_id = article_id
self.config = config
self.article = Article(self.full_url, self.article_id)

def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
"""
@@ -168,6 +312,11 @@ def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
texts = []
text_paragraphs = article_soup.find_all(class_="field field-text full-html field-name-body")
for paragraph in text_paragraphs:
texts.append(paragraph.text)
self.article.text = ''.join(texts)

def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
"""
@@ -176,6 +325,28 @@ def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
self.article.title = article_soup.find('h1').text

date_str = article_soup.find(class_='field field-text field-name-date')
if date_str:
self.article.date = self.unify_date_format(date_str.text)

topics = article_soup.find_all(class_='tags')
if topics:
for topic in topics:
tag = topic.find('a').text
self.article.topics.append(tag)
else:
self.article.topics.append("NOT FOUND")

self.article.author = []
authors = article_soup.find_all(class_="field field-text field-multiple person field-name-authors")
if authors:
for author in authors:
tmp = author.find('span').text.split(' ')[-1]
self.article.author.append(tmp)
else:
self.article.author.append("NOT FOUND")

def unify_date_format(self, date_str: str) -> datetime.datetime:
"""
@@ -187,6 +358,7 @@ def unify_date_format(self, date_str: str) -> datetime.datetime:
Returns:
datetime.datetime: Datetime object
"""
return datetime.datetime.strptime(date_str, '%d.%m.%Y')

def parse(self) -> Union[Article, bool, list]:
"""
Expand All @@ -195,6 +367,13 @@ def parse(self) -> Union[Article, bool, list]:
Returns:
Union[Article, bool, list]: Article instance
"""
response = make_request(self.full_url, self.config)
if response.ok:
article_bs = BeautifulSoup(response.text, 'html.parser')
self._fill_article_with_text(article_bs)
self._fill_article_with_meta_information(article_bs)

return self.article


def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
@@ -204,12 +383,27 @@ def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
Args:
base_path (Union[pathlib.Path, str]): Path where articles stores
"""
if not base_path.exists():
base_path.mkdir(parents=True, exist_ok=True)
else:
for file in base_path.iterdir():
file.unlink()


def main() -> None:
"""
Entrypoint for scrapper module.
"""
conf = Config(CRAWLER_CONFIG_PATH)
crawler = Crawler(conf)
crawler.find_articles()
prepare_environment(ASSETS_PATH)

for i, url in enumerate(crawler.urls, 1):
parser = HTMLParser(url, i, conf)
article = parser.parse()
to_raw(article)
to_meta(article)


if __name__ == "__main__":
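For review, a minimal sketch of how a single article page could be fetched and parsed with the classes added in this file; it is only an illustration, not part of the diff. The article URL and id below are hypothetical placeholders, and the snippet assumes it is run from the repository root so that core_utils and lab_5_scrapper are importable as packages.

from core_utils.constants import CRAWLER_CONFIG_PATH
from lab_5_scrapper.scrapper import Config, HTMLParser

config = Config(CRAWLER_CONFIG_PATH)

# Hypothetical article URL and id, used only for illustration
sample_url = 'https://www.comnews.ru/content/000000/sample-article'
parser = HTMLParser(full_url=sample_url, article_id=1, config=config)

article = parser.parse()
print(article.title)       # headline taken from the <h1> tag
print(article.text[:200])  # first characters of the article body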
24 changes: 18 additions & 6 deletions lab_5_scrapper/scrapper_config.json
@@ -1,9 +1,21 @@
 {
-  "seed_urls": [],
-  "headers": {},
-  "total_articles_to_find_and_parse": 0,
-  "encoding": "",
-  "timeout": 0,
+  "seed_urls": [
+    "https://www.comnews.ru/news?page=1",
+    "https://www.comnews.ru/news?page=2",
+    "https://www.comnews.ru/news?page=3",
+    "https://www.comnews.ru/news?page=4"
+  ],
+  "headers": {
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "Cookie": "_ym_uid=171221515114257002; _ym_d=1712215151; _gid=GA1.2.228582562.1713528706; _ym_isad=2; _ym_visorc=w; _ga_FCML9SH5CC=GS1.1.1713528709.18.1.1713528921.59.0.0; _ga=GA1.1.1446361375.1712215151; _ga_3HVRJM3KS4=GS1.1.1713528713.18.1.1713528922.58.0.0",
+    "Accept-Language": "ru,en;q=0.9",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 YaBrowser/24.1.0.0 Safari/537.36",
+    "Sec-Fetch-Dest": "document",
+    "Cache-Control": "max-age=0"
+  },
+  "total_articles_to_find_and_parse": 10,
+  "encoding": "utf-8",
+  "timeout": 10,
   "should_verify_certificate": true,
   "headless_mode": true
-}
+}
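As a quick check of the updated configuration, a minimal sketch (again, not part of the diff) that loads it through the Config class from scrapper.py, which also runs the validation shown above; it assumes CRAWLER_CONFIG_PATH points at this scrapper_config.json.

from core_utils.constants import CRAWLER_CONFIG_PATH
from lab_5_scrapper.scrapper import Config

# Validation runs inside Config.__init__ via _validate_config_content()
config = Config(CRAWLER_CONFIG_PATH)

print(config.get_seed_urls())           # four comnews.ru news listing pages
print(config.get_num_articles())        # 10
print(config.get_timeout())             # 10
print(config.get_verify_certificate())  # True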
2 changes: 1 addition & 1 deletion lab_5_scrapper/settings.json
@@ -1,3 +1,3 @@
 {
-  "target_score": 0
+  "target_score": 6
 }