Scrapper, Maxim Glukhankov - 22FPL1 #100

Open · wants to merge 15 commits into main

157 changes: 156 additions & 1 deletion lab_5_scrapper/scrapper.py
@@ -1,10 +1,66 @@
"""
Crawler implementation.
"""
# pylint: disable=too-many-arguments, too-many-instance-attributes, unused-import, undefined-variable
import datetime
import json
import pathlib
import re
import shutil
from typing import Pattern, Union

import requests
from bs4 import BeautifulSoup

from core_utils.article.article import Article
from core_utils.article.io import to_raw
from core_utils.config_dto import ConfigDTO
from core_utils.constants import ASSETS_PATH, CRAWLER_CONFIG_PATH
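# core_utils is the shared helper package of this course repository: it
# provides the Article model, raw-text I/O (to_raw) and common path constants.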


class IncorrectSeedURLError(Exception):
"""
The seed-url is not appropriate.
"""


class NumberOfArticlesOutOfRangeError(Exception):
"""
Total number of articles is out of range from 1 to 150.
"""


class IncorrectNumberOfArticlesError(Exception):
"""
Total number of articles to parse is not integer.
"""


class IncorrectHeadersError(Exception):
"""
Headers are not in a form of dictionary.
"""


class IncorrectEncodingError(Exception):
"""
Encoding must be specified as a string.
"""


class IncorrectTimeoutError(Exception):
"""
Timeout value must be a positive integer less than 60.
"""



class IncorrectVerifyError(Exception):
"""
Verify certificate value must either be True or False.
"""



class Config:
"""
@@ -18,6 +74,16 @@ def __init__(self, path_to_config: pathlib.Path) -> None:
Args:
path_to_config (pathlib.Path): Path to configuration.
"""
self.path_to_config = path_to_config
self.config = self._extract_config_content()
self._validate_config_content()
self._seed_urls = self.config.seed_urls
self._headers = self.config.headers
self._num_articles = self.config.total_articles
self._encoding = self.config.encoding
self._timeout = self.config.timeout
self._should_verify_certificate = self.config.should_verify_certificate
self._headless_mode = self.config.headless_mode

def _extract_config_content(self) -> ConfigDTO:
"""
@@ -26,11 +92,44 @@ def _extract_config_content(self) -> ConfigDTO:
Returns:
ConfigDTO: Config values
"""
        with open(self.path_to_config, encoding='utf-8') as file:
            config = json.load(file)
        return ConfigDTO(seed_urls=config['seed_urls'],
                         headers=config['headers'],
                         total_articles_to_find_and_parse=config['total_articles_to_find_and_parse'],
                         encoding=config['encoding'],
                         timeout=config['timeout'],
                         should_verify_certificate=config['should_verify_certificate'],
                         headless_mode=config['headless_mode'])

def _validate_config_content(self) -> None:
"""
Ensure configuration parameters are not corrupt.
"""
        if not isinstance(self.config.seed_urls, list):
            raise IncorrectSeedURLError('seed URLs must be a list')
        for seed_url in self.config.seed_urls:
            if not re.match(r'https?://(www\.)?', seed_url):
                raise IncorrectSeedURLError(
                    "seed URL does not match standard pattern 'https?://(www.)?'")

        if not isinstance(self.config.total_articles, int) or \
                self.config.total_articles <= 0:
            raise IncorrectNumberOfArticlesError(
                'total number of articles to parse must be a positive integer')

if not 1 <= self.config.total_articles <= 150:
raise NumberOfArticlesOutOfRangeError('total number of articles is out of range')

if not isinstance(self.config.headers, dict):
raise IncorrectHeadersError('headers are not in a form of dictionary')

if not isinstance(self.config.encoding, str):
raise IncorrectEncodingError('encoding must be specified as a string')

if not isinstance(self.config.timeout, int) or not 0 < self.config.timeout < 60:
raise IncorrectTimeoutError('timeout value must be a positive integer less than 60')

        if not isinstance(self.config.should_verify_certificate, bool) or \
                not isinstance(self.config.headless_mode, bool):
            raise IncorrectVerifyError('verify certificate value must be either True or False')

def get_seed_urls(self) -> list[str]:
"""
@@ -39,6 +138,7 @@ def get_seed_urls(self) -> list[str]:
Returns:
list[str]: Seed urls
"""
return self._seed_urls

def get_num_articles(self) -> int:
"""
@@ -47,6 +147,7 @@ def get_num_articles(self) -> int:
Returns:
int: Total number of articles to scrape
"""
return self._num_articles

def get_headers(self) -> dict[str, str]:
"""
@@ -55,6 +156,7 @@ def get_headers(self) -> dict[str, str]:
Returns:
dict[str, str]: Headers
"""
return self._headers

def get_encoding(self) -> str:
"""
@@ -63,6 +165,7 @@ def get_encoding(self) -> str:
Returns:
str: Encoding
"""
return self._encoding

def get_timeout(self) -> int:
"""
@@ -71,6 +174,7 @@ def get_timeout(self) -> int:
Returns:
int: Number of seconds to wait for response
"""
return self._timeout

def get_verify_certificate(self) -> bool:
"""
@@ -79,6 +183,7 @@ def get_verify_certificate(self) -> bool:
Returns:
bool: Whether to verify certificate or not
"""
return self._should_verify_certificate

def get_headless_mode(self) -> bool:
"""
@@ -87,6 +192,7 @@ def get_headless_mode(self) -> bool:
Returns:
bool: Whether to use headless mode or not
"""
return self._headless_mode


def make_request(url: str, config: Config) -> requests.models.Response:
@@ -100,6 +206,10 @@ def make_request(url: str, config: Config) -> requests.models.Response:
Returns:
requests.models.Response: A response from a request
"""
return requests.get(url=url,
headers=config.get_headers(),
timeout=config.get_timeout(),
verify=config.get_verify_certificate())


class Crawler:
@@ -116,6 +226,9 @@ def __init__(self, config: Config) -> None:
Args:
config (Config): Configuration
"""
self.config = config
self.urls = []
self.url_pattern = 'https://volga.news'
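        # Article hrefs on listing pages are site-relative, so this
        # scheme-and-host prefix is prepended when building full URLs.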

def _extract_url(self, article_bs: BeautifulSoup) -> str:
"""
@@ -127,11 +240,23 @@ def _extract_url(self, article_bs: BeautifulSoup) -> str:
Returns:
str: Url from HTML
"""
        # Return the first article link on the page that has not been collected yet.
        for div in article_bs.find_all('div', class_='b-news-item__title'):
            url = self.url_pattern + div.find('a').get('href')
            if url not in self.urls:
                return url
        return ''

def find_articles(self) -> None:
"""
Find articles.
"""
        while len(self.urls) < self.config.get_num_articles():
            found_new = False
            for seed in self.get_search_urls():
                if len(self.urls) >= self.config.get_num_articles():
                    break
                response = make_request(seed, self.config)
                if not response.ok:
                    continue
                article_bs = BeautifulSoup(response.text, 'html.parser')
                url = self._extract_url(article_bs)
                if url:
                    self.urls.append(url)
                    found_new = True
            if not found_new:
                # The seed pages offered nothing new; stop instead of looping forever.
                break

def get_search_urls(self) -> list:
"""
@@ -140,6 +265,7 @@ def get_search_urls(self) -> list:
Returns:
list: seed_urls param
"""
return self.config.get_seed_urls()


# 10
@@ -160,6 +286,10 @@ def __init__(self, full_url: str, article_id: int, config: Config) -> None:
article_id (int): Article id
config (Config): Configuration
"""
self.full_url = full_url
self.article_id = article_id
self.config = config
self.article = Article(self.full_url, self.article_id)
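        # Article is the core_utils container that main() later hands
        # to to_raw() to write the collected text under ASSETS_PATH.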

def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
"""
@@ -168,6 +298,15 @@ def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
        article_text = []
        intro_block = article_soup.find('p', class_='b-article__intro')
        if intro_block:  # some articles may lack an intro paragraph
            article_text.append(intro_block.text.strip())
        text_blocks = article_soup.find_all('div', class_='b-article__text js-mediator-article')
        for el in text_blocks:
            article_text.append(el.text.strip())
        # Join with spaces so block boundaries do not glue words together,
        # then collapse whitespace runs.
        text = re.sub(r'\s+', ' ', ' '.join(article_text))
        self.article.text = text

def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
"""
@@ -195,6 +334,10 @@ def parse(self) -> Union[Article, bool, list]:
Returns:
Union[Article, bool, list]: Article instance
"""
        response = make_request(self.full_url, self.config)
        if response.ok:
            article_bs = BeautifulSoup(response.text, 'html.parser')
            self._fill_article_with_text(article_bs)
        return self.article


def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
@@ -204,13 +347,25 @@ def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
Args:
base_path (Union[pathlib.Path, str]): Path where articles stores
"""
        if base_path.exists():
            # Remove only the assets directory itself, not its parent,
            # so sibling files in the parent directory survive.
            shutil.rmtree(base_path)
        base_path.mkdir(parents=True)


def main() -> None:
"""
Entrypoint for scrapper module.
"""
configuration = Config(path_to_config=CRAWLER_CONFIG_PATH)
prepare_environment(ASSETS_PATH)
crawler = Crawler(config=configuration)
crawler.find_articles()
for ind, full_url in enumerate(crawler.urls):
parser = HTMLParser(full_url=full_url, article_id=ind + 1, config=configuration)
article = parser.parse()
if isinstance(article, Article):
to_raw(article)


if __name__ == "__main__":
    main()
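
A minimal sketch of exercising the Config validation on its own (assuming the module is importable as lab_5_scrapper.scrapper; bad_config.json is a hypothetical file with the same schema as scrapper_config.json but a seed URL lacking an http(s) scheme):

import pathlib

from lab_5_scrapper.scrapper import Config, IncorrectSeedURLError

try:
    # Construction runs _extract_config_content() and _validate_config_content(),
    # so a malformed seed URL is rejected before any crawling starts.
    Config(path_to_config=pathlib.Path("bad_config.json"))
except IncorrectSeedURLError as error:
    print(f"configuration rejected: {error}")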
18 changes: 13 additions & 5 deletions lab_5_scrapper/scrapper_config.json
@@ -1,9 +1,17 @@
{
"seed_urls": [],
"headers": {},
"total_articles_to_find_and_parse": 0,
"encoding": "",
"timeout": 0,
"seed_urls": ["https://volga.news/politics",
"https://volga.news/economics",
"https://volga.news/society"
],
"headers": {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
"Cookie": "stg_returning_visitor=Mon%2C%2008%20Apr%202024%2009:27:59%20GMT; stg_traffic_source_priority=1; _ga_KLCW8G3CY6=GS1.1.1727293350.1.0.1727293350.0.0.0; _ga=GA1.1.1990264309.1727293351; stg_last_interaction=Wed%2C%2025%20Sep%202024%2019:42:38%20GMT"
},
"total_articles_to_find_and_parse": 5,
"encoding": "utf-8",
"timeout": 15,
"should_verify_certificate": true,
"headless_mode": true
}
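
For reference, a short sketch of how this file flows through the Config getters above (values as configured here; assumes the module is importable as lab_5_scrapper.scrapper):

from core_utils.constants import CRAWLER_CONFIG_PATH
from lab_5_scrapper.scrapper import Config

config = Config(path_to_config=CRAWLER_CONFIG_PATH)
print(config.get_num_articles())        # 5
print(config.get_timeout())             # 15
print(config.get_verify_certificate())  # True
print(len(config.get_seed_urls()))      # 3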
2 changes: 1 addition & 1 deletion lab_5_scrapper/settings.json
@@ -1,3 +1,3 @@
{
"target_score": 0
"target_score": 4
}