Scrapper, Maxim Glukhankov - 22FPL1 #100

Open · wants to merge 15 commits into main

157 changes: 156 additions & 1 deletion lab_5_scrapper/scrapper.py
@@ -1,10 +1,66 @@
"""
Crawler implementation.
"""
# pylint: disable=too-many-arguments, too-many-instance-attributes, unused-import, undefined-variable
import datetime
import json
import pathlib
import re
import shutil
from typing import Pattern, Union

import requests
from bs4 import BeautifulSoup

from core_utils.article.article import Article
from core_utils.article.io import to_raw
from core_utils.config_dto import ConfigDTO
from core_utils.constants import ASSETS_PATH, CRAWLER_CONFIG_PATH
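# core_utils is the shared helper package of this course repository: it
# provides the Article model, raw-text I/O (to_raw) and common path constants.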


class IncorrectSeedURLError(Exception):
"""
The seed-url is not appropriate.
"""


class NumberOfArticlesOutOfRangeError(Exception):
"""
Total number of articles is out of range from 1 to 150.
"""


class IncorrectNumberOfArticlesError(Exception):
"""
Total number of articles to parse is not integer.
"""


class IncorrectHeadersError(Exception):
"""
Headers are not in a form of dictionary.
"""


class IncorrectEncodingError(Exception):
"""
Encoding must be specified as a string.
"""


class IncorrectTimeoutError(Exception):
"""
Timeout value must be a positive integer less than 60.
"""



class IncorrectVerifyError(Exception):
"""
Verify certificate value must either be True or False.
"""



class Config:
"""
@@ -18,6 +74,16 @@ def __init__(self, path_to_config: pathlib.Path) -> None:
Args:
path_to_config (pathlib.Path): Path to configuration.
"""
self.path_to_config = path_to_config
self.config = self._extract_config_content()
self._validate_config_content()
self._seed_urls = self.config.seed_urls
self._headers = self.config.headers
self._num_articles = self.config.total_articles
self._encoding = self.config.encoding
self._timeout = self.config.timeout
self._should_verify_certificate = self.config.should_verify_certificate
self._headless_mode = self.config.headless_mode

def _extract_config_content(self) -> ConfigDTO:
"""
@@ -26,11 +92,44 @@ def _extract_config_content(self) -> ConfigDTO:
Returns:
ConfigDTO: Config values
"""
        with open(self.path_to_config, encoding='utf-8') as file:
            config = json.load(file)
        return ConfigDTO(seed_urls=config['seed_urls'],
                         headers=config['headers'],
                         total_articles_to_find_and_parse=config['total_articles_to_find_and_parse'],
                         encoding=config['encoding'],
                         timeout=config['timeout'],
                         should_verify_certificate=config['should_verify_certificate'],
                         headless_mode=config['headless_mode'])

def _validate_config_content(self) -> None:
"""
Ensure configuration parameters are not corrupt.
"""
        if not isinstance(self.config.seed_urls, list):
            raise IncorrectSeedURLError('seed URLs must be a list')
        for seed_url in self.config.seed_urls:
            if not re.match(r'https?://(www\.)?', seed_url):
                raise IncorrectSeedURLError(
                    "seed URL does not match standard pattern 'https?://(www.)?'")

        if not isinstance(self.config.total_articles, int) or \
                self.config.total_articles <= 0:
            raise IncorrectNumberOfArticlesError(
                'total number of articles to parse must be a positive integer')

if not 1 <= self.config.total_articles <= 150:
raise NumberOfArticlesOutOfRangeError('total number of articles is out of range')

if not isinstance(self.config.headers, dict):
raise IncorrectHeadersError('headers are not in a form of dictionary')

if not isinstance(self.config.encoding, str):
raise IncorrectEncodingError('encoding must be specified as a string')

if not isinstance(self.config.timeout, int) or not 0 < self.config.timeout < 60:
raise IncorrectTimeoutError('timeout value must be a positive integer less than 60')

        if not isinstance(self.config.should_verify_certificate, bool) or \
                not isinstance(self.config.headless_mode, bool):
            raise IncorrectVerifyError('verify certificate value must be either True or False')

def get_seed_urls(self) -> list[str]:
"""
@@ -39,6 +138,7 @@ def get_seed_urls(self) -> list[str]:
Returns:
list[str]: Seed urls
"""
return self._seed_urls

def get_num_articles(self) -> int:
"""
@@ -47,6 +147,7 @@ def get_num_articles(self) -> int:
Returns:
int: Total number of articles to scrape
"""
return self._num_articles

def get_headers(self) -> dict[str, str]:
"""
@@ -55,6 +156,7 @@ def get_headers(self) -> dict[str, str]:
Returns:
dict[str, str]: Headers
"""
return self._headers

def get_encoding(self) -> str:
"""
@@ -63,6 +165,7 @@ def get_encoding(self) -> str:
Returns:
str: Encoding
"""
return self._encoding

def get_timeout(self) -> int:
"""
@@ -71,6 +174,7 @@ def get_timeout(self) -> int:
Returns:
int: Number of seconds to wait for response
"""
return self._timeout

def get_verify_certificate(self) -> bool:
"""
@@ -79,6 +183,7 @@ def get_verify_certificate(self) -> bool:
Returns:
bool: Whether to verify certificate or not
"""
return self._should_verify_certificate

def get_headless_mode(self) -> bool:
"""
@@ -87,6 +192,7 @@ def get_headless_mode(self) -> bool:
Returns:
bool: Whether to use headless mode or not
"""
return self._headless_mode


def make_request(url: str, config: Config) -> requests.models.Response:
@@ -100,6 +206,10 @@ def make_request(url: str, config: Config) -> requests.models.Response:
Returns:
requests.models.Response: A response from a request
"""
return requests.get(url=url,
headers=config.get_headers(),
timeout=config.get_timeout(),
verify=config.get_verify_certificate())


class Crawler:
@@ -116,6 +226,9 @@ def __init__(self, config: Config) -> None:
Args:
config (Config): Configuration
"""
self.config = config
self.urls = []
self.url_pattern = 'https://volga.news'
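        # Article hrefs on listing pages are site-relative, so this
        # scheme-and-host prefix is prepended when building full URLs.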

def _extract_url(self, article_bs: BeautifulSoup) -> str:
"""
@@ -127,11 +240,23 @@ def _extract_url(self, article_bs: BeautifulSoup) -> str:
Returns:
str: Url from HTML
"""
        # Return the first article link on the page that has not been collected yet.
        for div in article_bs.find_all('div', class_='b-news-item__title'):
            url = self.url_pattern + div.find('a').get('href')
            if url not in self.urls:
                return url
        return ''

def find_articles(self) -> None:
"""
Find articles.
"""
        while len(self.urls) < self.config.get_num_articles():
            found_new = False
            for seed in self.get_search_urls():
                if len(self.urls) >= self.config.get_num_articles():
                    break
                response = make_request(seed, self.config)
                if not response.ok:
                    continue
                article_bs = BeautifulSoup(response.text, 'html.parser')
                url = self._extract_url(article_bs)
                if url:
                    self.urls.append(url)
                    found_new = True
            if not found_new:
                # The seed pages offered nothing new; stop instead of looping forever.
                break

def get_search_urls(self) -> list:
"""
@@ -140,6 +265,7 @@ def get_search_urls(self) -> list:
Returns:
list: seed_urls param
"""
return self.config.get_seed_urls()


# 10
@@ -160,6 +286,10 @@ def __init__(self, full_url: str, article_id: int, config: Config) -> None:
article_id (int): Article id
config (Config): Configuration
"""
self.full_url = full_url
self.article_id = article_id
self.config = config
self.article = Article(self.full_url, self.article_id)
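        # Article is the core_utils container that main() later hands
        # to to_raw() to write the collected text under ASSETS_PATH.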

def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
"""
@@ -168,6 +298,15 @@ def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
        article_text = []
        intro_block = article_soup.find('p', class_='b-article__intro')
        if intro_block:  # some articles may lack an intro paragraph
            article_text.append(intro_block.text.strip())
        text_blocks = article_soup.find_all('div', class_='b-article__text js-mediator-article')
        for el in text_blocks:
            article_text.append(el.text.strip())
        # Join with spaces so block boundaries do not glue words together,
        # then collapse whitespace runs.
        text = re.sub(r'\s+', ' ', ' '.join(article_text))
        self.article.text = text

def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
"""
@@ -195,6 +334,10 @@ def parse(self) -> Union[Article, bool, list]:
Returns:
Union[Article, bool, list]: Article instance
"""
        response = make_request(self.full_url, self.config)
        if response.ok:
            article_bs = BeautifulSoup(response.text, 'html.parser')
            self._fill_article_with_text(article_bs)
        return self.article


def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
@@ -204,13 +347,25 @@ def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
Args:
base_path (Union[pathlib.Path, str]): Path where articles stores
"""
        if base_path.exists():
            # Remove only the assets directory itself, not its parent,
            # so sibling files in the parent directory survive.
            shutil.rmtree(base_path)
        base_path.mkdir(parents=True)


def main() -> None:
"""
Entrypoint for scrapper module.
"""
configuration = Config(path_to_config=CRAWLER_CONFIG_PATH)
prepare_environment(ASSETS_PATH)
crawler = Crawler(config=configuration)
crawler.find_articles()
for ind, full_url in enumerate(crawler.urls):
parser = HTMLParser(full_url=full_url, article_id=ind + 1, config=configuration)
article = parser.parse()
if isinstance(article, Article):
to_raw(article)


if __name__ == "__main__":
    main()
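
A minimal sketch of exercising the Config validation on its own (assuming the module is importable as lab_5_scrapper.scrapper; bad_config.json is a hypothetical file with the same schema as scrapper_config.json but a seed URL lacking an http(s) scheme):

import pathlib

from lab_5_scrapper.scrapper import Config, IncorrectSeedURLError

try:
    # Construction runs _extract_config_content() and _validate_config_content(),
    # so a malformed seed URL is rejected before any crawling starts.
    Config(path_to_config=pathlib.Path("bad_config.json"))
except IncorrectSeedURLError as error:
    print(f"configuration rejected: {error}")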
18 changes: 13 additions & 5 deletions lab_5_scrapper/scrapper_config.json
@@ -1,9 +1,17 @@
{
"seed_urls": [],
"headers": {},
"total_articles_to_find_and_parse": 0,
"encoding": "",
"timeout": 0,
"seed_urls": ["https://volga.news/politics",
"https://volga.news/economics",
"https://volga.news/society"
],
"headers": {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
"Cookie": "stg_returning_visitor=Mon%2C%2008%20Apr%202024%2009:27:59%20GMT; stg_traffic_source_priority=1; _ga_KLCW8G3CY6=GS1.1.1727293350.1.0.1727293350.0.0.0; _ga=GA1.1.1990264309.1727293351; stg_last_interaction=Wed%2C%2025%20Sep%202024%2019:42:38%20GMT"
},
"total_articles_to_find_and_parse": 5,
"encoding": "utf-8",
"timeout": 15,
"should_verify_certificate": true,
"headless_mode": true
}
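
For reference, a short sketch of how this file flows through the Config getters above (values as configured here; assumes the module is importable as lab_5_scrapper.scrapper):

from core_utils.constants import CRAWLER_CONFIG_PATH
from lab_5_scrapper.scrapper import Config

config = Config(path_to_config=CRAWLER_CONFIG_PATH)
print(config.get_num_articles())        # 5
print(config.get_timeout())             # 15
print(config.get_verify_certificate())  # True
print(len(config.get_seed_urls()))      # 3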
2 changes: 1 addition & 1 deletion lab_5_scrapper/settings.json
@@ -1,3 +1,3 @@
{
"target_score": 0
"target_score": 4
}