Pipeline, Ekaterina Nushkina - 22FPL2 #82

Closed
wants to merge 37 commits

Commits (37)
f346f05
my first commit
eanushkina Apr 1, 2024
5efb1c4
update
Vasilisa-Blyudova Apr 7, 2024
481f87f
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 15, 2024
ee2202c
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 16, 2024
d505ab8
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 22, 2024
0913594
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 22, 2024
342c3d4
stage 1 is completed
eanushkina Apr 23, 2024
6899753
some changes
eanushkina Apr 23, 2024
e83e111
changes
eanushkina Apr 24, 2024
c76b354
some changes
eanushkina Apr 25, 2024
19f5508
some changes
eanushkina Apr 25, 2024
dcbe2a3
fix imports order
eanushkina Apr 26, 2024
6f0c829
changes
eanushkina Apr 26, 2024
5e0c110
please work
eanushkina Apr 26, 2024
fc5dab1
changes
eanushkina Apr 27, 2024
80d4013
for 4
eanushkina Apr 28, 2024
04cc91e
for 4
eanushkina Apr 28, 2024
879c84c
change
eanushkina Apr 28, 2024
7353476
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 29, 2024
6412bf8
some changes
eanushkina Apr 29, 2024
78bf9d1
changes
eanushkina Apr 29, 2024
1ac0768
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 30, 2024
f2c44c0
for 6
eanushkina May 6, 2024
375f8e1
for 6
eanushkina May 6, 2024
8e90690
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova May 8, 2024
c023cc2
for 6
eanushkina May 10, 2024
20fe3f8
for 6
eanushkina May 10, 2024
224c79f
for 8
eanushkina May 11, 2024
8e582ba
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova May 12, 2024
29dfb7e
for 8?
eanushkina May 13, 2024
68ea6fe
for 6
eanushkina May 19, 2024
977519e
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova May 20, 2024
3657e37
changes
eanushkina May 22, 2024
ff41928
changes
eanushkina May 23, 2024
0ed5baf
changes
eanushkina May 24, 2024
40f0843
fixing
eanushkina May 25, 2024
65cac7e
Update
Vasilisa-Blyudova Jun 2, 2024
194 changes: 194 additions & 0 deletions lab_5_scrapper/scrapper.py
@@ -2,9 +2,50 @@
Crawler implementation.
"""
# pylint: disable=too-many-arguments, too-many-instance-attributes, unused-import, undefined-variable
import datetime
import json
import pathlib
import random
import re
import time
from typing import Pattern, Union

import requests
from bs4 import BeautifulSoup

from core_utils.article.article import Article
from core_utils.article.io import to_meta, to_raw
from core_utils.config_dto import ConfigDTO
from core_utils.constants import ASSETS_PATH, CRAWLER_CONFIG_PATH


class IncorrectSeedURLError(Exception):
pass


class NumberOfArticlesOutOfRangeError(Exception):
pass


class IncorrectNumberOfArticlesError(Exception):
pass


class IncorrectHeadersError(Exception):
pass


class IncorrectEncodingError(Exception):
pass


class IncorrectTimeoutError(Exception):
pass


class IncorrectVerifyError(Exception):
pass


class Config:
"""
@@ -19,6 +60,18 @@ def __init__(self, path_to_config: pathlib.Path) -> None:
path_to_config (pathlib.Path): Path to configuration.
"""

self.path_to_config = path_to_config
self._validate_config_content()
self.config_dto = self._extract_config_content()

self._seed_urls = self.config_dto.seed_urls
self._num_articles = self.config_dto.total_articles
self._headers = self.config_dto.headers
self._encoding = self.config_dto.encoding
self._timeout = self.config_dto.timeout
self._should_verify_certificate = self.config_dto.should_verify_certificate
self._headless_mode = self.config_dto.headless_mode

def _extract_config_content(self) -> ConfigDTO:
"""
Get config values.
@@ -27,10 +80,52 @@ def _extract_config_content(self) -> ConfigDTO:
ConfigDTO: Config values
"""

with open(self.path_to_config, 'r', encoding='utf-8') as f:
config = json.load(f)

return ConfigDTO(
config["seed_urls"],
config["total_articles_to_find_and_parse"],
config["headers"],
config["encoding"],
config["timeout"],
config["should_verify_certificate"],
config["headless_mode"],
)

def _validate_config_content(self) -> None:
"""
Ensure configuration parameters are not corrupt.
"""
with open(self.path_to_config, 'r', encoding='utf-8') as f:
config = json.load(f)

        if not isinstance(config['seed_urls'], list):
            raise IncorrectSeedURLError

        # the dot must be escaped so that "www." is matched literally
        if not all(re.match(r'https?://(www\.)?', seed_url) for seed_url in config['seed_urls']):
            raise IncorrectSeedURLError

if (not isinstance(config['total_articles_to_find_and_parse'], int) or
config['total_articles_to_find_and_parse'] <= 0):
raise IncorrectNumberOfArticlesError

if not 1 < config['total_articles_to_find_and_parse'] <= 150:
raise NumberOfArticlesOutOfRangeError

if not isinstance(config['headers'], dict):
raise IncorrectHeadersError

if not isinstance(config['encoding'], str):
raise IncorrectEncodingError

if not isinstance(config['timeout'], int) or not 0 < config['timeout'] < 60:
raise IncorrectTimeoutError

if (not isinstance(config['should_verify_certificate'], bool) or
not isinstance(config['headless_mode'], bool)):
raise IncorrectVerifyError

def get_seed_urls(self) -> list[str]:
"""
@@ -39,6 +134,7 @@ def get_seed_urls(self) -> list[str]:
Returns:
list[str]: Seed urls
"""
return self._seed_urls

def get_num_articles(self) -> int:
"""
@@ -47,6 +143,7 @@ def get_num_articles(self) -> int:
Returns:
int: Total number of articles to scrape
"""
return self._num_articles

def get_headers(self) -> dict[str, str]:
"""
@@ -55,6 +152,7 @@ def get_headers(self) -> dict[str, str]:
Returns:
dict[str, str]: Headers
"""
return self._headers

def get_encoding(self) -> str:
"""
@@ -63,6 +161,7 @@ def get_encoding(self) -> str:
Returns:
str: Encoding
"""
return self._encoding

def get_timeout(self) -> int:
"""
@@ -71,6 +170,7 @@ def get_timeout(self) -> int:
Returns:
int: Number of seconds to wait for response
"""
return self._timeout

def get_verify_certificate(self) -> bool:
"""
@@ -79,6 +179,7 @@ def get_verify_certificate(self) -> bool:
Returns:
bool: Whether to verify certificate or not
"""
return self._should_verify_certificate

def get_headless_mode(self) -> bool:
"""
@@ -87,6 +188,7 @@ def get_headless_mode(self) -> bool:
Returns:
bool: Whether to use headless mode or not
"""
return self._headless_mode


def make_request(url: str, config: Config) -> requests.models.Response:
@@ -100,6 +202,13 @@ def make_request(url: str, config: Config) -> requests.models.Response:
Returns:
requests.models.Response: A response from a request
"""
periods = random.randrange(3)
time.sleep(periods)

response = requests.get(url=url, headers=config.get_headers(),
timeout=config.get_timeout(),
verify=config.get_verify_certificate())
return response


class Crawler:
@@ -116,6 +225,9 @@ def __init__(self, config: Config) -> None:
Args:
config (Config): Configuration
"""
self.urls = []
self.config = config
self.url_pattern = 'https://www.comnews.ru/'

def _extract_url(self, article_bs: BeautifulSoup) -> str:
"""
@@ -128,10 +240,37 @@ def _extract_url(self, article_bs: BeautifulSoup) -> str:
str: Url from HTML
"""

        links = article_bs.find(name='div', class_='region region-content')
        for link in links.find_all('a'):
            href = link.get('href')
            # guard against <a> tags without an href attribute
            if href and href.startswith("/content/"):
                url = self.url_pattern + href
                if url not in self.urls:
                    self.urls.append(url)
                    return url
        return ''

def find_articles(self) -> None:
"""
Find articles.
"""
seed_urls = self.get_search_urls()
n_len = self.config.get_num_articles()

while len(self.urls) < n_len:

for seed_url in seed_urls:
response = make_request(seed_url, self.config)
if not response.ok:
continue

soup = BeautifulSoup(response.text, features='html.parser')

                # _extract_url already records the url in self.urls,
                # so it must not be appended here a second time
                new_url = self._extract_url(soup)
                if not new_url:
                    continue
                if len(self.urls) >= n_len:
                    break

if len(self.urls) >= n_len:
break

def get_search_urls(self) -> list:
"""
@@ -140,6 +279,7 @@ def get_search_urls(self) -> list:
Returns:
list: seed_urls param
"""
return self.config.get_seed_urls()


# 10
@@ -160,6 +300,10 @@ def __init__(self, full_url: str, article_id: int, config: Config) -> None:
article_id (int): Article id
config (Config): Configuration
"""
self.full_url = full_url
self.article_id = article_id
self.config = config
self.article = Article(self.full_url, self.article_id)

def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
"""
@@ -168,6 +312,11 @@ def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
texts = []
text_paragraphs = article_soup.find_all(class_="field field-text full-html field-name-body")
for paragraph in text_paragraphs:
texts.append(paragraph.text)
self.article.text = ''.join(texts)

def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
"""
@@ -176,6 +325,28 @@ def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
self.article.title = article_soup.find('h1').text

date_str = article_soup.find(class_='field field-text field-name-date')
if date_str:
self.article.date = self.unify_date_format(date_str.text)

topics = article_soup.find_all(class_='tags')
if topics:
for topic in topics:
tag = topic.find('a').text
self.article.topics.append(tag)
else:
self.article.topics.append("NOT FOUND")

self.article.author = []
authors = article_soup.find_all(class_="field field-text field-multiple person field-name-authors")
if authors:
for author in authors:
tmp = author.find('span').text.split(' ')[-1]
self.article.author.append(tmp)
else:
self.article.author.append("NOT FOUND")

def unify_date_format(self, date_str: str) -> datetime.datetime:
"""
@@ -187,6 +358,7 @@ def unify_date_format(self, date_str: str) -> datetime.datetime:
Returns:
datetime.datetime: Datetime object
"""
return datetime.datetime.strptime(date_str, '%d.%m.%Y')

def parse(self) -> Union[Article, bool, list]:
"""
Expand All @@ -195,6 +367,13 @@ def parse(self) -> Union[Article, bool, list]:
Returns:
Union[Article, bool, list]: Article instance
"""
response = make_request(self.full_url, self.config)
if response.ok:
article_bs = BeautifulSoup(response.text, 'html.parser')
self._fill_article_with_text(article_bs)
self._fill_article_with_meta_information(article_bs)

return self.article


def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
@@ -204,12 +383,27 @@ def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
Args:
base_path (Union[pathlib.Path, str]): Path where articles stores
"""
if not base_path.exists():
base_path.mkdir(parents=True, exist_ok=True)
else:
for file in base_path.iterdir():
file.unlink()


def main() -> None:
"""
Entrypoint for scrapper module.
"""
conf = Config(CRAWLER_CONFIG_PATH)
crawler = Crawler(conf)
crawler.find_articles()
prepare_environment(ASSETS_PATH)

for i, url in enumerate(crawler.urls, 1):
parser = HTMLParser(url, i, conf)
article = parser.parse()
to_raw(article)
to_meta(article)


if __name__ == "__main__":
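For review, a minimal sketch of how a single article page could be fetched and parsed with the classes added in this file; it is only an illustration, not part of the diff. The article URL and id below are hypothetical placeholders, and the snippet assumes it is run from the repository root so that core_utils and lab_5_scrapper are importable as packages.

from core_utils.constants import CRAWLER_CONFIG_PATH
from lab_5_scrapper.scrapper import Config, HTMLParser

config = Config(CRAWLER_CONFIG_PATH)

# Hypothetical article URL and id, used only for illustration
sample_url = 'https://www.comnews.ru/content/000000/sample-article'
parser = HTMLParser(full_url=sample_url, article_id=1, config=config)

article = parser.parse()
print(article.title)       # headline taken from the <h1> tag
print(article.text[:200])  # first characters of the article body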
24 changes: 18 additions & 6 deletions lab_5_scrapper/scrapper_config.json
@@ -1,9 +1,21 @@
 {
-  "seed_urls": [],
-  "headers": {},
-  "total_articles_to_find_and_parse": 0,
-  "encoding": "",
-  "timeout": 0,
+  "seed_urls": [
+    "https://www.comnews.ru/news?page=1",
+    "https://www.comnews.ru/news?page=2",
+    "https://www.comnews.ru/news?page=3",
+    "https://www.comnews.ru/news?page=4"
+  ],
+  "headers": {
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "Cookie": "_ym_uid=171221515114257002; _ym_d=1712215151; _gid=GA1.2.228582562.1713528706; _ym_isad=2; _ym_visorc=w; _ga_FCML9SH5CC=GS1.1.1713528709.18.1.1713528921.59.0.0; _ga=GA1.1.1446361375.1712215151; _ga_3HVRJM3KS4=GS1.1.1713528713.18.1.1713528922.58.0.0",
+    "Accept-Language": "ru,en;q=0.9",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 YaBrowser/24.1.0.0 Safari/537.36",
+    "Sec-Fetch-Dest": "document",
+    "Cache-Control": "max-age=0"
+  },
+  "total_articles_to_find_and_parse": 10,
+  "encoding": "utf-8",
+  "timeout": 10,
   "should_verify_certificate": true,
   "headless_mode": true
-}
+}
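As a quick check of the updated configuration, a minimal sketch (again, not part of the diff) that loads it through the Config class from scrapper.py, which also runs the validation shown above; it assumes CRAWLER_CONFIG_PATH points at this scrapper_config.json.

from core_utils.constants import CRAWLER_CONFIG_PATH
from lab_5_scrapper.scrapper import Config

# Validation runs inside Config.__init__ via _validate_config_content()
config = Config(CRAWLER_CONFIG_PATH)

print(config.get_seed_urls())           # four comnews.ru news listing pages
print(config.get_num_articles())        # 10
print(config.get_timeout())             # 10
print(config.get_verify_certificate())  # True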
2 changes: 1 addition & 1 deletion lab_5_scrapper/settings.json
@@ -1,3 +1,3 @@
 {
-  "target_score": 0
+  "target_score": 6
 }