Skip to content

Commit

Permalink
Merge branch 'dev' of https://github.com/ES2-UFPI/WiseBuilder into or…
Browse files Browse the repository at this point in the history
…questracao-de-scraping-#8
  • Loading branch information
wesleyvitor11000 committed Mar 2, 2023
2 parents d96ad97 + 984220a commit dd12455
Show file tree
Hide file tree
Showing 5 changed files with 544 additions and 282 deletions.
Binary file added res/data/raw/components.zip
Binary file not shown.
72 changes: 72 additions & 0 deletions src/Scraper/domain/scrapers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from dataclasses import dataclass, field
from Scraper.domain.value_object import AbstractScraper
from framework.domain.value_object import URL, Money
from bs4 import BeautifulSoup
import requests as rq


@dataclass(init=False)
class KabumScraper(AbstractScraper):
    """Scraper for product listing pages on the Kabum e-commerce site.

    Extracts, for every product card on a listing page: its product-page
    link, display name, price and availability, plus the URL of the next
    listing page (if any).
    """

    # Site root, prepended to the relative hrefs found on product cards.
    raw_url: str = "https://www.kabum.com.br"
    # Template used to rebuild the query string when paginating.
    query_string: str = (
        "?page_number={page_number}&page_size=100&facet_filters=&sort=most_searched"
    )

    def get_volatile_data(
        self, url: str
    ) -> tuple[URL | None, list[tuple[URL, str, Money, int]]]:
        """Scrape one listing page.

        Args:
            url: Full URL of a Kabum listing page (including query string).

        Returns:
            A pair ``(next_page, volatile_data)`` where ``next_page`` is the
            URL string of the following listing page or ``None`` when this is
            the last page, and ``volatile_data`` is a list of
            ``(link, name, price, available)`` tuples. Unavailable products
            carry the sentinel price ``Money(-1)``.
            NOTE(review): the declared return annotation says ``URL | None``
            for the first element, but a plain ``str`` is actually returned —
            kept as-is for caller compatibility; worth confirming upstream.
        """
        headers = {
            "User-Agent": "Mozilla/5.0",
        }

        html = rq.get(url, headers=headers).content
        soup = BeautifulSoup(html, "html.parser")

        # Price and availability come from the same DOM nodes; parse them in
        # a single pass instead of selecting and re-splitting twice.
        prices: list[Money] = []
        availability: list[bool] = []
        for element in soup.select("span.sc-3b515ca1-2"):
            # Text looks like "R$\u00A01.234,56", or ends in "---" when the
            # product is unavailable.
            raw_price = element.string.split("\u00A0")[-1]
            if raw_price == "---":
                prices.append(Money(-1))  # sentinel: no price available
                availability.append(False)
            else:
                # Brazilian format: "." thousands separator, "," decimal.
                prices.append(
                    Money(float(raw_price.replace(".", "").replace(",", ".")))
                )
                availability.append(True)

        names = [
            element.string
            for element in soup.select(
                "span.sc-d99ca57-0.cpPIRA.sc-ff8a9791-16.dubjqF.nameCard"
            )
        ]

        links = [
            URL.get_URL(str(self.raw_url + element["href"]))
            for element in soup.select("a.sc-ff8a9791-10.htpbqG")
        ]

        # Fix: the original manual loop reused the name `availability` as its
        # loop variable, shadowing the list being zipped. zip() already yields
        # the tuples we want.
        volatile_data = list(zip(links, names, prices, availability))

        number_of_pages = [int(element.string) for element in soup.select("a.page")]

        # Guard: a page without a pagination widget previously raised
        # AttributeError on `.string` of None; treat it as the last page.
        active_page = soup.select_one("a.page.active")
        if active_page is None:
            return None, volatile_data

        n_next_page = int(active_page.string) + 1

        if n_next_page in number_of_pages:
            next_page = url.split("?")[0] + self.query_string.format(
                page_number=n_next_page
            )
        else:
            next_page = None

        return next_page, volatile_data
Empty file added src/Scraper/tests/__init__.py
Empty file.
62 changes: 62 additions & 0 deletions src/Scraper/tests/test_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import pytest
import requests as rq
from Scraper.domain.scrapers import KabumScraper
from framework.domain.value_object import Money, URL


@pytest.fixture()
def get_data_with_unvailable():
    """Fixture: live scrape of a Kabum SSD listing page that contains unavailable items."""
    target = (
        "https://www.kabum.com.br/hardware/ssd-2-5"
        "?page_number=8&page_size=100&facet_filters=&sort=most_searched"
    )
    return KabumScraper().get_volatile_data(target)


@pytest.mark.unit
def test_scraper_unvailable(get_data_with_unvailable):
    """Verify scraped fields for one available and one unavailable product."""
    next_page, scraped = get_data_with_unvailable

    expected_next = (
        "https://www.kabum.com.br/hardware/ssd-2-5"
        "?page_number=9&page_size=100&facet_filters=&sort=most_searched"
    )
    assert next_page == expected_next

    # First card on the page: a known available product.
    expected_available = (
        URL.get_URL(
            "https://www.kabum.com.br/produto/196886/ssd-imation-2-5-sata-iii-a320-240gb"
        ),
        "Ssd Imation 2.5 Sata Iii - A320 240gb",
        Money(485.99),
        True,
    )
    first_product = scraped[0]
    for got, expected in zip(first_product, expected_available):
        assert got == expected

    # Any product whose availability flag (last field) is False.
    expected_unavailable = (
        URL.get_URL(
            "https://www.kabum.com.br/produto/314763/ssd-hikvision-240gb-2-5-sata-3-hs-ssd-c100-240g"
        ),
        'Ssd Hikvision 240gb 2,5" Sata 3 - Hs-ssd-c100/240g',
        Money(-1),
        False,
    )
    unavailable = [entry for entry in scraped if not entry[-1]]
    for got, expected in zip(unavailable[0], expected_unavailable):
        assert got == expected


@pytest.fixture()
def get_final_page():
    """Fixture: live scrape of the last page of the Kabum SSD listing."""
    target = (
        "https://www.kabum.com.br/hardware/ssd-2-5"
        "?page_number=17&page_size=100&facet_filters=&sort=most_searched"
    )
    return KabumScraper().get_volatile_data(target)


@pytest.mark.unit
def test_final_page(get_final_page):
    """The last listing page must report no next-page URL."""
    next_url, _ = get_final_page
    assert next_url is None
Loading

0 comments on commit dd12455

Please sign in to comment.