Skip to content

Commit

Permalink
* refactor(product_details.py): replace handle_popups function with setup_page_handlers function
Browse files Browse the repository at this point in the history

* refactor(product_urls.py): add print statements for scraping and waiting for page load
* refactor(page_handler.py): add print statements for setting up page handlers and waiting for page load
* refactor(page_handler.py): add route blocking for specific endpoints and set extra http headers
* refactor(popup_handler.py): add print statement for handling popups
  • Loading branch information
DanielWTE committed Jan 1, 2025
1 parent 2a6d0bd commit aed8fdb
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 14 deletions.
4 changes: 2 additions & 2 deletions scraper/product_details.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import glob
from typing import List, Dict
from utils.browser_config import get_browser_context
from utils.popup_handler import handle_popups
from utils.page_handler import setup_page_handlers
from utils.validator import validate_url
from utils.captcha_monitor import with_captcha_check, monitor_for_captcha, handle_captcha_interaction, CaptchaDetected

Expand Down Expand Up @@ -35,7 +35,6 @@ def process_image_url(image_url: str) -> str:
def navigate_to_product(page, url: str, delay: int = 2):
"""Navigate to a product page with captcha checking"""
page.goto(url)
handle_popups(page)
time.sleep(delay)

@with_captcha_check
Expand Down Expand Up @@ -146,6 +145,7 @@ def extract_product_details():

try:
page = context.new_page()
setup_page_handlers(page)

# Initial navigation
navigate_to_product(page, initial_url)
Expand Down
7 changes: 5 additions & 2 deletions scraper/product_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def click_next_page(page):
next_button = page.locator('.sui-pagination__next')
if next_button.is_visible(timeout=5000):
next_button.click(timeout=5000)
time.sleep(2)
print("[INFO] Navigating to next page...")
return True
except Exception as e:
click.secho(f"\nError navigating to next page: {str(e)}", fg="yellow")
Expand Down Expand Up @@ -84,6 +84,7 @@ def collect_product_urls():

with click.progressbar(range(1, total_pages + 1), label='Scraping pages') as page_numbers:
for page_num in page_numbers:
print(f"\nScraping page {page_num} of {total_pages}...")
try:
# Scrape URLs from current page
page_urls = scrape_category_page(page, current_domain)
Expand All @@ -102,7 +103,9 @@ def collect_product_urls():
if not click_next_page(page):
click.secho("\nFailed to navigate to next page. Stopping scraper.", fg="red")
break
page.wait_for_load_state('networkidle')
print("[INFO] Waiting for page to load...")
page.mouse.wheel(0, 1500)
time.sleep(3)

except CaptchaDetected:
# Captcha was detected but not resolved
Expand Down
30 changes: 25 additions & 5 deletions utils/page_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from datetime import datetime

def setup_page_handlers(page: Page):
print("[INFO] Setting up page handlers...")
try:
os.makedirs("logs", exist_ok=True)
log_file = "logs/risk_requests.log"
Expand All @@ -14,7 +15,7 @@ def log_to_file(message):
f.write(f"[{timestamp}] {message}\n")
except Exception as e:
print(f"Error writing to log file: {e}")

def handle_response(response):
if "risk" in response.url:
try:
Expand All @@ -30,17 +31,35 @@ def handle_response(response):
log_to_file(log_message)
except Exception as e:
log_to_file(f"Error logging response: {e}")

page.on("request", lambda request: log_to_file(f">> {request.method} {request.url} Headers: {request.headers} Post Data: {request.post_data}") if "risk" in request.url else None)
page.on("response", handle_response)

blocked_endpoints = [
'**/api/coupon/**',
'**/api/discount/**',
'**/api/promotion/**',
'**/api/popup/**',
'**/api/dialog/**',
'**/she_dist/images/coupon_package_v2/**',
'**/she_dist/assets/coupon-module**',
'**/coupon-dialog/**',
'**sheinsz.ltwebstatic.com/she_dist/libs/risk_libs/fm.us**',
'**pinterest.com/**',
'**/pixel?**',
]


for endpoint in blocked_endpoints:
page.route(endpoint, lambda route, request: route.abort())

page.set_extra_http_headers({
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
"accept-language": "en-US,en;q=0.9",
"accept-language": "en-US,en;q=0.9",
"sec-ch-ua": '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-mobile": "?0",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1"
Expand All @@ -50,5 +69,6 @@ def handle_response(response):
{"name": "sessionID_shein", "value": "", "domain": ".shein.com", "path": "/"},
]
page.context.add_cookies(cookies)

except Exception as e:
print(f"Logging error", e)
10 changes: 5 additions & 5 deletions utils/popup_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from playwright.sync_api import Page, TimeoutError

def handle_popups(page: Page):
print('\n[INFO] Handling popups...')

def try_click(selector_info):
selector, timeout = selector_info
try:
Expand All @@ -17,15 +19,13 @@ def try_click(selector_info):
('[data-sheinprivacysign5464114245="sign"]', 200),
('text="Lehnen Sie alles ab"', 200),
('[aria-label="schließen"]', 200),
('.dialog-header-v2__close-btn', 200),
('.coupon-dialog__coupon-content .dialog-header-v2__close-btn', 200),
('svg.btn-new', 200),
('[data-v-7833d02c].dialog-header-v2__close-btn', 200),
('[aria-label="Close"]', 200),
('#onetrust-reject-all-handler', 100),
]

for selector, timeout in popup_selectors:
if try_click((selector, timeout)):
time.sleep(0.5)
time.sleep(1)

try:
if page.locator('.coupon-dialog__coupon-content').is_visible():
Expand Down

0 comments on commit aed8fdb

Please sign in to comment.