diff --git a/scraper/product_details.py b/scraper/product_details.py index 66a6786..53b389b 100644 --- a/scraper/product_details.py +++ b/scraper/product_details.py @@ -5,7 +5,7 @@ import glob from typing import List, Dict from utils.browser_config import get_browser_context -from utils.popup_handler import handle_popups +from utils.page_handler import setup_page_handlers from utils.validator import validate_url from utils.captcha_monitor import with_captcha_check, monitor_for_captcha, handle_captcha_interaction, CaptchaDetected @@ -35,7 +35,6 @@ def process_image_url(image_url: str) -> str: def navigate_to_product(page, url: str, delay: int = 2): """Navigate to a product page with captcha checking""" page.goto(url) - handle_popups(page) time.sleep(delay) @with_captcha_check @@ -146,6 +145,7 @@ def extract_product_details(): try: page = context.new_page() + setup_page_handlers(page) # Initial navigation navigate_to_product(page, initial_url) diff --git a/scraper/product_urls.py b/scraper/product_urls.py index b9d97ba..c51e823 100644 --- a/scraper/product_urls.py +++ b/scraper/product_urls.py @@ -21,7 +21,7 @@ def click_next_page(page): next_button = page.locator('.sui-pagination__next') if next_button.is_visible(timeout=5000): next_button.click(timeout=5000) - time.sleep(2) + print("[INFO] Navigating to next page...") return True except Exception as e: click.secho(f"\nError navigating to next page: {str(e)}", fg="yellow") @@ -84,6 +84,7 @@ def collect_product_urls(): with click.progressbar(range(1, total_pages + 1), label='Scraping pages') as page_numbers: for page_num in page_numbers: + print(f"\nScraping page {page_num} of {total_pages}...") try: # Scrape URLs from current page page_urls = scrape_category_page(page, current_domain) @@ -102,7 +103,9 @@ def collect_product_urls(): if not click_next_page(page): click.secho("\nFailed to navigate to next page. Stopping scraper.", fg="red") break - page.wait_for_load_state('networkidle') + print("[INFO] Waiting for page to load...") + page.mouse.wheel(0, 1500) + time.sleep(3) except CaptchaDetected: # Captcha was detected but not resolved diff --git a/utils/page_handler.py b/utils/page_handler.py index d6d2193..1268b1f 100644 --- a/utils/page_handler.py +++ b/utils/page_handler.py @@ -3,6 +3,7 @@ from datetime import datetime def setup_page_handlers(page: Page): + print("[INFO] Setting up page handlers...") try: os.makedirs("logs", exist_ok=True) log_file = "logs/risk_requests.log" @@ -14,7 +15,7 @@ def log_to_file(message): f.write(f"[{timestamp}] {message}\n") except Exception as e: print(f"Error writing to log file: {e}") - + def handle_response(response): if "risk" in response.url: try: @@ -30,17 +31,35 @@ def handle_response(response): log_to_file(log_message) except Exception as e: log_to_file(f"Error logging response: {e}") - + page.on("request", lambda request: log_to_file(f">> {request.method} {request.url} Headers: {request.headers} Post Data: {request.post_data}") if "risk" in request.url else None) page.on("response", handle_response) + blocked_endpoints = [ + '**/api/coupon/**', + '**/api/discount/**', + '**/api/promotion/**', + '**/api/popup/**', + '**/api/dialog/**', + '**/she_dist/images/coupon_package_v2/**', + '**/she_dist/assets/coupon-module**', + '**/coupon-dialog/**', + '**sheinsz.ltwebstatic.com/she_dist/libs/risk_libs/fm.us**', + '**pinterest.com/**', + '**/pixel?**', + ] + + + for endpoint in blocked_endpoints: + page.route(endpoint, lambda route, request: route.abort()) + page.set_extra_http_headers({ "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", - "accept-language": "en-US,en;q=0.9", + "accept-language": "en-US,en;q=0.9", "sec-ch-ua": '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"', - "sec-ch-ua-mobile": "?0", + "sec-ch-ua-mobile": "?0", "sec-fetch-dest": "document", - "sec-fetch-mode": "navigate", + "sec-fetch-mode": "navigate", "sec-fetch-site": "none", "sec-fetch-user": "?1", "upgrade-insecure-requests": "1" @@ -50,5 +69,6 @@ def handle_response(response): {"name": "sessionID_shein", "value": "", "domain": ".shein.com", "path": "/"}, ] page.context.add_cookies(cookies) + except Exception as e: print(f"Logging error", e) \ No newline at end of file diff --git a/utils/popup_handler.py b/utils/popup_handler.py index 13081bd..26cfc57 100644 --- a/utils/popup_handler.py +++ b/utils/popup_handler.py @@ -2,6 +2,8 @@ from playwright.sync_api import Page, TimeoutError def handle_popups(page: Page): + print('\n[INFO] Handling popups...') + def try_click(selector_info): selector, timeout = selector_info try: @@ -17,15 +19,13 @@ def try_click(selector_info): ('[data-sheinprivacysign5464114245="sign"]', 200), ('text="Lehnen Sie alles ab"', 200), ('[aria-label="schließen"]', 200), - ('.dialog-header-v2__close-btn', 200), - ('.coupon-dialog__coupon-content .dialog-header-v2__close-btn', 200), - ('svg.btn-new', 200), - ('[data-v-7833d02c].dialog-header-v2__close-btn', 200), + ('[aria-label="Close"]', 200), + ('#onetrust-reject-all-handler', 100), ] for selector, timeout in popup_selectors: if try_click((selector, timeout)): - time.sleep(0.5) + time.sleep(1) try: if page.locator('.coupon-dialog__coupon-content').is_visible():