Skip to content

Commit

Permalink
* refactor(product_details.py): replace handle_popups function with setup_page_handlers function
Browse files Browse the repository at this point in the history

* refactor(product_urls.py): add print statements for scraping and waiting for page load
* refactor(page_handler.py): add print statements for setting up page handlers and waiting for page load
* refactor(page_handler.py): add route blocking for specific endpoints and set extra http headers
* refactor(popup_handler.py): add print statement for handling popups
  • Loading branch information
DanielWTE committed Jan 1, 2025
1 parent 2a6d0bd commit aed8fdb
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 14 deletions.
4 changes: 2 additions & 2 deletions scraper/product_details.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import glob
from typing import List, Dict
from utils.browser_config import get_browser_context
from utils.popup_handler import handle_popups
from utils.page_handler import setup_page_handlers
from utils.validator import validate_url
from utils.captcha_monitor import with_captcha_check, monitor_for_captcha, handle_captcha_interaction, CaptchaDetected

Expand Down Expand Up @@ -35,7 +35,6 @@ def process_image_url(image_url: str) -> str:
def navigate_to_product(page, url: str, delay: int = 2):
"""Navigate to a product page with captcha checking"""
page.goto(url)
handle_popups(page)
time.sleep(delay)

@with_captcha_check
Expand Down Expand Up @@ -146,6 +145,7 @@ def extract_product_details():

try:
page = context.new_page()
setup_page_handlers(page)

# Initial navigation
navigate_to_product(page, initial_url)
Expand Down
7 changes: 5 additions & 2 deletions scraper/product_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def click_next_page(page):
next_button = page.locator('.sui-pagination__next')
if next_button.is_visible(timeout=5000):
next_button.click(timeout=5000)
time.sleep(2)
print("[INFO] Navigating to next page...")
return True
except Exception as e:
click.secho(f"\nError navigating to next page: {str(e)}", fg="yellow")
Expand Down Expand Up @@ -84,6 +84,7 @@ def collect_product_urls():

with click.progressbar(range(1, total_pages + 1), label='Scraping pages') as page_numbers:
for page_num in page_numbers:
print(f"\nScraping page {page_num} of {total_pages}...")
try:
# Scrape URLs from current page
page_urls = scrape_category_page(page, current_domain)
Expand All @@ -102,7 +103,9 @@ def collect_product_urls():
if not click_next_page(page):
click.secho("\nFailed to navigate to next page. Stopping scraper.", fg="red")
break
page.wait_for_load_state('networkidle')
print("[INFO] Waiting for page to load...")
page.mouse.wheel(0, 1500)
time.sleep(3)

except CaptchaDetected:
# Captcha was detected but not resolved
Expand Down
30 changes: 25 additions & 5 deletions utils/page_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from datetime import datetime

def setup_page_handlers(page: Page):
print("[INFO] Setting up page handlers...")
try:
os.makedirs("logs", exist_ok=True)
log_file = "logs/risk_requests.log"
Expand All @@ -14,7 +15,7 @@ def log_to_file(message):
f.write(f"[{timestamp}] {message}\n")
except Exception as e:
print(f"Error writing to log file: {e}")

def handle_response(response):
if "risk" in response.url:
try:
Expand All @@ -30,17 +31,35 @@ def handle_response(response):
log_to_file(log_message)
except Exception as e:
log_to_file(f"Error logging response: {e}")

page.on("request", lambda request: log_to_file(f">> {request.method} {request.url} Headers: {request.headers} Post Data: {request.post_data}") if "risk" in request.url else None)
page.on("response", handle_response)

blocked_endpoints = [
'**/api/coupon/**',
'**/api/discount/**',
'**/api/promotion/**',
'**/api/popup/**',
'**/api/dialog/**',
'**/she_dist/images/coupon_package_v2/**',
'**/she_dist/assets/coupon-module**',
'**/coupon-dialog/**',
'**sheinsz.ltwebstatic.com/she_dist/libs/risk_libs/fm.us**',
'**pinterest.com/**',
'**/pixel?**',
]


for endpoint in blocked_endpoints:
page.route(endpoint, lambda route, request: route.abort())

page.set_extra_http_headers({
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
"accept-language": "en-US,en;q=0.9",
"accept-language": "en-US,en;q=0.9",
"sec-ch-ua": '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-mobile": "?0",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1"
Expand All @@ -50,5 +69,6 @@ def handle_response(response):
{"name": "sessionID_shein", "value": "", "domain": ".shein.com", "path": "/"},
]
page.context.add_cookies(cookies)

except Exception as e:
print(f"Logging error", e)
10 changes: 5 additions & 5 deletions utils/popup_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from playwright.sync_api import Page, TimeoutError

def handle_popups(page: Page):
print('\n[INFO] Handling popups...')

def try_click(selector_info):
selector, timeout = selector_info
try:
Expand All @@ -17,15 +19,13 @@ def try_click(selector_info):
('[data-sheinprivacysign5464114245="sign"]', 200),
('text="Lehnen Sie alles ab"', 200),
('[aria-label="schließen"]', 200),
('.dialog-header-v2__close-btn', 200),
('.coupon-dialog__coupon-content .dialog-header-v2__close-btn', 200),
('svg.btn-new', 200),
('[data-v-7833d02c].dialog-header-v2__close-btn', 200),
('[aria-label="Close"]', 200),
('#onetrust-reject-all-handler', 100),
]

for selector, timeout in popup_selectors:
if try_click((selector, timeout)):
time.sleep(0.5)
time.sleep(1)

try:
if page.locator('.coupon-dialog__coupon-content').is_visible():
Expand Down

0 comments on commit aed8fdb

Please sign in to comment.