import sys
import time
import re
import urllib.parse  # must be imported explicitly for urllib.parse.quote
from pprint import pprint

import pandas as pd
import bs4
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

print(f'Carousell Scraping V1.0 by Billy Cao\nRunning on Python {sys.version}, Selenium {selenium.__version__}, BeautifulSoup {bs4.__version__}')


def request_page(url):
    """Return a BeautifulSoup object once all result pages are loaded."""
    driver.get(url)
    page = 1
    timeout = 5
    while page < page_limit:
        try:
            # wait at most `timeout` seconds for the "Load more" button to appear
            next_page_btn = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, '//main[1]/div/button[.="Load more"]')))
            driver.execute_script("arguments[0].click();", next_page_btn)  # JS click goes through overlaying ads
            page += 1
        except TimeoutException:
            break
    time.sleep(timeout)  # give the final batch of results time to render before grabbing the page source
    print(f'All results loaded. Total: {page} pages.')
    return BeautifulSoup(driver.page_source, "html.parser")
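

# The main loop below locates the listing container by taking the class of the
# first div under main > div > div, which the TODO there notes is fragile. A
# hedged, untested sketch of a sturdier heuristic (my assumption, not part of
# the original workflow): pick the class string shared by the most direct-child
# divs, on the assumption that listing cards outnumber ad/banner divs.
def find_item_divs_class(soup):
    from collections import Counter
    children = soup.find('main').find('div').find('div').find_all('div', recursive=False)
    counts = Counter(' '.join(d.get('class', [])) for d in children)
    return counts.most_common(1)[0][0]  # most frequent class string among siblings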


def parse_info(item_div, mode=1):
    """Parse one listing div into a dict. Carousell serves two item-div formats; see the notes at the bottom."""
    a = item_div.div.find_all('a', recursive=False)
    seller_divs = a[0].find_all('div', recursive=False)[1]
    item_p = a[1].find_all('p', recursive=False)
    if mode == 1:
        return {'seller_name': seller_divs.p.get_text(),
                'seller_url': home + a[0]['href'],
                'item_name': a[1].find_all('div', recursive=False)[1].p.get_text(),
                'item_url': home + a[1]['href'],
                'time_posted': seller_divs.div.p.get_text(),  # TODO: process into absolute datetime
                'condition': item_p[1].get_text(),
                'price': re.findall(r"\d+", item_p[0].get_text().replace(',', ''))[0]}  # 0 is discounted price, 1 is original price, if applicable
    else:
        return {'seller_name': seller_divs.p.get_text(),
                'seller_url': home + a[0]['href'],
                'item_name': item_p[0].get_text(),
                'item_url': home + a[1]['href'],
                'time_posted': seller_divs.div.p.get_text(),  # TODO: process into absolute datetime
                'condition': item_p[3].get_text(),
                'price': re.findall(r"\d+", item_p[1].get_text().replace(',', ''))[0]}  # 0 is discounted price, 1 is original price, if applicable
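

# 'time_posted' comes back as a relative string. A minimal sketch of the TODO
# conversion above, assuming Carousell phrases these as "<n> <unit>s ago" (that
# exact wording is an assumption; verify against real listings before relying on it).
def to_absolute_datetime(relative, now=None):
    from datetime import datetime, timedelta
    now = now or datetime.now()
    m = re.match(r'(\d+)\s+(second|minute|hour|day|month|year)s?\s+ago', relative)
    if m is None:
        return None  # unrecognised phrasing; leave for manual handling
    n, unit = int(m.group(1)), m.group(2)
    seconds = {'second': 1, 'minute': 60, 'hour': 3600, 'day': 86400,
               'month': 30 * 86400, 'year': 365 * 86400}[unit]  # month/year approximated
    return now - timedelta(seconds=n * seconds)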


home = 'https://sg.carousell.com'
item = input('Enter item to scrape: ')
page_limit = int(input('Up to how many pages to scrape? Each page is 23-25 listings: '))
extension = f'/search/{urllib.parse.quote(item)}'

opts = Options()
opts.add_experimental_option('prefs', {'intl.accept_languages': 'en,en_US'})
driver = webdriver.Chrome(options=opts)
driver.minimize_window()
print(f'Chrome Web Driver loaded. Version: {driver.capabilities["browserVersion"]}\n')  # use "version" on Linux

parse_mode = 1  # Carousell has two formats for its item divs. See the notes at the bottom for details.
tries = 1
while tries <= 5:  # retry loop, as the position of the item-divs class is random
    try:
        print(f'Retrieving search results on {item}...')
        search_results_soup = request_page(home + extension)
        # TODO: find a concrete way to locate the correct class name; the current workaround works 99% of the time.
        item_divs_class = ' '.join(search_results_soup.find('main').find('div').find('div').find('div')['class'])  # changes randomly, but 99% of the time it is the first div
        print(f'Detected item_divs class: {item_divs_class}')
        item_divs = search_results_soup.find('main').find('div').find('div').find_all('div', class_=item_divs_class, recursive=False)  # filter out ad divs
        print(f'Found {len(item_divs)} listings. Parsing...')
        items_list = [parse_info(item_div, parse_mode) for item_div in item_divs]
        break
    except AttributeError:  # no item divs at all
        raise RuntimeError('The search has returned no result.')
    except IndexError:
        print(f'Parsing attempt {tries} failed due to a class name error using parse mode {parse_mode}. Retrying with parse mode 2...\n')
        tries += 1
        parse_mode = 2
        continue
else:
    raise RuntimeError('Parsing failed: still hitting IndexError after 5 tries.')
driver.quit()
print(f'Parsing succeeded using mode {parse_mode}! Sample item parsed:')
pprint(items_list[0])
df = pd.DataFrame(items_list)
df.to_csv(f'{item}.csv', index=False)
print(f'Results saved to {item}.csv')
input('Press enter to exit')
'''
The two parse modes differ only in the structure under the item div's 2nd <a>.

Structure of Carousell HTML FORMAT 1 (parse_mode 1):
body > main > 1st div > 1st div > divs of items
in divs of items > parent of each item
parent > 1st div > 1st a is the seller, 2nd a is the item page
in 1st a: 2nd div > p is the seller name, > div > p is the time posted
in 2nd a: 2nd div > p is the item name (truncated with ... if too long); directly under the 2nd a, the 1st p is the price, the 2nd p is the condition
parent > 2nd div > button > span is the number of likes
24 or 25 results are loaded at a time.

Structure of Carousell HTML FORMAT 2 (parse_mode 2):
body > main > 1st div > 1st div > divs of items
in divs of items > parent of each item
parent > 1st div > 1st a is the seller, 2nd a is the item page
in 1st a: 2nd div > p is the seller name, > div > p is the time posted
in 2nd a: 1st p is the FULL item name, 2nd p is the price, 3rd p is the description, 4th p is the condition
parent > 2nd div > button > span is the number of likes
24 or 25 results are loaded at a time.

body > main > div > button to view more
The "Load more" button appends to the existing results, so we can keep clicking it and then gather all items at once.
The item divs MAY NOT be the first div! The temporary workaround is to grab the class name of the correct item divs.
'''
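

# The notes above also locate a likes count (parent > 2nd div > button > span)
# that parse_info does not collect. A hedged sketch of how it could be read,
# assuming that structure holds (untested assumption, not part of the original flow):
def parse_likes(item_div):
    divs = item_div.find_all('div', recursive=False)
    return divs[1].button.span.get_text()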