# supreme_scraper.py
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup as soup

debug = 0
url = "https://www.supremenewyork.com/shop"
url2 = "https://www.supremenewyork.com/shop/all"
#RELEASE_DATE = datetime.datetime.now().strftime("%m/%d/%Y")


# Site Specific
def get_page(query=""):
    client = urlopen(url + query)
    html_page = client.read()
    client.close()
    return soup(html_page, "html.parser")

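# Hedged variant (assumption, not part of the original flow): some sites reject
# urllib's default User-Agent, which may be why Request is imported above. This
# sketch sends a browser-like header; the header value is illustrative only.
def get_page_with_headers(query=""):
    req = Request(url + query, headers={"User-Agent": "Mozilla/5.0"})
    with urlopen(req) as client:
        html_page = client.read()
    return soup(html_page, "html.parser")
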
# The release flag determines whether to fetch every item or only new releases:
# if release == 1, only items carrying a tag (span element) are returned.
# Parsing runs on a single thread, so this may be slower than a multithreaded approach.
def get_assortment(parsed_page, release=0):
    shop_list = parsed_page.find("ul", {"id": "shop-scroller"})
    items = []
    for i in shop_list.find_all('li'):
        if release == 1 and get_tag(i) is not None:
            items.append(i)
        elif release == 0:
            items.append(i)
    return items

def get_query(item):
    return "/" + item.a['href'].split("/", 2)[2]


# Item specific
def get_item_id(query):
    return query.split("/", 2)[2]

def get_type(item):
    return "".join(item['class'])

def get_tag(item):
    # The span element (e.g. a release label) is only present on tagged items
    if item.span is None:
        return None
    return "".join(item.span['class'])

# Product Specific
def get_product_title(product):
    return product.find('title').text

def get_product_details(product):
    d = product.find("div", {"id": "details"})
    return d.find("p", {"itemprop": "description"}).text

def get_product_color(product):
    d = product.find("div", {"id": "details"})
    return d.find("p", {"itemprop": "model"}).text

def get_product_release_date(product):
    d = product.find("div", {"id": "details"})
    return d.h1['data-rd']

def parsed_items(items):
    item_dict = {}
    for count, i in enumerate(items, start=1):
        # Progress output
        print("Parsing:", count, "out of", len(items))
        curr_query = get_query(i)
        curr_product = get_page(curr_query)
        curr_title = get_product_title(curr_product)
        curr_desc = get_product_details(curr_product)
        curr_color = get_product_color(curr_product)
        curr_rd = get_product_release_date(curr_product)
        curr_type = get_type(i)
        curr_id = get_item_id(curr_query)
        curr_tag = get_tag(i)
        item_dict[curr_id] = [curr_title, curr_desc, curr_type, curr_color, curr_tag, curr_rd]
    return item_dict

# TODO: write a real .xlsx (e.g. via openpyxl); for now the output is plain
# CSV text, so a .csv extension is used.
def write_items(item_dict):
    filename = "Supreme.csv"
    print(f"Writing to {filename}...")
    with open(filename, "w") as f:
        f.write("ID,title,desc,type,color,tag,release_date\n")
        for key, values in item_dict.items():
            fields = [key]
            for val in values:
                if val is None:
                    fields.append("")
                else:
                    # Replace commas/newlines so a field cannot break the CSV layout
                    fields.append(str(val).replace(",", "|").replace("\n", "|"))
            f.write(",".join(fields) + "\n")
    print("Writing complete.")

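# Alternative sketch (assumption): the standard csv module handles quoting and
# escaping automatically, so the comma/newline substitution in write_items()
# becomes unnecessary. Column order and item_dict shape mirror write_items().
import csv

def write_items_csv(item_dict):
    with open("Supreme.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["ID", "title", "desc", "type", "color", "tag", "release_date"])
        for key, values in item_dict.items():
            writer.writerow([key] + ["" if v is None else v for v in values])
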
"""
def main():
parsed_page = get_page()
items = get_assortment(parsed_page)
if not debug:
item_dict = parsed_items(items)
write_items(item_dict)
else:
q = get_query(items[0])
p = get_page(q)
d = get_product_details(p)
c = get_product_color(p)
rd = get_product_release_date(p)
#print(d)
main()
"""