sofa_v2.py
'''
[static] script that scrapes the sofa category
and all of its subcategories
'''
# TODO: add more colors and brands, and possibly sub-subcategories ("type"), to the dictionary
import threading                 # to run each subcategory on its own thread
from datetime import datetime    # to keep track of which day the file was scraped (not used in this version)
import requests                  # to make HTTP requests
from bs4 import BeautifulSoup    # to parse the HTML
from openpyxl import Workbook    # to create Excel sheets
# brand array
all_sofa_brands = ["ikea", "møbelringen", "ekornes", "stordal", "bohus", "milano",
                   "tiki"]
ikea_sofa_models = ["angby", "backamo", "ekeskog", "ektorp", "goteborg", "harnosand", "hovas", "karlanda", "karlstad",
                    "kivik", "kramfors", "lillberg", "nikkala", "sandby", "stockholm", "stromstad", "tomelilla",
                    "tylosand", "klobo", "gronlid", "farlov", "soderhamn", "norsborg", "friheten", "vimle", "strandmon",
                    "söderhamn"]
bolia_sofa_models = ["scandinavia", "elton", "lomi", "paste", "north", "cloud", "sepia", "hannah", "madison", "grace",
                     "fuuga", "noora", "cosima", "casia", "cosy", "angel", "jerome", "mr. big", "aya", "orlando",
                     "recover"]
all_sofa_models = ["angby", "backamo", "ekeskog", "ektorp", "goteborg", "harnosand", "hovas", "karlanda", "karlstad",
                   "kivik", "kramfors", "lillberg", "nikkala", "sandby", "stockholm", "stromstad", "tomelilla",
                   "tylosand", "klobo", "gronlid", "farlov", "soderhamn", "norsborg", "friheten", "vimle", "scandinavia",
                   "elton", "lomi", "paste", "north", "cloud", "sepia", "hannah", "madison", "grace", "fuuga",
                   "noora", "cosima", "casia", "cosy", "angel", "jerome", "mr. big", "aya", "orlando",
                   "recover", "orlando outdoor", "strandmon", "söderhamn"]
sofa_brand_and_model = [{"brand": "ikea", "model": ikea_sofa_models},
                        {"brand": "bolia", "model": bolia_sofa_models},
                        {"brand": "ekornes", "model": [None]},
                        {"brand": "stordal", "model": [None]},
                        # placeholder entry; "model" must be present, otherwise the
                        # sofa["model"] lookups below raise a KeyError
                        {"brand": "", "model": [None]}
                        ]
# subcategories
# sofa_under_categories = ["2-seter", "3-seter", "hjørnesofaer", "lenestoler", "puffer", "sofagrupper", "sovesofaer"]
# dictionary
sofa_dictionary = [
    # The links have the filters:
    # privat, til-salgs, kjøp
    # and are sorted by the most recent ad.
    # "type" is used when a subcategory has subcategories of its own,
    # e.g. the subcategory kjøleskap has the types kombiskap, side-by-side and so on
    {
        "category": "2-seter",
        "link": "https://www.finn.no/bap/forsale/search.html?abTestKey=suggestions&for_rent=Kj%C3%B8p&product_category=2.78.7756.204&segment=1&sort=PUBLISHED_DESC&trade_type=1",
        "brand": all_sofa_brands,
        "type": [None]
    },
    {
        "category": "3-seter",
        "link": "https://www.finn.no/bap/forsale/search.html?abTestKey=suggestions&for_rent=Kj%C3%B8p&product_category=2.78.7756.205&segment=1&sort=PUBLISHED_DESC&trade_type=1",
        "brand": all_sofa_brands,
        "type": [None]
    },
    {
        "category": "hjørnesofaer",
        "link": "https://www.finn.no/bap/forsale/search.html?abTestKey=suggestions&for_rent=Kj%C3%B8p&product_category=2.78.7756.207&segment=1&sort=PUBLISHED_DESC&trade_type=1",
        "brand": all_sofa_brands,
        "type": [None]
    },
    {
        "category": "lenestoler",
        "link": "https://www.finn.no/bap/forsale/search.html?abTestKey=suggestions&for_rent=Kj%C3%B8p&product_category=2.78.7756.210&segment=1&sort=PUBLISHED_DESC&trade_type=1",
        "brand": all_sofa_brands,
        "type": ["stressless"]
    },
    {
        "category": "puffer",
        "link": "https://www.finn.no/bap/forsale/search.html?abTestKey=suggestions&for_rent=Kj%C3%B8p&product_category=2.78.7756.209&segment=1&sort=PUBLISHED_DESC&trade_type=1",
        "brand": all_sofa_brands,
        "type": [None]
    },
    {
        "category": "sofagrupper",
        "link": "https://www.finn.no/bap/forsale/search.html?abTestKey=suggestions&for_rent=Kj%C3%B8p&product_category=2.78.7756.208&segment=1&sort=PUBLISHED_DESC&trade_type=1",
        "brand": all_sofa_brands,
        "type": [None]
    },
    {
        "category": "sovesofaer",
        "link": "https://www.finn.no/bap/forsale/search.html?abTestKey=suggestions&for_rent=Kj%C3%B8p&product_category=2.78.7756.206&segment=1&sort=PUBLISHED_DESC&trade_type=1",
        "brand": all_sofa_brands,
        "type": [None]
    },
]
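# Example: the first result page of the "2-seter" subcategory is fetched from
#   sofa_dictionary[0]["link"] + "&page=1"
# (see the page_link construction inside scrape() below)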
# Start threads
def start():
    # Each subcategory is scraped on its own thread
    for dictionary_element in sofa_dictionary:
        # args only accepts a tuple, hence the trailing comma in (dictionary_element,)
        thread = threading.Thread(target=scrape, args=(dictionary_element,))
        thread.start()
    print(f"All categories are running on their own thread, total threads: {threading.active_count()}")
def scrape_from_ad_description(ad_description_div, array):
    # guard against a missing description div (avoids an AttributeError on .text)
    if ad_description_div is None:
        print("[Warning]: ad_description div is empty")
        return
    else:
        description_text = ad_description_div.text
        description_text_array = description_text.split(" ")
        # return the first whitespace-separated word that appears in the given array
        for word in description_text_array:
            if word.lower() in array:
                return word.lower()
        return
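# Example (assuming a description div whose text contains "Pent brukt IKEA Ektorp"):
#   scrape_from_ad_description(div, all_sofa_brands) -> "ikea"
#   scrape_from_ad_description(div, all_sofa_models) -> "ektorp"
# Matching is done on whitespace-separated words, so punctuation stuck to a word
# (e.g. "Ektorp,") will prevent a match.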
# Creating the Excel file
wb = Workbook()
# Creating the worksheet
wb.create_sheet("Sofa")
# Selecting the worksheet
ws = wb["Sofa"]
# Adding headers to the "Sofa" sheet
ws.append(["Varenavn", "Kategori", "Pris", "Merke", "Modell", "Postnummer", "Lokasjon", "Finn kode"])
# Naming the output Excel file:
filename = "sofa.xlsx"
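# Note: openpyxl workbooks are not thread-safe, and every scrape() thread below
# appends to ws and calls wb.save(filename). A minimal sketch of serializing the
# saves with a lock (hypothetical name, not wired into the code below):
#
#   wb_lock = threading.Lock()
#   with wb_lock:
#       wb.save(filename)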
# this function scrapes data from each subcategory;
# each subcategory runs scrape() in its own thread
def scrape(under_category_object):
    under_category_title = under_category_object["category"]
    category_link = under_category_object["link"]
    number_of_ads_scraped = 0   # counts ads scraped from this subcategory
    page_number = 1             # counts pages scraped; used to move to the next result page
    count_ad_has_no_price = 0   # counts ads that are for sale but have no price;
                                # these ads are not added to the Excel sheet
    while True:
        page_link = category_link + "&page=" + str(page_number)         # building the link for each result page
        page_html_code = requests.get(page_link).text                   # fetching the page's HTML
        soup = BeautifulSoup(page_html_code, 'lxml')                    # parsing the HTML
        all_ads_on_page = soup.find_all('article', class_="ads__unit")  # finding all ads on the page
        # ------------------------------------ Save & exit ------------------------------------
        # Ending the scrape of this subcategory when there are no more ads to scrape
        if len(all_ads_on_page) <= 1:
            print(f"[END_OF_ADS]: Total ads collected from category {under_category_title}: {number_of_ads_scraped}")
            print(f"[Info] : Total ads for sale without a price in subcategory {under_category_title}: {count_ad_has_no_price}")
            wb.save(filename)
            return
        # ------------------------------------ Entering ad ------------------------------------
        for ad in all_ads_on_page:
            ad_link_code = ad.find('a', href=True)  # extracting the anchor element
            ad_link = ad_link_code['href']          # extracting the ad link
            # Skipping sponsored ads on the page: the non-sponsored version of the
            # same ad will be scraped anyway, so this avoids duplicate ads
            sponsored_ad = ad.find('span', class_="status status--sponsored u-mb8")
            if sponsored_ad is not None:
                continue
            # ------------------------------------ Entered ad ------------------------------------
            # trying to enter the ad / fetch data from the ad:
            try:
                ad_html_code = requests.get(f'{ad_link}').text  # fetching the HTML code of the ad
            except requests.RequestException as err:
                print("[Critical] : Error trying to fetch the html text of the ad (ad_html_code): ", err)
                continue  # skip this ad; without its HTML there is nothing to scrape
            except Exception:
                print("[Critical] : Unexpected error occurred when trying to fetch the html text of the ad (ad_html_code)")
                continue
            soup = BeautifulSoup(ad_html_code, 'lxml')  # parsing the ad's HTML
            # ------------------------------------ Section element ------------------------------------
            # each ad on finn.no has a section with class name "panel u-mb16";
            # the section holds the ad title, the ad payment type and the ad price
            ad_title = ""
            ad_payment_type = ""
            ad_price = None
            section = soup.find('section', class_="panel u-mb16")
            # handling a missing section element
            if section is not None:
                # if the section element exists, extract as much info as possible
                try:
                    ad_title = (section.find('h1', class_="u-t2 u-mt16")).text
                    ad_payment_type = (section.find('div', class_="u-t4")).text
                    ad_price = section.find('div', class_="u-t1")  # some ads have no price, therefore no .text here
                except AttributeError as err:
                    print("[Critical] : Error trying to access a text element of the section element: ", err)
                except Exception:
                    print("[Critical] : Unexpected error occurred in the section element")
            else:
                print(f"[Info] : This ad does not have a section element : {ad_link}")
            # ------------------------------------ ad_description ------------------------------------
            # finding the brand and model from the ad description
            ad_description_div_element = soup.find('div', class_="preserve-linebreaks")
            product_brand = ""
            product_model = ""
            found_brand = False
            found_model = False
            # 1st method: finding the brand and model from the ad title:
            ad_title_split = ad_title.split(" ")
            for word in ad_title_split:
                for sofa in sofa_brand_and_model:
                    # ===================== 1 =====================
                    # found the sofa brand
                    if sofa["brand"] == word.lower():
                        product_brand = word.lower()
                        found_brand = True
                        # searching for the model of that brand
                        for word_2 in ad_title_split:
                            if word_2.lower() in sofa["model"]:  # lowercased, since the model lists are lowercase
                                product_model = word_2.lower()
                                found_model = True
                                break
                # ===================== 2 =====================
                if found_model is False:
                    if word.lower() in all_sofa_models:
                        product_model = word.lower()
                        found_model = True
            # ===================== 3 =====================
            # using the model to find the brand
            if found_brand is False:
                for sofa in sofa_brand_and_model:
                    if product_model in sofa["model"]:
                        product_brand = sofa["brand"]
                        found_brand = True
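            # Worked example: for the title "Ikea Ektorp 3-seter", step 1 matches
            # "ikea" as the brand and "ektorp" as its model; for "Ektorp 3-seter"
            # (no brand word), step 2 finds the model and step 3 maps it back to "ikea".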
            # 2nd method: finding the brand and/or model from the ad description:
            if found_brand is False or found_model is False:
                if ad_description_div_element is not None:
                    # if the product model or brand was not found in the title,
                    # the description div is used to find it
                    if found_brand is False and found_model is False:
                        # ===================== 3 =====================
                        # we have neither the brand nor the model
                        product_brand = scrape_from_ad_description(ad_description_div_element, all_sofa_brands)
                        if product_brand is not None:
                            found_brand = True
                            for sofa in sofa_brand_and_model:
                                # found the sofa brand, now finding the model
                                if sofa["brand"] == product_brand:
                                    product_model = scrape_from_ad_description(ad_description_div_element, sofa["model"])
                                    if product_model is not None:
                                        found_model = True
                        else:
                            product_model = scrape_from_ad_description(ad_description_div_element, all_sofa_models)
                            if product_model is not None:
                                found_model = True
                                for sofa in sofa_brand_and_model:
                                    # found the sofa model, now finding the brand
                                    if product_model in sofa["model"]:
                                        product_brand = sofa["brand"]
                                        found_brand = True
                    elif found_brand is True and found_model is False:
                        # ===================== 2 =====================
                        product_model = scrape_from_ad_description(ad_description_div_element, all_sofa_models)
                        if product_model is not None:
                            found_model = True
                    else:
                        # found the model in the title but not the brand:
                        # using the model to look the brand up
                        for sofa in sofa_brand_and_model:
                            if product_model in sofa["model"]:
                                product_brand = sofa["brand"]
                                found_brand = True
                else:
                    print(f"[Info] : This ad does not have a description element: {ad_link}")
            # ------------------------------------ Location (postal code) ------------------------------------
            # finding the postal code of the ad
            ad_postnr = ""
            ad_location_div = soup.find('div', class_="panel u-mt32")
            if ad_location_div is None:
                print(f"This ad does not have a location element : {ad_link}")
            else:
                ad_location = ad_location_div.find('h3')
                # finn.no uses two address formats:
                # 1. 0231 Oslo
                # 2. Gule gata 4, 3487 Kongsberg
                # Both are handled here to extract the postal code
                comma = ","
                if ad_location.text is not None and comma in ad_location.text:
                    postnr_og_postadresse = ad_location.text.split(",")[-1]
                    ad_postnr = postnr_og_postadresse.strip().split(" ")[0]
                else:
                    ad_postnr = ad_location.text.strip().split(" ")[0]
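            # Worked example: "Gule gata 4, 3487 Kongsberg"
            #   .split(",")[-1]         -> " 3487 Kongsberg"
            #   .strip().split(" ")[0]  -> "3487"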
            # ------------------------------------ Finn code ------------------------------------
            # extracting the finn code of the ad; it is located in a div with class "panel u-text-left":
            ad_finn_code_span = None
            ad_finn_div_table = soup.find('div', class_="panel u-text-left")
            if ad_finn_div_table is not None:
                ad_finn_code_span = ad_finn_div_table.find('span', class_="u-select-all")
            # ad_finn_code is the "finnkode"
            ad_finn_code = ""
            if ad_finn_code_span is None:
                print(f"[Info] : This ad does not have a finn code {ad_link}")
            else:
                try:
                    ad_finn_code = ad_finn_code_span.text
                except AttributeError as err:
                    print(f"[Critical] : This ad {ad_link} has a finn_code_span but no text element", err)
                except Exception:
                    print(f"[Critical]: This ad {ad_link} has a finn_code_span, but the finn code could not"
                          f" be extracted because of an unexpected error!")
            # ------------------------------------ Appending to sheet ------------------------------------
            # Only "Til salgs" ads from private sellers that have a price are scraped
            if ad_price is None:
                count_ad_has_no_price += 1
            # Otherwise, split the "kr" suffix off the price and add the row to the sheet
            else:
                number_of_ads_scraped += 1
                price = ad_price.text.replace(" ", "").split("kr")[0]
                ws.append(
                    [ad_title, under_category_title, price,
                     product_brand, product_model, ad_postnr, "", ad_finn_code])
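            # Worked example of the price cleanup, assuming a price div whose text is "1 500 kr":
            #   "1 500 kr".replace(" ", "").split("kr")[0] -> "1500"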
        # ------------------------- Page n of subcategory x is done: next page & save sheet -------------------------
        print(f"Page {page_number} of category {under_category_title} is done")
        page_number += 1
        # Saving the file after every scraped page
        wb.save(filename)
# starting the scraper
if __name__ == "__main__":
    start()