# Hvitevarer_V1_Live.py
'''
In this version we scrape the category "Hvitevarer" (appliances)
and all of its subcategories, with threading enabled.
Results are saved to the file (CurrentDate).xlsx.
The script runs live and writes both a periodic backup file
and a main file where the accumulated data is stored.
'''
import threading
import time
from datetime import datetime, date  # to measure the speed of the algorithm
from threading import Timer
import logging

logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%d-%m-%Y %H:%M:%S',
    filename='backuplog.log')

import requests  # to make HTTP requests
from bs4 import BeautifulSoup  # to parse the HTML
from openpyxl import Workbook  # to create Excel sheets
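# Dependencies (a hedged note, exact versions unspecified in the original):
# this script needs the third-party packages requests, beautifulsoup4,
# openpyxl and lxml (the parser string passed to BeautifulSoup below), e.g.:
#     pip install requests beautifulsoup4 openpyxl lxml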
# TODO: Add more categories, such as electronics (see the make_category sketch below this dictionary)
# Creating brand arrays:
appliances_brand = ["samsung", "bosch", "miele", "whirlpool", "electrolux", "grundig", "siemens", "zanussi",
"bauknecht", "upo", "point", "gram", "ikea", "lg", "gorenje", "candy", "aeg", "husqvarna",
"kenwood", "matsui", "scandomestic", "senz"]
# used for sorting ads that do not specify which subcategory they belong to
appliance_under_category = ["frysere", "innbyggingsovner", "kjøleskap", "komfyrer", "mikrobølgeovner",
"oppvaskmaskiner", "platetopper", "tørketromler", "vaskemaskiner", "ventilatorer"]
# Dictionary contains information about each product category we can scrape;
# as of now it is only implemented for appliances
appliances_dictionary = [
{
"category": "andre hvitevarer",
"link": "https://www.finn.no/bap/forsale/search.html?product_category=2.93.3907.305&segment=1&sort=PUBLISHED_DESC",
"brand": appliances_brand,
"type": appliance_under_category,
"finnkode": []
},
{
"category": "frysere",
"link": "https://www.finn.no/bap/forsale/search.html?product_category=2.93.3907.72&segment=1&sort=PUBLISHED_DESC",
"brand": appliances_brand,
"type": ["fryseboks", "fryseskap", "fryser"],
"finnkode": []
},
{
"category": "innbyggingsovner",
"link": "https://www.finn.no/bap/forsale/search.html?product_category=2.93.3907.74&segment=1&sort=PUBLISHED_DESC",
"brand": appliances_brand,
"type": ["stekeovn", "dampovn", "med platetopp"], ## Sendere ta med platetopp, sjekk for mer data på finn
"finnkode": []
},
{
"category": "kjøleskap",
"link": "https://www.finn.no/bap/forsale/search.html?product_category=2.93.3907.292&segment=1&sort=PUBLISHED_DESC",
"brand": appliances_brand,
"type": ["kombiskap", "fryser", "side by side"],
"finnkode": []
},
{
"category": "komfyrer",
"link": "https://www.finn.no/bap/forsale/search.html?product_category=2.93.3907.73&segment=1&sort=PUBLISHED_DESC",
"brand": appliances_brand,
"type": ["med keramisk", "gasskomfyr"],
"finnkode": []
},
{
"category": "mikrobølgeovner",
"link": "https://www.finn.no/bap/forsale/search.html?product_category=2.93.3907.77&segment=1&sort=PUBLISHED_DESC",
"brand": appliances_brand,
"type": [None],
"finnkode": []
},
{
"category": "oppvaskmaskiner",
"link": "https://www.finn.no/bap/forsale/search.html?product_category=2.93.3907.78&segment=1&sort=PUBLISHED_DESC",
"brand": appliances_brand,
"type": [None],
"finnkode": []
},
{
"category": "platetopper",
"link": "https://www.finn.no/bap/forsale/search.html?product_category=2.93.3907.75&segment=1&sort=PUBLISHED_DESC",
"brand": appliances_brand,
"type": ["induksjon", "keramisk"],
"finnkode": []
},
{
"category": "tørketromler",
"link": "https://www.finn.no/bap/forsale/search.html?product_category=2.93.3907.80&segment=1&sort=PUBLISHED_DESC",
"brand": appliances_brand,
"type": [None],
"finnkode": []
},
{
"category": "vaskemaskiner",
"link": "https://www.finn.no/bap/forsale/search.html?product_category=2.93.3907.79&segment=1&sort=PUBLISHED_DESC",
"brand": appliances_brand,
"type": ["tørketrommel"],
"finnkode": []
},
{
"category": "ventilatorer",
"link": "https://www.finn.no/bap/forsale/search.html?product_category=2.93.3907.76&segment=1&sort=PUBLISHED_DESC",
"brand": appliances_brand,
"type": [None],
"finnkode": []
}
]
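# Hedged sketch: the TODO above asks for more categories (e.g. electronics).
# One way to avoid repeating the dictionary literal is a small factory helper.
# The product_category code in the URL must be supplied by the caller; this
# helper is illustrative only and is not used by the live code below.
def make_category(category, product_category_code, type_array, brand_array=appliances_brand):
    base = "https://www.finn.no/bap/forsale/search.html"
    return {
        "category": category,
        "link": f"{base}?product_category={product_category_code}&segment=1&sort=PUBLISHED_DESC",
        "brand": brand_array,
        "type": type_array,
        "finnkode": []
    }
# usage sketch (hypothetical category and code, not from the original):
#     appliances_dictionary.append(make_category("kaffemaskiner", "2.93.XXXX", [None]))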
all_finn_code_array = []
count_ad_has_no_price = 0
count_no_to_sale = 0
count_to_sale = 0
def start():
    # Thread: schedules the recurring backup file
    backup_thread = threading.Thread(target=backup_file)
    backup_thread.start()
    # Thread: schedules the recurring data file
    save_file_thread = threading.Thread(target=time_to_save_file)
    save_file_thread.start()
    round_counter = 1
    while True:
        for dictionary_element in appliances_dictionary:
            scrape(dictionary_element, all_finn_code_array)
            time.sleep(3)
        # Round counter
        logging.info(f"Round {round_counter} is finished")
        # Counter: products for sale
        logging.info(f"[COUNTER]: Products for sale: {count_to_sale}")
        # Counter: products for sale with no price
        logging.info(f"[COUNTER]: Products for sale with no price: {count_ad_has_no_price}")
        # Counter: products not for sale
        logging.info(f"[COUNTER]: Products not for sale (gis bort/ønskes kjøpt): {count_no_to_sale}")
        round_counter += 1
def scrape_brand_from_add_description(div_element, brand_array):
    # guard against a missing description element
    if div_element is None:
        return "EMPTY"
    description_text_array = div_element.text.split(" ")
    for word in description_text_array:
        if word.lower() in brand_array:
            return word.lower()
    return "Annet merke"


def scrape_type_from_add_description(div_element, type_array):
    # guard against a missing description element
    if div_element is None:
        return "EMPTY"
    description_text_array = div_element.text.split(" ")
    for word in description_text_array:
        if word.lower() in type_array:
            return word.lower()
    return None
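# Hedged sketch: the two functions above differ only in their keyword list and
# fallback value. A generic version could look like the following; the
# "fallback" parameter is illustrative and this helper is not used by the
# live code below.
def scrape_keyword_from_description(div_element, keyword_array, fallback=None):
    # same guard and token matching as the specialised functions above
    if div_element is None:
        return "EMPTY"
    for word in div_element.text.split(" "):
        if word.lower() in keyword_array:
            return word.lower()
    return fallback
# e.g. scrape_brand_from_add_description(div, brands) behaves like
#      scrape_keyword_from_description(div, brands, fallback="Annet merke")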
# This part of the code is hard-coded for the category "hvitevarer"
# TODO: make this part of the code dynamic!
wb = Workbook()
wb.create_sheet("Hvitevarer")
ws = wb["Hvitevarer"]
ws.append(["Varenavn", "Kategori", "Under-kategori", "Pris", "Merke", "Postnummer", "Lokasjon", "Finn Kode"])
# Time function: schedules the daily save. Note that Timer fires only once,
# so save_file_everyday reschedules itself each time it runs; the save
# therefore happens every 24 hours after start, not at 00:00 sharp
def time_to_save_file():
    # 24 hours : (seconds: 86_400, function)
    twentyfour_hours = Timer(86_400, save_file_everyday)
    twentyfour_hours.start()


# Time function: schedules the backup save every 2nd hour;
# save_every_two_hours reschedules itself each time it runs
def backup_file():
    # 2 hours : (seconds: 7_200, function)
    two_hours = Timer(7_200, save_every_two_hours)
    two_hours.start()
# A function that saves an Excel file with the scraped data once a day
def save_file_everyday():
    today = date.today()
    file_name = str(today) + ".xlsx"
    # TODO: We have to change to an absolute path before running the algorithm
    wb.save("./[LIVE] Scrapped Data/" + file_name)
    logging.info(f"Daily file saved: {file_name}")
    time_to_save_file()
# A function that saves a backup Excel file with the scraped data every 2nd hour
def save_every_two_hours():
    today = date.today()
    file_name = "backup_" + str(today) + ".xlsx"
    # TODO: We have to change to an absolute path before running the algorithm
    wb.save("./[LIVE] Backup data/" + file_name)
    logging.info(f"Backup file saved: {file_name}")
    backup_file()
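# Note (hedged): openpyxl workbooks are not documented as thread-safe, and the
# timer callbacks above call wb.save() while the main loop may still be
# appending rows. A lock like the one below could guard both operations; it is
# only a sketch and is not wired into the original flow.
wb_lock = threading.Lock()
# usage sketch:
#     with wb_lock:
#         wb.save(path)   # and likewise around ws.append(...)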
# this function scrapes data from each subcategory
def scrape(under_category_object, all_finn_code_array):
    under_category_title = under_category_object["category"]
    category_link = under_category_object["link"]
    brand_array = under_category_object["brand"]
    type_array = under_category_object["type"]
    global count_no_to_sale, ad_finn_code_span, ad_html_code, page_html_code, ad_title, ad_payment_type, ad_price, ad_location
    global count_to_sale
    global count_ad_has_no_price
    counter_old_ad = 1
    print(f"[LIVE]: Now scraping {under_category_title}")
    # selecting the worksheet in the Excel workbook
    ws = wb["Hvitevarer"]
    page_link = category_link + "&page=1"  # creating the page link
    try:
        page_html_code = requests.get(page_link).text  # fetching the HTML for the result page
    except IOError:
        logging.critical(f"Page 1 of {under_category_title} does not exist")
        return  # without a result page there is nothing to scrape
    soup = BeautifulSoup(page_html_code, 'lxml')  # parsing the HTML
    all_ads_on_site = soup.find_all('article', class_="ads__unit")  # finding all ads in the category
    # entering each ad...
    for ad in all_ads_on_site:
        ad_link_code = ad.find('a', href=True)  # fetching the ad link element
        ad_link = ad_link_code['href']  # fetching the ad link
        # checking whether the ad is sponsored; if so, its link needs the domain prefix
        sponsored_ad = ad.find('span', class_="status status--sponsored u-mb8")
        if sponsored_ad is not None:
            ad_link = "https://www.finn.no" + ad_link
        try:
            ad_html_code = requests.get(f'{ad_link}').text  # fetching the HTML for the ad
        except IOError:
            logging.critical(f"Ad link does not exist {ad_link}")
            continue  # skip ads we cannot fetch
        soup = BeautifulSoup(ad_html_code, 'lxml')  # parsing the HTML
        # extracting the finn code ("finnkode") for the ad:
        ad_finn_code_span = None
        ad_finn_div_table = soup.find('div', class_="panel u-text-left")
        if ad_finn_div_table is not None:
            ad_finn_code_span = ad_finn_div_table.find('span', class_="u-select-all")
        else:
            logging.critical(f"Div table for ad: {ad_link} does not exist")
        ad_finn_code = ""
        if ad_finn_code_span is None:
            logging.info(f"This ad does not have a finn code {ad_link}")
        else:
            ad_finn_code = ad_finn_code_span.text
        # TODO: Go to the next subcategory if there are no new ads 3x
        # checking for duplicates:
        if ad_finn_code in all_finn_code_array:
            logging.info(f"[SKIP]: no new ad in category {under_category_title}")
            if counter_old_ad == 5:
                print("We have seen 5 old ads in a row, moving on!")
                break
            counter_old_ad += 1
            continue
        else:
            all_finn_code_array.insert(0, ad_finn_code)
            logging.info(f"[NEW] : New ad found... {ad_finn_code}, {ad_link}")
            # cap the duplicate-detection list at 600 entries
            while len(all_finn_code_array) > 600:
                all_finn_code_array.pop()
# each ad inn "finn.no" has a section with class_name "panel u-mb16"
# section has information about ad-title, ad-payment-type and ad-price
section = soup.find('section', class_="panel u-mb16")
# handling None-pointer exception
if section is None:
logging.warning(f"This ad does not have a section element : {ad_link}")
else:
try:
ad_title = (section.find('h1', class_="u-t2 u-mt16")).text
ad_payment_type = (section.find('div', class_="u-t4")).text
ad_price = section.find('div', class_="u-t1")
except IOError:
logging.critical("Error trying to access text element of section_element")
        # finding the postal code for the ad
        ad_location_div = soup.find('div', class_="panel u-mt32")
        if ad_location_div is None:
            logging.warning(f"This ad does not have a location element : {ad_link}")
        else:
            ad_location = ad_location_div.find('h3')
            # finn.no uses two address formats:
            # 1. "0231 Oslo"
            # 2. "Gule gata 4, 3487 Kongsberg"
            # We handle both and extract the postal code
            comma = ","
            if ad_location is not None and comma in ad_location.text:
                postnr_og_postadresse = ad_location.text.split(",")[-1]
                ad_postnr = postnr_og_postadresse.strip().split(" ")[0]
            elif ad_location is not None:
                ad_postnr = ad_location.text.strip().split(" ")[0]
# finding additional data about the ad
table_additional_info_html_code = soup.find('table', class_="u-width-auto u-mt16")
ad_description = soup.find('div', class_="preserve-linebreaks")
# finding product brand and type (under-under category)
product_brand = ""
product_type = ""
found_brand = False
found_type = False
        # Method 1: find the brand and type for the product in the ad title:
ad_title_split = ad_title.split(" ")
for word in ad_title_split:
if word.lower() in brand_array:
product_brand = word.lower()
found_brand = True
if word.lower() in type_array:
product_type = word.lower()
found_type = True
        # Method 2: find the brand and type for the product in the ad's data table:
        if found_brand is False or found_type is False:
            if table_additional_info_html_code is not None:
                table_td = table_additional_info_html_code.find_all('td', class_="u-pl16")
                for td in table_td:
                    if td.text.lower() in brand_array:
                        product_brand = td.text.lower()
                        found_brand = True
                    if td.text.lower() in type_array:
                        product_type = td.text.lower()
                        found_type = True
                # Method 3: find the brand and type for the product in the description:
                if found_brand is False:
                    if ad_description is not None:
                        product_brand = scrape_brand_from_add_description(ad_description, brand_array)
                    else:
                        logging.warning(f"This ad does not have a description element: {ad_link}")
                if found_type is False:
                    if ad_description is not None:
                        product_type = scrape_type_from_add_description(ad_description, type_array)
                    else:
                        logging.warning(f"This ad does not have a description element: {ad_link}")
            # if there is no data table, we go straight to scraping the description
            elif ad_description is not None:
                product_type = scrape_type_from_add_description(ad_description, type_array)
                product_brand = scrape_brand_from_add_description(ad_description, brand_array)
            else:
                logging.critical(f"This ad does not have a data table or description element: {ad_link}")
# Scraping only "Til Salgs ads" from finn.no
if ad_payment_type.lower() == "til salgs":
# handling None-pointer exception
if ad_price is None:
count_no_price += 1
pass
# Otherwise, splitting the price "kr" and adding it to the sheet
else:
count_to_sale += 1
price = ad_price.text.replace(" ", "").split("kr")[0]
ws.append([ad_title, under_category_title, product_type, price, product_brand, ad_postnr,"", ad_finn_code])
else:
count_no_to_sale += 1
# Starting the algorithm (scraping)
if __name__ == "__main__":
    start()