scrape_urls.py
import time
import multiprocessing
import urllib.request
from bs4 import BeautifulSoup
from multiprocessing import cpu_count, Pool
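# purpose (inferred from the code below): scrape the raw HTML of the URLs listed in
# ./data/links/urls.txt in parallel and save each page to ./data/docs/raw/<idx>.txt,
# where <idx> is the index assigned to the URL further down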
def scrape_one_url(my_list):
    # my_list is [index, category (unused), url]; the index is used as the output file name
    # my_parser is a module-level global defined below
    my_idx, _, my_url = my_list
    try:
        resp = urllib.request.urlopen(my_url)
        my_soup = str(BeautifulSoup(resp, my_parser, from_encoding=resp.info().get_param('charset')))
        with open('./data/docs/raw/' + str(my_idx) + '.txt', 'w', encoding='utf8') as file:
            file.write(my_soup)
    except Exception:
        # skip URLs that cannot be fetched or parsed
        pass
# = = = =
n_cores = cpu_count()
my_parser = 'lxml'
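# note: the 'lxml' parser requires the third-party lxml package in addition to beautifulsoup4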
# = = get unique urls = =
with open('./data/links/urls.txt', 'r', encoding='utf8') as file:
    cat_urls = file.read().splitlines()
# '=====' was added manually at the end of urls.txt to separate new URLs from that of previous runs
idx_new = [idx for idx,elt in enumerate(cat_urls) if '=====' in elt]
# we don't want to be parsing the URLs from previous runs
if len(idx_new) > 0:
    idx_new = idx_new[0] + 1 # +1 is to skip the '=====' line
    cat_urls = cat_urls[idx_new:]
else:
    idx_new = 0
print('starting from idx:',idx_new)
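# de-duplicate on the second comma-separated field of each line (the URL, judging from how
# scrape_one_url unpacks its argument), keeping the original order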
already_seen = set()
unique_cat_urls = []
counter = idx_new
for cat_url in cat_urls:
    cat_url_split = cat_url.split(',')
    if cat_url_split[1] not in already_seen:
        unique_cat_urls.append([counter] + cat_url_split)
        already_seen.add(cat_url_split[1])
    counter += 1 # we need that to keep ordering when we pool map
print(len(unique_cat_urls),'unique urls')
# we are not writing the unique indexes as they are equal to the line numbers
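# urls_final.txt is opened in append mode so the unique URLs written by previous runs are preserved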
with open('./data/links/urls_final.txt', 'a', encoding='utf8') as file:
    for elt in unique_cat_urls:
        file.write(','.join(elt[1:]) + '\n')
# = = get raw HTML of the urls = =
start_time = time.time()
print(n_cores,'core(s) will be used')
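# note: on platforms where multiprocessing uses the 'spawn' start method (e.g. Windows),
# the Pool calls below would need to live under an `if __name__ == '__main__':` guard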
pool = Pool(processes=n_cores)
pool.map(scrape_one_url, unique_cat_urls)
pool.close()
pool.join()
print('= = = done in',round(time.time() - start_time,2),'sec(s)= = =')