import ast
import asyncio
import random
import time
import warnings

import aiohttp

from gne.utils import pre_parse, remove_noise_node, config, html2element, normalize_text
from gne.extractor import ContentExtractor, TitleExtractor, TimeExtractor, AuthorExtractor, ListExtractor

warnings.filterwarnings("ignore")
class GeneralNewsExtractor:
    async def fetch(self, session, url):
        # Download a single page and return its HTML text.
        async with session.get(url) as resp:
            resp.raise_for_status()
            return await resp.text()
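
    # A hedged variant with an explicit timeout; fetch_with_timeout and the
    # 10-second value are illustrative assumptions, not part of the exercise.
    #
    #   async def fetch_with_timeout(self, session, url, seconds=10):
    #       timeout = aiohttp.ClientTimeout(total=seconds)
    #       async with session.get(url, timeout=timeout) as resp:
    #           resp.raise_for_status()
    #           return await resp.text()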
async def url_to_sourcecode(self, queue, urls):
async with aiohttp.ClientSession() as client:
s = time.perf_counter()
for i in urls:
html = await GeneralNewsExtractor().fetch(client, i)
await queue.put(html)
await asyncio.sleep(random.random())
# await client.close()
await queue.put(None)
elapsed = time.perf_counter() - s
print(f"{__file__} executed in {elapsed:0.2f} seconds.")
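
    # A hedged alternative sketch: fetch all URLs concurrently instead of one
    # by one. url_to_sourcecode_concurrent is illustrative and assumes the
    # fetch() coroutine defined above.
    #
    #   async def url_to_sourcecode_concurrent(self, queue, urls):
    #       async with aiohttp.ClientSession() as client:
    #           pages = await asyncio.gather(*(self.fetch(client, u) for u in urls))
    #       for html in pages:
    #           await queue.put(html)
    #       await queue.put(None)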

    async def extract(self,
                      queue,
                      title_xpath='',
                      author_xpath='',
                      publish_time_xpath='',
                      host='',
                      body_xpath='',
                      noise_node_list=None,
                      with_body_html=False):
        # Pre-processing the HTML may break its original structure, which would
        # invalidate XPath expressions written against the raw HTML. So when
        # title_xpath/author_xpath/publish_time_xpath are given, those fields
        # must be extracted before pre-processing.
        while True:
            # Consumer: wait for the next page from the producer.
            item = await queue.get()
            if item is None:
                # The producer emits None to indicate that it is done.
                break
            html = item
            normal_html = normalize_text(html)
            element = html2element(normal_html)
            # Extract title/time/author from the raw tree first, so that
            # user-supplied XPaths still match the original structure.
            title = TitleExtractor().extract(element, title_xpath=title_xpath)
            publish_time = TimeExtractor().extractor(element, publish_time_xpath=publish_time_xpath)
            author = AuthorExtractor().extractor(element, author_xpath=author_xpath)
            element = pre_parse(element)
            remove_noise_node(element, noise_node_list)
            content = ContentExtractor().extract(element,
                                                 host=host,
                                                 with_body_html=with_body_html,
                                                 body_xpath=body_xpath)
            # ContentExtractor returns candidate nodes ranked by score;
            # take the best-scoring one.
            result = {'title': title,
                      'author': author,
                      'publish_time': publish_time,
                      'content': content[0][1]['text'],
                      'images': content[0][1]['images']
                      }
            if with_body_html or config.get('with_body_html', False):
                result['body_html'] = content[0][1]['body_html']
            print(result)


class ListPageExtractor:
    def extract(self, html, feature):
        # Extract the article list from a list page whose HTML has already been fetched.
        normal_html = normalize_text(html)
        element = html2element(normal_html)
        extractor = ListExtractor()
        return extractor.extract(element, feature)
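

# A minimal usage sketch for ListPageExtractor; the URL and the feature value
# below are hypothetical placeholders, not part of the original exercise.
#
#   async def list_demo():
#       async with aiohttp.ClientSession() as client:
#           html = await GeneralNewsExtractor().fetch(client, 'https://example.com/news')
#       return ListPageExtractor().extract(html, feature='https://example.com/news/1.html')
#
#   print(asyncio.run(list_demo()))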


async def main(urls):
    queue = asyncio.Queue()
    extractor = GeneralNewsExtractor()
    # Run the fetching producer and the parsing consumer concurrently;
    # they hand pages off through the queue.
    await asyncio.gather(extractor.url_to_sourcecode(queue, urls),
                         extractor.extract(queue))


if __name__ == '__main__':
    # ast.literal_eval parses a Python list literal safely, unlike eval().
    urls = ast.literal_eval(input("Enter the list of urls ['https://example.com', 'https://example.in']: "))
    asyncio.run(main(urls))
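
# Example session (all values illustrative):
#
#   Enter the list of urls ['https://example.com', 'https://example.in']: ['https://example.com/news/1.html']
#   {'title': '...', 'author': '...', 'publish_time': '...', 'content': '...', 'images': [...]}
#   /path/to/Exercise 9 executed in 0.83 seconds.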