-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml_helper.py
36 lines (29 loc) · 1.04 KB
/
html_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import urllib.request
import urllib.parse
import re
def get_html_content(url, encoding='gb2312'):
    """Fetch *url* and return the response body decoded as text.

    The URL is echoed to stdout before the request is made.

    Args:
        url: Anything accepted by ``urllib.request.urlopen``.
        encoding: Codec used to decode the response bytes. Defaults to
            ``'gb2312'``; bytes that are invalid in the codec are dropped
            rather than raising.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` on failure.
    """
    print(url)
    # Use the response as a context manager so the underlying connection
    # is closed even if read() or decode() raises.
    with urllib.request.urlopen(url) as response:
        return response.read().decode(encoding, errors='ignore')
def extra_content(html):
    """Pull the title, content and next-page anchor out of an HTML string.

    Three regular expressions are applied to *html*:

    * ``<div class="nr_title" id="nr_title">...</div>`` -> ``'title'``
    * ``<div id="nr1">...</div>`` -> ``'content'`` (passed through
      ``clean_up_tags``)
    * ``<a id="pt_next" href="...">...</a>`` -> ``'next_link'`` (the href)
      and ``'next_page'`` (the anchor text)

    Args:
        html: The page markup as a ``str``.

    Returns:
        A dict with the keys ``'title'``, ``'content'``, ``'next_page'``
        and ``'next_link'``. A value is ``None`` when the corresponding
        element is not present in *html*.
    """
    title = re.search(
        r'<div class="nr_title" id="nr_title">(.*?)</div>', html, re.S)
    content = re.search(r'<div id="nr1">(.*?)</div>', html, re.S)
    next_page = re.search(
        r'<a id="pt_next" href="(.*?)">(.*?)</a>', html, re.S)

    # re.search yields None when a pattern is absent; guard each lookup so
    # a page missing one element does not fail with an AttributeError.
    title_text = None
    if title is not None:
        title_text = title.group(1).strip()

    content_text = None
    if content is not None:
        content_text = clean_up_tags(content.group(1).strip())

    next_text = None
    next_href = None
    if next_page is not None:
        next_text = next_page.group(2)
        next_href = next_page.group(1)

    return {
        'title': title_text,
        'content': content_text,
        'next_page': next_text,
        'next_link': next_href,
    }
def url_parse(url):
    """Return the ``scheme://netloc`` prefix of *url*.

    The URL is split with ``urllib.parse.urlparse``; only the scheme and
    network-location components are kept, and everything from the path
    onward is discarded.
    """
    scheme, netloc, *_ = urllib.parse.urlparse(url)
    return '://'.join((scheme, netloc))
def clean_up_tags(html_content):
    """Strip anchors, ``<br />`` tags and spaces from an HTML fragment.

    Args:
        html_content: The markup to clean, as a ``str``.

    Returns:
        *html_content* with every ``<a ...>...</a>`` element (tag and inner
        text), every literal ``<br />`` and every space character removed.
    """
    # Remove anchors first, while the text is still unmodified: once spaces
    # have been stripped, '<a href=...>' no longer looks like '<a ...>' and
    # could not be located. The non-greedy pattern removes each anchor on
    # its own instead of everything between the first '<a ' and the last
    # '</a>'.
    without_anchors = re.sub(r'<a .*?>.*?</a>', '', html_content, flags=re.S)
    return without_anchors.replace('<br />', '').replace(' ', '')