-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml_tools.py
105 lines (74 loc) · 2.63 KB
/
html_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import re
import urllib
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup
def get_html_zhidao(url):
    """Fetch a Baidu Zhidao (Q&A) page and return its parsed soup.

    Args:
        url: Full URL of the Zhidao page to download.

    Returns:
        BeautifulSoup tree of the page with <script>, <style> and <img>
        tags removed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686)Gecko/20071127 Firefox/2.0.0.11'}
    # Timeout so a stalled server cannot hang the caller indefinitely.
    response = requests.get(url=url, headers=headers, timeout=10)
    soup_zhidao = BeautifulSoup(response.content, "lxml")
    # Strip tags that carry no answer text.
    for tag in soup_zhidao(['script', 'style', 'img']):
        tag.extract()
    return soup_zhidao
def get_html_baike(url):
    """Fetch a Baidu Baike (encyclopedia) page and return its parsed soup.

    Args:
        url: Full URL of the Baike page to download.

    Returns:
        BeautifulSoup tree of the page with <script>, <style>, <img>,
        <sup> (citation markers) and <b> tags removed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686)Gecko/20071127 Firefox/2.0.0.11'}
    # Timeout so a stalled server cannot hang the caller indefinitely.
    response = requests.get(url=url, headers=headers, timeout=10)
    soup_baike = BeautifulSoup(response.content, "lxml")
    # Strip tags that carry no article text.
    for tag in soup_baike(['script', 'style', 'img', 'sup', 'b']):
        tag.extract()
    return soup_baike
def get_html_bingwd(url):
    """Fetch a Bing Knowledge (网典) page and return its parsed soup.

    Args:
        url: Full URL of the Bing Knowledge page to download.

    Returns:
        BeautifulSoup tree of the page with <script>, <style>, <img>,
        <sup> and <b> tags removed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686)Gecko/20071127 Firefox/2.0.0.11'}
    # Timeout so a stalled server cannot hang the caller indefinitely.
    response = requests.get(url=url, headers=headers, timeout=10)
    soup_bingwd = BeautifulSoup(response.content, "lxml")
    # Strip tags that carry no article text.
    for tag in soup_bingwd(['script', 'style', 'img', 'sup', 'b']):
        tag.extract()
    return soup_bingwd
def get_html_baidu(url):
    """Fetch a Baidu search-results page and return its parsed soup.

    Args:
        url: Full search URL (query already encoded into it).

    Returns:
        BeautifulSoup tree of the results page with <script>, <style>
        and <img> tags removed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686)Gecko/20071127 Firefox/2.0.0.11'}
    # Timeout so a stalled server cannot hang the caller indefinitely.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Baidu results are UTF-8; decode explicitly before parsing.
    soup_baidu = BeautifulSoup(response.content.decode('utf-8'), "lxml")
    # Strip tags that carry no result text.
    for tag in soup_baidu(['script', 'style', 'img']):
        tag.extract()
    return soup_baidu
def get_html_bing(url):
    """Fetch a Bing search-results page and return its parsed soup.

    Args:
        url: Full search URL, e.g. 'http://global.bing.com/search?q=' + word.

    Returns:
        BeautifulSoup tree of the results page. Unlike the other fetchers,
        no tags are stripped here — NOTE(review): downstream code appears to
        need the full markup; confirm before adding stripping.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}
    # Timeout so a stalled server cannot hang the caller indefinitely.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Bing results are UTF-8; decode explicitly before parsing.
    soup_bing = BeautifulSoup(response.content.decode('utf-8'), "lxml")
    return soup_bing
def ptranswer(ans, ifhtml):
    """Print or flatten a sequence of answer nodes into plain text.

    Args:
        ans: Iterable of answer nodes (e.g. BeautifulSoup elements) — each
            non-HTML node is read via its ``.string`` attribute.
        ifhtml: If true, print each node as-is and return ''; otherwise
            strip any residual HTML tags and accumulate the text.

    Returns:
        The concatenated plain-text answer as a str (empty when ifhtml
        is true or no usable text was found).
    """
    result = ''
    # Compile once instead of on every iteration.
    tag_pattern = re.compile(r'<[^>]+>')
    for answer in ans:
        if ifhtml:
            print(answer)
        else:
            # Skip bare newline nodes between paragraphs.
            if answer == u'\n':
                continue
            text = answer.string
            # .string is None for nodes with multiple children; skip them
            # instead of crashing in re.sub.
            if text is None:
                continue
            # BUGFIX: the original appended .encode('utf8') bytes to a str,
            # which raises TypeError on Python 3 — keep everything as str.
            result += tag_pattern.sub("", text)
    return result
def ltptools(args):
    """POST the given parameters to the LTP-Cloud analysis API.

    Args:
        args: Mapping of query parameters for the analysis endpoint.

    Returns:
        The raw response body as bytes, with surrounding whitespace stripped.
    """
    url_get_base = "http://api.ltp-cloud.com/analysis/"
    # BUGFIX: urllib.urlopen/urlencode are Python 2 APIs; on Python 3 they
    # live in urllib.request / urllib.parse, and POST data must be bytes.
    data = urllib.parse.urlencode(args).encode('utf-8')
    # Passing data makes this a POST; the context manager closes the socket.
    with urllib.request.urlopen(url_get_base, data) as result:
        return result.read().strip()