extract.py
"""
Parse out text, links, images, and more from an HTML file.
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
For example:
import extract
e = extract.ParsedWebpage("http://en.wikipedia.org/wiki/Frog")
print e.title
print e.text
print e.links
"""
import json
import re
from urlparse import urljoin

import bs4
import requests

import secrets
class ClarifaiGetter(object):
    """Fetch descriptive tags for an image URL from the Clarifai v1 tag API."""

    def __init__(self):
        self.n_clarifai = 0

    def clarifai_tags(self, url):
        # TODO: Batch this so that we do one Clarifai request only.
        # Requires, like, deferreds.
        # Cap the number of requests so image-heavy pages don't hammer the API.
        if self.n_clarifai > 4:
            return []
        self.n_clarifai += 1
        access_token = secrets.clarifai_access_token
        clarifai_url = "https://api.clarifai.com/v1/tag/?url="
        response = requests.get(clarifai_url + url,
                                headers={
                                    'Authorization': 'Bearer %s' % access_token,
                                })
        # Consult https://developer.clarifai.com/docs/tag
        try:
            retval = json.loads(response.text)['results'][0]['result']['tag']['classes']
        except (ValueError, KeyError, IndexError):
            # Unexpected response shape (or a non-JSON body): log it and move on.
            print response.text
            retval = []
        # Sometimes Clarifai returns [["tag1", "tag2", "tag3"]] instead of
        # just ["tag1", "tag2", "tag3"].
        if len(retval) == 1 and isinstance(retval[0], list):
            return retval[0]
        return retval
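
# A minimal usage sketch for ClarifaiGetter, kept as comments so importing this
# module stays side-effect free. It assumes secrets.py defines
# clarifai_access_token and that the (legacy) Clarifai v1 /tag endpoint is
# reachable; the image URL is purely illustrative:
#
#   getter = ClarifaiGetter()
#   tags = getter.clarifai_tags("http://example.com/frog.jpg")
#   print tags  # e.g. [u"animal", u"amphibian", ...]
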
class ParsedWebpage(object):
    def __init__(self, url):
        # Raw HTML, fetched with a mobile user agent.
        response = requests.get(url, verify=False, headers={
            'User-agent': 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 5_1_1 like Mac OS X; en) AppleWebKit/534.46.0 (KHTML, like Gecko) CriOS/19.0.1084.60 Mobile/9B206 Safari/7534.48.3',
        })
        self.html = response.text
        self.url = response.url
        self.soup = bs4.BeautifulSoup(self.html, "html.parser")
        self.clarifai_getter = ClarifaiGetter()

        # Delete <script>, <style>, and <form> tags, comments, and <!DOCTYPE>.
        # For some reason, doing this twice removes some sticky cases.
        for i in range(2):
            [s.extract() for s in self.soup.find_all('script')]
            [s.extract() for s in self.soup.find_all('style')]
            [s.extract() for s in self.soup.find_all('form')]
            comments = self.soup.find_all(text=lambda text: isinstance(text, bs4.Comment))
            [comment.extract() for comment in comments]
            new_html = re.sub("<!--.*?-->", "", unicode(self.soup))
            new_html = re.sub("<!DOCTYPE[^>]*>", "", new_html)
            self.soup = bs4.BeautifulSoup(new_html, "html.parser")

        # This should be something acceptable to read to the user
        # as the webpage's title.
        self.title = self.soup.title.string
        # Replace images with descriptions of those images.
        def my_replace(match):
            raw_tag = match.group()
            img_soup = bs4.BeautifulSoup(raw_tag, "html.parser")
            src = img_soup.img.get("src")
            alt = img_soup.img.get("alt")
            retval = " An image"
            if alt:
                retval += " of %s" % alt
            elif src:
                # No alt text, so ask Clarifai what the image looks like.
                joined_url = urljoin(self.url, src)
                tags = self.clarifai_getter.clarifai_tags(joined_url)[:4]
                if len(tags) > 1:
                    tags[-1] = "and " + tags[-1]
                if len(tags) > 0:
                    retval += " that looks like "
                    retval += ' '.join(tags)
            return retval + '. '

        # Handle both <img ...>...</img> pairs and bare <img ...> tags.
        new_html = re.sub(r"<img[^>]*>[^<]*</img>", my_replace, unicode(self.soup))
        new_html = re.sub(r"<img[^>]*>", my_replace, new_html)
        self.soup = bs4.BeautifulSoup(new_html, "html.parser")
        # This should be a list of (link name, link href) pairs.
        links = self.soup.find_all('a')
        self.links = [(link.string, urljoin(url, link.get('href', '')))
                      for link in links
                      if link.string]

        texts = self.soup.find_all(text=True)

        # Add in link labels!
        link_index = 0
        new_texts = []
        for text in texts:
            if link_index < len(self.links) and text == self.links[link_index][0]:
                new_texts.append("Link %s" % str(link_index))
                link_index += 1
            if text.strip() != '':
                new_texts.append(text)

        # This should be the human-readable text of the page.
        self.text = ' '.join(new_texts)
        # Split the readable text into chunks of at most ~max_chunk_length
        # characters each.
        max_chunk_length = 1000  # characters
        self.chunks = [""]
        current_chunk = 0
        for text in new_texts:
            if len(self.chunks[current_chunk]) + len(text) > max_chunk_length:
                self.chunks.append("")
                current_chunk += 1
            self.chunks[current_chunk] += text + ' '

# TODO: forms and related trappings
# TODO: iframes?
# TODO: music and videos
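
# A quick manual smoke test, mirroring the usage example in the module
# docstring. This is a sketch only: it assumes network access, a valid
# secrets.py with a Clarifai token, and that the example Wikipedia URL is
# reachable.
if __name__ == "__main__":
    page = ParsedWebpage("http://en.wikipedia.org/wiki/Frog")
    print page.title
    print "%d links, %d chunks" % (len(page.links), len(page.chunks))
    print page.chunks[0]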