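"""
search_engine: a toy crawler and search index in the style of
Udacity CS101. crawl_web(seed) follows links between a small set
of hard-coded test pages and builds an index mapping each keyword
to a list of [url, click_count] pairs; lookup(index, keyword)
queries the index, and record_user_click(index, keyword, url)
simulates click tracking by bumping a url's count.
"""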
def record_user_click(index, keyword, url):
    """
    Simulates a user clicking on url among the results for keyword
    by incrementing that url's click count in the index.
    """
    urls = lookup(index, keyword)
    if urls:
        for entry in urls:
            if entry[0] == url:
                entry[1] = entry[1] + 1
def add_to_index(index, keyword, url):
    """
    Modifies index by adding url to a keyword found in the index.
    If the keyword is not yet in the index, adds [keyword, [[url, 0]]]
    to the index, where 0 is the click count used by record_user_click.
    """
    for entry in index:
        if entry[0] == keyword:
            for each in entry[1]:
                if url == each[0]:
                    return  # url already recorded for this keyword
            entry[1].append([url, 0])
            return
    # keyword not found, add a new entry to the index
    index.append([keyword, [[url, 0]]])
def get_page(url):
    """
    Returns the source of one of four hard-coded test pages,
    or an empty string for any other url.
    """
    if url == "http://www.udacity.com/cs101x/index.html":
        return '''<html> <body> This is a test page for learning to crawl!
<p> It is a good idea to
<a href="http://www.udacity.com/cs101x/crawling.html">
learn to crawl</a> before you try to
<a href="http://www.udacity.com/cs101x/walking.html">walk</a> or
<a href="http://www.udacity.com/cs101x/flying.html">fly</a>.</p></body></html>'''
    elif url == "http://www.udacity.com/cs101x/crawling.html":
        return '''<html> <body> I have not learned to crawl yet, but I am
quite good at <a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.
</body> </html>'''
    elif url == "http://www.udacity.com/cs101x/walking.html":
        return '''<html> <body> I can't get enough
<a href="http://www.udacity.com/cs101x/index.html">crawling</a>!</body></html>'''
    elif url == "http://www.udacity.com/cs101x/flying.html":
        return '<html><body>The magic words are Squeamish Ossifrage!</body></html>'
    return ""
def union(a, b):
    """
    Appends to a every element of b that is not already in a.
    """
    for e in b:
        if e not in a:
            a.append(e)
def get_next_target(page):
    """
    Finds the first link in the page source and returns the url
    together with the position just past its closing quote.
    Returns (None, 0) if no link is found.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
def get_all_links(page):
    """
    Returns all html links on a source page.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
def crawl_web(seed):
    """
    Main crawler function.
    Starts from the seed page and keeps following links until every
    reachable page has been fetched and added to the index.
    """
    tocrawl = [seed]
    crawled = []
    index = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            union(tocrawl, get_all_links(content))
            crawled.append(page)
    return index
def lookup(index, keyword):
    """
    Returns the urls associated with the input keyword.
    """
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return []
def add_page_to_index(index, url, content):
    """
    Modifies the index by taking the contents of a page (content)
    and adding each word to the index along with the url.
    """
    for word in content.split():
        add_to_index(index, word, url)
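# A minimal usage sketch. The seed url is one of the hard-coded test
# pages above; "good" is simply a word that appears on two of them.
# Both are illustrative choices, not part of the original exercise.
if __name__ == "__main__":
    index = crawl_web("http://www.udacity.com/cs101x/index.html")
    results = lookup(index, "good")
    print(results)
    # e.g. [['http://www.udacity.com/cs101x/index.html', 0],
    #       ['http://www.udacity.com/cs101x/crawling.html', 0]]
    if results:
        # Simulate a user clicking the first result; its count becomes 1.
        record_user_click(index, "good", results[0][0])
    print(lookup(index, "good"))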