# domainSpider.py
from bs4 import BeautifulSoup
import sys
import traceback
import queue
from urllib import parse
import requests
import time
import spiderDatabase
"""
This program is meant to be a fairly simple webcrawler. It can be given
any number of domains, and will begin crawling. It will record all links,
raw html, and plain text to a sqlite3 database. It will seperately record
links to PDFs, images, and media files, but not follow them. It will follow any link that leads to a page within any of the known domains, but not to exterior sites.
On the initial visit to a domain, if it detects that there has been a redirect, it will record it and add the new domain to the domains list.
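
A minimal usage sketch (mirroring the __main__ block below; the host,
user, and database names here are placeholders):

    spider = DomainSpider(dbHost="localhost", dbUser="crawler", dbName="crawl")
    spider.addDomain("http://example.com")
    spider.crawl()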
"""
class DomainSpider():
    def __init__(self, dbHost, dbUser, dbName, domain="", crawlerDelay=0.0):
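        """
        dbHost, dbUser, and dbName are passed straight through to
        spiderDatabase.SpiderDatabase. crawlerDelay is the pause, in
        seconds, between page fetches; domain, if given, is stored as
        the spider's current domain.
        """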
self.__currentDomain = domain
self.__database = spiderDatabase.SpiderDatabase(dbHost, dbUser, dbName)
self.__urlQueue = queue.Queue()
self._visitedURLs = []
self._hopCount = 0
self.crawlerDelay = crawlerDelay
self.headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'
}
def addDomain(self, domain):
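        """Register a domain with the database so crawl() will visit it."""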
print("Adding domain:",domain)
self.__database.addDomain(domain)
def crawl(self):
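        """
        Work through every incomplete domain: pull queued links from the
        database, fetch and parse each page, store its raw HTML and plain
        text, and enqueue any new in-domain links, until the domain's
        queue is empty.
        """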
url = "first URL"
allDomains = self.__database.getAllDomains()
print(allDomains)
for domain in self.__database.getIncompleteDomains():
print("\n[+] Beginning domain:",domain)
self._hopCount = 0
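            # The database acts as the URL frontier; keep pulling links
            # until this domain's queue is drained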
while self.__database.getNumberOfLinks(domain):
domain, source, url = self.__database.getLink(domain)
print(
" [*] Queue size: ",
self.__database.getNumberOfLinks(domain)
)
print(" [*] loading:",url)
try:
urlData = requests.get(url, headers=self.headers)
# If we tried to pull a domain, and got a redirect:
if ( url==domain ) and url != urlData.url:
self.__database.addRedirect(domain, urlData.url)
url = urlData.url
domain = urlData.url
allDomains.append( domain )
print(" [*] Added redirect to:",domain)
urlData = requests.utils.get_unicode_from_response(urlData)
                except Exception as e:
                    traceback.print_exc(file=sys.stdout)
                    self.__database.removeLinkFromQueue(url)
                    self.__database.addError(domain, str(e), url)
                    continue
print(" [*] URL Data read")
bs = None
try:
bs = BeautifulSoup(urlData, "html5lib")
print(" [*] BeautifulSoup parsed")
                    # Drop script and style elements so their contents
                    # don't leak into the extracted plain text
for script in bs(["script", "style"]):
script.extract()
                except Exception:
                    bs = BeautifulSoup("", "html5lib")
                    self.__database.addError(domain, "Could not parse BS data", url)
                    print(" [x] Error: BeautifulSoup could not parse")
plainText = ""
try:
plainText = '\n'.join(
[ e.strip() for e in bs.getText().split("\n") if len(e.strip()) ]
)
                except UnicodeDecodeError as e:
                    traceback.print_exc(file=sys.stdout)
                    self.__database.addError(domain, str(e), url)
                    print(" [x] Error: could not decode page text")
                except Exception:
                    traceback.print_exc(file=sys.stdout)
                    self.__database.addError(domain, "Unknown error while extracting page text", url)
print(" [*] Saving data ....")
# Always dump whatever data we got.
self.__database.insertPageData(domain, url, urlData, plainText)
print(" [*] Dealing with links")
                # Classify every anchor on the page and record it in the
                # appropriate table
for link in bs.findAll("a", href=True):
# Don't follow intrapage links
if link['href'].startswith("#"):
continue
# Don't follow media links
                    if link['href'].endswith((".mp3", ".avi", ".mov", ".mpeg", ".ppt", ".pptx")):
continue
                    # Resolve relative links against the current page URL
                    link['href'] = parse.urljoin(url, link['href'])
                    # Make sure the link carries an http/https scheme
                    # (startswith("http") also covers "https")
                    if not link['href'].startswith("http"):
                        link['href'] = "http://" + link['href']
# TODO: strip away urls ending with #something
# this part needs work. Sometimes the trailing / is needed
# if link['href'].endswith("/"):
# link['href'] = link['href'][:-1]
# Check if it is a link to an image
                    if any(link['href'].endswith(e) for e in [".jpg", ".jpeg", ".png", ".gif", ".tiff"]):
self.__database.addImageLink( domain, url, link['href'] )
# Check if it is a PDF link
elif link['href'].endswith(".pdf"):
self.__database.addPDFLink( domain, url, link['href'])
# Must be a regular page link
else:
                        # Only queue in-domain links that we haven't already
                        # visited and that don't point back to the current page
                        if (any(e in link['href'] for e in allDomains)
                                and not self.__database.visitedLink(link['href'])
                                and url != link['href']):
self.__database.addLink( domain, url, link['href'] )
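                # Page fully processed: dequeue it and honor the politeness delay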
self.__database.removeLinkFromQueue(url)
self._hopCount += 1
time.sleep( self.crawlerDelay )
# Flag the domain as being complete
self.__database.finishDomain(domain)
print(" [O] Completed domain:",domain)
if __name__ == "__main__":
ds = DomainSpider(
dbHost="enterprise.local",
dbUser="hopperj",
dbName="testing"
)
# with open("websiteOutput.txt", 'r') as f:
with open("fakeWebsites.txt", 'r') as f:
websites = f.readlines()
    for website in websites:
        ds.addDomain(website.strip())
print("\nBegin the crawl!")
ds.crawl()
# print("\n****************************************************")