-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathurl2scraper.py
executable file
·113 lines (103 loc) · 4.34 KB
/
url2scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python3
import requests
import socket
import os
import nmap
import getopt
import sys
import re
# Silence urllib3's InsecureRequestWarning for the whole process.
requests.packages.urllib3.disable_warnings(
    requests.packages.urllib3.exceptions.InsecureRequestWarning
)

# Default values for the command-line settings; the script entry point
# overwrites them after parsing the options.
terminal = False
output_file = ""
input_file = ""
ports = []

# Handles created at import time. NOTE(review): nothing in the visible part
# of this file reads `s`, `nm` or `ports` -- confirm before removing them.
s = socket.socket()
nm = nmap.PortScanner()
def banner():
    """Clear the terminal, then print the title box and the metadata table."""
    os.system('clear')

    title_box = (
        "+---------------+",
        "| URL 2 SCRAPER |",
        "+---------------+\n",
    )
    info_table = (
        "----------------------------------------------------------------------|",
        "| Author: glyph |",
        "| Title: url2scraper.py |",
        "| Creation Date: 11/06/2019 |",
        "| Version Control: 1.0 - Draft Concept |",
        "| Description: Takes a host file and scrapes the website using GET |",
        "----------------------------------------------------------------------|",
    )
    for row in title_box + info_table:
        print(row)
def usage():
    """Print the command-line synopsis followed by an example invocation."""
    program = sys.argv[0]
    print(f"Usage: {program} -i <host file> -o <output file> -t")
    print("Example: " + program + " -i host.txt -o output.txt -t")
# Matches "http://" or "https://" followed by a run of word characters, dots,
# hyphens and %XX escapes; the match stops at the first other character
# (for example "/" or ":"). Raw string so "\w" and "\d" reach the regex engine
# without invalid-escape warnings.
_URL_PATTERN = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')


def _fetch_page(host):
    """Probe *host* with a HEAD request and return the response to scrape.

    A 301/302 answer triggers a GET over HTTPS, a 200 answer triggers a GET
    over HTTP; any other HEAD response is returned unchanged so the caller can
    report its status code.
    """
    probe = requests.head("http://" + host, timeout=1)
    if probe.status_code in (301, 302):
        return requests.get("https://" + host, timeout=1)
    if probe.status_code == 200:
        return requests.get("http://" + host, timeout=1)
    return probe


def _write_results(hrefs, output_file, terminal):
    """Print *hrefs* to the terminal or append them, one per line, to a file.

    The terminal is used when *terminal* is true or when no output file name
    was supplied, so results are never silently discarded.
    """
    if terminal or not output_file:
        print("[+] Writing to terminal")
        for href in hrefs:
            print(href)
        return
    print(f"[+] Writing results to file {output_file}")
    with open(output_file, 'a') as output:
        output.writelines(f"{href}\n" for href in hrefs)


def main(input_file, output_file, terminal):
    """Scrape every host listed in *input_file* for embedded URL references.

    Each non-blank line of the file is treated as a host name. The host is
    probed over HTTP, the resulting page is fetched, and every distinct URL
    matched by ``_URL_PATTERN`` in the page text is reported.

    Args:
        input_file: Path of a text file with one host per line.
        output_file: Path of the file results are appended to when *terminal*
            is false. When empty, results go to the terminal instead.
        terminal: When true, print results to the terminal for every host.

    Raises:
        OSError: If *input_file* cannot be opened.
    """
    with open(input_file, 'r') as f:
        print("[+] Scraping Websites for embedded HTTP References")
        for line in f:
            url = line.strip()
            if not url:
                # Blank lines would otherwise trigger a request to "http://".
                continue

            try:
                response = _fetch_page(url)
            except requests.RequestException as err:
                # Best effort: report the failure and move on to the next host.
                print(f"[-] Host {url} could not be reached: {err}")
                continue

            if response.status_code != 200:
                print(f"[-] Host {url} returned status {response.status_code}")
                continue

            print(f"[+] Host: {url}")
            print(f"[+] Looking for embedded URL references on {url}")
            # De-duplicate, then sort so the output order is reproducible.
            hrefs = sorted(set(_URL_PATTERN.findall(response.text)))
            try:
                _write_results(hrefs, output_file, terminal)
            except OSError as err:
                print(f"[-] Could not write results to {output_file}: {err}")
# Script entry point: show the banner, parse the command line, run the scraper.
if __name__ == '__main__':
    banner()
    if len(sys.argv) < 3:
        usage()
    else:
        try:
            # Skip the program name. "-i" and "-o" take a value, while "-t"
            # and "--terminal" are plain flags.
            opts, args = getopt.gnu_getopt(
                sys.argv[1:], "hi:o:t", ["ifile=", "ofile=", "terminal"]
            )
        except getopt.GetoptError as err:
            print(err)
            usage()
            sys.exit(2)

        for opt, arg in opts:
            if opt == "-h":
                usage()
                sys.exit()
            elif opt in ("-i", "--ifile"):
                # Use the parsed value, not a fixed argv position, so the
                # options may be given in any order.
                input_file = arg
            elif opt in ("-o", "--ofile"):
                output_file = arg
            elif opt in ("-t", "--terminal"):
                terminal = True

        if not input_file:
            # Without a host file there is nothing to scrape.
            usage()
            sys.exit(2)
        if not output_file:
            # No file to write to: show the results on the terminal instead.
            terminal = True

        main(input_file, output_file, terminal)