From d3466d3aa50342112178658f798f3486df3fc5dd Mon Sep 17 00:00:00 2001 From: KhasMek Date: Sat, 30 Dec 2017 09:20:23 -0700 Subject: [PATCH] domain_history: fix false negative results Netcraft is heavily reliant upon JavaScript to check for automated scraping and rendering of the page. Switching over to Selenium from requests and rendering the page with either PhantomJS, Chrome or Firefox circumvents this issue and actually returns results if present. --- README.md | 1 + docs/index.md | 4 +++- domain/domain_history.py | 48 +++++++++++++++++++++++++++++++--------- requirements.txt | 1 + 4 files changed, 42 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index a58fb051..8ee7afae 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ Options: # Required Setup: * Python 2.7 (because bunch of dependencies do not support Python 3.0) * Bunch of python libraries (use requirements.txt) +* [PhantomJS](http://phantomjs.org), [Firefox](https://www.mozilla.org/firefox) or [Chrome](https://www.google.com/chrome) (to assist in rendering websites that are javascript heavy) ## Detailed Tool Documentation: diff --git a/docs/index.md b/docs/index.md index f115ebca..9be0127b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -27,7 +27,9 @@ People can either write modules for DataSploit or can simpley import datasploit Worried about setup? We got you. You should be worried about two things: -* Install the required python dependencies. Either use requirements.txt or simpley pip install datasploit. +* Install dependencies + - Python dependencies: Either use requirements.txt (`pip install -r requirements.txt`) if installed with `git clone` or simply `pip install datasploit` to install everything needed as a library. + - System dependencies: PhantomJS, Chrome or Firefox needs to be installed for certain modules to render javascript heavy websites. * Feeding specific API keys for few specific sources. 
We are going to have a knowledge base where step by step instructions to generate these API keys will be documented. Sweet deal? * [Click here to check step by step setup guide](/setupGuide/) diff --git a/domain/domain_history.py b/domain/domain_history.py index ff8fe69c..3124c49f 100755 --- a/domain/domain_history.py +++ b/domain/domain_history.py @@ -2,9 +2,10 @@ import base import sys -import requests from bs4 import BeautifulSoup import re +from selenium import webdriver +from selenium.common.exceptions import WebDriverException from termcolor import colored import time @@ -20,14 +21,31 @@ def netcraft_domain_history(domain): ip_history_dict = {} time.sleep(0.3) endpoint = "http://toolbar.netcraft.com/site_report?url=%s" % (domain) - req = requests.get(endpoint) - - soup = BeautifulSoup(req.content, 'html.parser') - urls_parsed = soup.findAll('a', href=re.compile(r'.*netblock\?q.*')) - for url in urls_parsed: - if urls_parsed.index(url) != 0: - ip_history_dict[str(url).split('=')[2].split(">")[1].split("<")[0]] = str(url.parent.findNext('td')).strip( - "").strip("") + # These try's could be in a for loop, but I wanted manual control + # over the order in which the webdrivers were chosen. 
+ driver = None + try: + webdriver.PhantomJS() + driver = webdriver.PhantomJS() + except WebDriverException: + try: + webdriver.Firefox().quit() + driver = webdriver.Firefox() + except WebDriverException: + try: + webdriver.Chrome().quit() + driver = webdriver.Chrome() + except WebDriverException: + ip_history_dict = { 'Error': 'No WebDriver Found!\nTry installing PhantomJS or adding the Chrome or Firefox binaries to your $PATH.'} + if driver: + driver.get(endpoint) + html = driver.page_source + soup = BeautifulSoup(html, 'html.parser') + urls_parsed = soup.findAll('a', href=re.compile(r'.*netblock\?q.*')) + for url in urls_parsed: + if urls_parsed.index(url) != 0: + ip_history_dict[url['href'].split('=')[1]] = url.get_text() + driver.quit() return ip_history_dict @@ -40,8 +58,16 @@ def main(domain): def output(data, domain=""): - for x in data.keys(): - print "%s: %s" % (data[x], x) + if len(data.keys()) > 0: + for x in data.keys(): + if 'Error' in x: + print data[x] + data[x] = '' + else: + print "%s: %s" % (data[x], x) + else: + print colored(style.BOLD + '\n[!] No previous domain owners found!\n' + + style.END, 'red') print "\n-----------------------------\n" diff --git a/requirements.txt b/requirements.txt index 3371c785..e9fc34eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,6 +22,7 @@ python-whois pytz requests requests-file +selenium simplejson termcolor tld