From d3466d3aa50342112178658f798f3486df3fc5dd Mon Sep 17 00:00:00 2001
From: KhasMek <Boushh@gmail.com>
Date: Sat, 30 Dec 2017 09:20:23 -0700
Subject: [PATCH] domain_history: fix false negative results

  Netcraft relies heavily on JavaScript to detect automated
  scraping and to render the page.
  Switching from requests to selenium and rendering the
  page with either PhantomJS, Chrome or Firefox circumvents
  this issue and actually returns results if present.
---
 README.md                |  1 +
 docs/index.md            |  4 +++-
 domain/domain_history.py | 48 +++++++++++++++++++++++++++++++---------
 requirements.txt         |  1 +
 4 files changed, 42 insertions(+), 12 deletions(-)
diff --git a/README.md b/README.md
index a58fb051..8ee7afae 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,7 @@ Options:
 # Required Setup:
 * Python 2.7 (because bunch of dependencies do not support Python 3.0)
 * Bunch of python libraries (use requirements.txt)
+* [PhantomJS](http://phantomjs.org), [Firefox](https://www.mozilla.org/firefox) or [Chrome](https://www.google.com/chrome) (to assist in rendering websites that are javascript heavy)
 
 
 ## Detailed Tool Documentation:
diff --git a/docs/index.md b/docs/index.md
index f115ebca..9be0127b 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -27,7 +27,9 @@ People can either write modules for DataSploit or can simpley import datasploit
 
 Worried about setup? We got you. You should be worried about two things:
 
-* Install the required python dependencies. Either use requirements.txt or simpley pip install datasploit. 
+* Install dependencies
+    - Python dependencies: Either use requirements.txt (`pip install -r requirements.txt`) if installed with `git clone` or simply `pip install datasploit` to install everything needed as a library.
+    - System dependencies: PhantomJS, Chrome or Firefox needs to be installed for certain modules to render javascript heavy websites.
 * Feeding specific API keys for few specific sources. We are going to have a knowledge base where step by step instructions to generate these API keys will be documented. Sweet deal? 
 * [Click here to check step by step setup guide](/setupGuide/)
 
diff --git a/domain/domain_history.py b/domain/domain_history.py
index ff8fe69c..3124c49f 100755
--- a/domain/domain_history.py
+++ b/domain/domain_history.py
@@ -2,9 +2,10 @@
 
 import base
 import sys
-import requests
 from bs4 import BeautifulSoup
 import re
+from selenium import webdriver
+from selenium.common.exceptions import WebDriverException
 from termcolor import colored
 import time
 
@@ -20,14 +21,31 @@ def netcraft_domain_history(domain):
     ip_history_dict = {}
     time.sleep(0.3)
     endpoint = "http://toolbar.netcraft.com/site_report?url=%s" % (domain)
-    req = requests.get(endpoint)
-
-    soup = BeautifulSoup(req.content, 'html.parser')
-    urls_parsed = soup.findAll('a', href=re.compile(r'.*netblock\?q.*'))
-    for url in urls_parsed:
-        if urls_parsed.index(url) != 0:
-            ip_history_dict[str(url).split('=')[2].split(">")[1].split("<")[0]] = str(url.parent.findNext('td')).strip(
-                "<td>").strip("</td>")
+    # These try blocks could be a for loop, but I wanted manual control
+    # over the order in which the webdrivers are tried.
+    driver = None
+    try:
+        webdriver.PhantomJS()
+        driver = webdriver.PhantomJS()
+    except WebDriverException:
+        try:
+            webdriver.Firefox().quit()
+            driver = webdriver.Firefox()
+        except WebDriverException:
+            try:
+                webdriver.Chrome().quit()
+                driver = webdriver.Chrome()
+            except WebDriverException:
+                ip_history_dict = { 'Error': 'No WebDriver Found!\nTry installing PhantomJS or adding the Chrome or Firefox binaries to your $PATH.'}
+    if driver:
+        driver.get(endpoint)
+        html = driver.page_source
+        soup = BeautifulSoup(html, 'html.parser')
+        urls_parsed = soup.findAll('a', href=re.compile(r'.*netblock\?q.*'))
+        for url in urls_parsed:
+            if urls_parsed.index(url) != 0:
+                ip_history_dict[url['href'].split('=')[1]] = url.get_text()
+        driver.quit()
     return ip_history_dict
 
 
@@ -40,8 +58,16 @@ def main(domain):
 
 
 def output(data, domain=""):
-    for x in data.keys():
-        print "%s: %s" % (data[x], x)
+    if len(data.keys()) > 0:
+        for x in data.keys():
+            if 'Error' in x:
+                print data[x]
+                data[x] = ''
+            else:
+                print "%s: %s" % (data[x], x)
+    else:
+        print colored(style.BOLD + '\n[!] No previous domain owners found!\n' +
+                      style.END, 'red')
     print "\n-----------------------------\n"
 
 
diff --git a/requirements.txt b/requirements.txt
index 3371c785..e9fc34eb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,6 +22,7 @@ python-whois
 pytz
 requests
 requests-file
+selenium
 simplejson
 termcolor
 tld