diff --git a/lib/cfscrape/__init__.py b/lib/cfscrape/__init__.py index eed686a1b3..94bf8ac27d 100644 --- a/lib/cfscrape/__init__.py +++ b/lib/cfscrape/__init__.py @@ -1,30 +1,43 @@ +import json import logging import random import re +import ssl import subprocess import copy import time +import os +from base64 import b64encode +from collections import OrderedDict from requests.sessions import Session +from requests.compat import urlparse, urlunparse +from requests.exceptions import RequestException -try: - from urlparse import urlparse -except ImportError: - from urllib.parse import urlparse -__version__ = "1.9.7" +__version__ = "2.0.3" -DEFAULT_USER_AGENTS = [ - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/65.0.3325.181 Chrome/65.0.3325.181 Safari/537.36", - "Mozilla/5.0 (Linux; Android 7.0; Moto G (5) Build/NPPS25.137-93-8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.137 Mobile Safari/537.36", - "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0_4 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11B554a Safari/9537.53", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:59.0) Gecko/20100101 Firefox/59.0", - "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0" -] +USER_AGENTS_PATH = os.path.join(os.path.dirname(__file__), "user_agents.json") -DEFAULT_USER_AGENT = random.choice(DEFAULT_USER_AGENTS) +with open(USER_AGENTS_PATH) as f: + user_agents = json.load(f) + +DEFAULT_USER_AGENT = random.choice(user_agents) + +DEFAULT_HEADERS = OrderedDict( + ( + ("Host", None), + ("Connection", "keep-alive"), + ("Upgrade-Insecure-Requests", "1"), + ("User-Agent", DEFAULT_USER_AGENT), + ( + "Accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", + ), + ("Accept-Language", "en-US,en;q=0.9"), + ("Accept-Encoding", "gzip, deflate"), + ) +) BUG_REPORT = """\ Cloudflare may have changed their technique, or there may be a bug in the script. @@ -43,32 +56,69 @@ https://github.com/Anorov/cloudflare-scrape/issues\ """ + +class CloudflareError(RequestException): + pass + + +class CloudflareCaptchaError(CloudflareError): + pass + + class CloudflareScraper(Session): def __init__(self, *args, **kwargs): - self.delay = kwargs.pop("delay", 8) + self.delay = kwargs.pop("delay", None) + # Use headers with a random User-Agent if no custom headers have been set + headers = OrderedDict(kwargs.pop("headers", DEFAULT_HEADERS)) + + # Set the User-Agent header if it was not provided + headers.setdefault("User-Agent", DEFAULT_USER_AGENT) + super(CloudflareScraper, self).__init__(*args, **kwargs) - if "requests" in self.headers["User-Agent"]: - # Set a random User-Agent if no custom User-Agent has been set - self.headers["User-Agent"] = DEFAULT_USER_AGENT + # Define headers to force using an OrderedDict and preserve header order + self.headers = headers - def is_cloudflare_challenge(self, resp): + @staticmethod + def is_cloudflare_iuam_challenge(resp): return ( - resp.status_code == 503 + resp.status_code in (503, 429) and resp.headers.get("Server", "").startswith("cloudflare") and b"jschl_vc" in resp.content and b"jschl_answer" in resp.content ) + @staticmethod + def is_cloudflare_captcha_challenge(resp): + return ( + resp.status_code == 403 + and resp.headers.get("Server", "").startswith("cloudflare") + and b"/cdn-cgi/l/chk_captcha" in resp.content + ) + def request(self, method, url, *args, **kwargs): resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs) - # Check if Cloudflare anti-bot is on - if self.is_cloudflare_challenge(resp): + # Check if Cloudflare captcha challenge is presented + if self.is_cloudflare_captcha_challenge(resp): + self.handle_captcha_challenge(resp, url) + + # Check if Cloudflare anti-bot "I'm Under Attack Mode" is enabled + if self.is_cloudflare_iuam_challenge(resp): resp = self.solve_cf_challenge(resp, **kwargs) return resp + def handle_captcha_challenge(self, resp, url): + error = ( + "Cloudflare captcha challenge presented for %s (cfscrape cannot solve captchas)" + % urlparse(url).netloc + ) + if ssl.OPENSSL_VERSION_NUMBER < 0x10101000: + error += ". Your OpenSSL version is lower than 1.1.1. Please upgrade your OpenSSL library and recompile Python." + + raise CloudflareCaptchaError(error, response=resp) + def solve_cf_challenge(self, resp, **original_kwargs): start_time = time.time() @@ -78,23 +128,31 @@ def solve_cf_challenge(self, resp, **original_kwargs): submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain) cloudflare_kwargs = copy.deepcopy(original_kwargs) - params = cloudflare_kwargs.setdefault("params", {}) + headers = cloudflare_kwargs.setdefault("headers", {}) headers["Referer"] = resp.url try: - params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"', body).group(1) - params["pass"] = re.search(r'name="pass" value="(.+?)"', body).group(1) - params["s"] = re.search(r'name="s"\svalue="(?P[^"]+)', body).group('s_value') + params = cloudflare_kwargs["params"] = OrderedDict( + re.findall(r'name="(s|jschl_vc|pass)"(?: [^<>]*)? value="(.+?)"', body) + ) + + for k in ("jschl_vc", "pass"): + if k not in params: + raise ValueError("%s is missing from challenge form" % k) except Exception as e: # Something is wrong with the page. # This may indicate Cloudflare has changed their anti-bot # technique. If you see this and are running the latest version, # please open a GitHub issue so I can update the code accordingly. - raise ValueError("Unable to parse Cloudflare anti-bots page: %s %s" % (e.message, BUG_REPORT)) + raise ValueError( + "Unable to parse Cloudflare anti-bot IUAM page: %s %s" + % (e.message, BUG_REPORT) + ) # Solve the Javascript challenge - params["jschl_answer"] = self.solve_challenge(body, domain) + answer, delay = self.solve_challenge(body, domain) + params["jschl_answer"] = answer # Requests transforms any request into a GET after a redirect, # so the redirect has to be handled manually here to allow for @@ -102,44 +160,104 @@ def solve_cf_challenge(self, resp, **original_kwargs): method = resp.request.method cloudflare_kwargs["allow_redirects"] = False - end_time = time.time() - time.sleep(self.delay - (end_time - start_time)) # Cloudflare requires a delay before solving the challenge + # Cloudflare requires a delay before solving the challenge + time.sleep(max(delay - (time.time() - start_time), 0)) + # Send the challenge response and handle the redirect manually redirect = self.request(method, submit_url, **cloudflare_kwargs) - redirect_location = urlparse(redirect.headers["Location"]) + if not redirect_location.netloc: - redirect_url = "%s://%s%s" % (parsed_url.scheme, domain, redirect_location.path) + redirect_url = urlunparse( + ( + parsed_url.scheme, + domain, + redirect_location.path, + redirect_location.params, + redirect_location.query, + redirect_location.fragment, + ) + ) return self.request(method, redirect_url, **original_kwargs) return self.request(method, redirect.headers["Location"], **original_kwargs) def solve_challenge(self, body, domain): try: - js = re.search(r"setTimeout\(function\(\){\s+(var " - "s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n", body).group(1) + challenge, ms = re.search( + r"setTimeout\(function\(\){\s*(var " + r"s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value\s*=.+?)\r?\n" + r"(?:[^{<>]*},\s*(\d{4,}))?", + body, + ).groups() + + # The challenge requires `document.getElementById` to get this content. + # Future proofing would require escaping newlines and double quotes + innerHTML = re.search(r"]*)? id=\"cf-dn.*?\">([^<>]*)", body) + innerHTML = innerHTML.group(1) if innerHTML else "" + + # Prefix the challenge with a fake document object. + # Interpolate the domain, div contents, and JS challenge. + # The `a.value` to be returned is tacked onto the end. + challenge = """ + var document = { + createElement: function () { + return { firstChild: { href: "http://%s/" } } + }, + getElementById: function () { + return {"innerHTML": "%s"}; + } + }; + + %s; a.value + """ % ( + domain, + innerHTML, + challenge, + ) + # Encode the challenge for security while preserving quotes and spacing. + challenge = b64encode(challenge.encode("utf-8")).decode("ascii") + # Use the provided delay, parsed delay, or default to 8 secs + delay = self.delay or (float(ms) / float(1000) if ms else 8) except Exception: - raise ValueError("Unable to identify Cloudflare IUAM Javascript on website. %s" % BUG_REPORT) - - js = re.sub(r"a\.value = (.+ \+ t\.length(\).toFixed\(10\))?).+", r"\1", js) - js = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", js).replace("t.length", str(len(domain))) - - # Strip characters that could be used to exit the string context - # These characters are not currently used in Cloudflare's arithmetic snippet - js = re.sub(r"[\n\\']", "", js) - - if "toFixed" not in js: - raise ValueError("Error parsing Cloudflare IUAM Javascript challenge. %s" % BUG_REPORT) + raise ValueError( + "Unable to identify Cloudflare IUAM Javascript on website. %s" + % BUG_REPORT + ) # Use vm.runInNewContext to safely evaluate code # The sandboxed code cannot use the Node.js standard library - js = "console.log(require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000}));" % js + js = ( + """\ + var atob = Object.setPrototypeOf(function (str) {\ + try {\ + return Buffer.from("" + str, "base64").toString("binary");\ + } catch (e) {}\ + }, null);\ + var challenge = atob("%s");\ + var context = Object.setPrototypeOf({ atob: atob }, null);\ + var options = {\ + filename: "iuam-challenge.js",\ + contextOrigin: "cloudflare:iuam-challenge.js",\ + contextCodeGeneration: { strings: true, wasm: false },\ + timeout: 5000\ + };\ + process.stdout.write(String(\ + require("vm").runInNewContext(challenge, context, options)\ + ));\ + """ + % challenge + ) try: - result = subprocess.check_output(["node", "-e", js]).strip() + result = subprocess.check_output( + ["node", "-e", js], stdin=subprocess.PIPE, stderr=subprocess.PIPE + ) except OSError as e: if e.errno == 2: - raise EnvironmentError("Missing Node.js runtime. Node is required and must be in the PATH (check with `node -v`). Your Node binary may be called `nodejs` rather than `node`, in which case you may need to run `apt-get install nodejs-legacy` on some Debian-based systems. (Please read the cfscrape" - " README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.") + raise EnvironmentError( + "Missing Node.js runtime. Node is required and must be in the PATH (check with `node -v`). Your Node binary may be called `nodejs` rather than `node`, in which case you may need to run `apt-get install nodejs-legacy` on some Debian-based systems. (Please read the cfscrape" + " README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies." + ) raise except Exception: logging.error("Error executing Cloudflare IUAM Javascript. %s" % BUG_REPORT) @@ -148,9 +266,11 @@ def solve_challenge(self, body, domain): try: float(result) except Exception: - raise ValueError("Cloudflare IUAM challenge returned unexpected answer. %s" % BUG_REPORT) + raise ValueError( + "Cloudflare IUAM challenge returned unexpected answer. %s" % BUG_REPORT + ) - return result + return result, delay @classmethod def create_scraper(cls, sess=None, **kwargs): @@ -160,7 +280,16 @@ def create_scraper(cls, sess=None, **kwargs): scraper = cls(**kwargs) if sess: - attrs = ["auth", "cert", "cookies", "headers", "hooks", "params", "proxies", "data"] + attrs = [ + "auth", + "cert", + "cookies", + "headers", + "hooks", + "params", + "proxies", + "data", + ] for attr in attrs: val = getattr(sess, attr, None) if val: @@ -168,7 +297,6 @@ def create_scraper(cls, sess=None, **kwargs): return scraper - ## Functions for integrating cloudflare-scrape with other applications and scripts @classmethod @@ -180,7 +308,7 @@ def get_tokens(cls, url, user_agent=None, **kwargs): try: resp = scraper.get(url, **kwargs) resp.raise_for_status() - except Exception as e: + except Exception: logging.error("'%s' returned an error. Could not collect tokens." % url) raise @@ -192,14 +320,19 @@ def get_tokens(cls, url, user_agent=None, **kwargs): cookie_domain = d break else: - raise ValueError("Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM (\"I'm Under Attack Mode\") enabled?") + raise ValueError( + 'Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM ("I\'m Under Attack Mode") enabled?' + ) - return ({ - "__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain), - "cf_clearance": scraper.cookies.get("cf_clearance", "", domain=cookie_domain) - }, - scraper.headers["User-Agent"] - ) + return ( + { + "__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain), + "cf_clearance": scraper.cookies.get( + "cf_clearance", "", domain=cookie_domain + ), + }, + scraper.headers["User-Agent"], + ) @classmethod def get_cookie_string(cls, url, user_agent=None, **kwargs): @@ -209,6 +342,7 @@ def get_cookie_string(cls, url, user_agent=None, **kwargs): tokens, user_agent = cls.get_tokens(url, user_agent=user_agent, **kwargs) return "; ".join("=".join(pair) for pair in tokens.items()), user_agent + create_scraper = CloudflareScraper.create_scraper get_tokens = CloudflareScraper.get_tokens -get_cookie_string = CloudflareScraper.get_cookie_string +get_cookie_string = CloudflareScraper.get_cookie_string \ No newline at end of file diff --git a/lib/cfscrape/user_agents.json b/lib/cfscrape/user_agents.json new file mode 100644 index 0000000000..0d46b97fb4 --- /dev/null +++ b/lib/cfscrape/user_agents.json @@ -0,0 +1,32 @@ +[ +"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36", +"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36", +"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36", +"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36", +"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36", +"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36", +"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36", +"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36", +"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36", +"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36", +"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36", +"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36", +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36" +] \ No newline at end of file