Skip to content

Commit

Permalink
Bump cfscrape version to 2.0.0
Browse files Browse the repository at this point in the history
Includes latest challenge update: Anorov/cloudflare-scrape#234
  • Loading branch information
CodyWoolaver authored and Cody Woolaver committed May 13, 2019
1 parent b3da866 commit 722ad62
Show file tree
Hide file tree
Showing 2 changed files with 229 additions and 63 deletions.
260 changes: 197 additions & 63 deletions lib/cfscrape/__init__.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,43 @@
import json
import logging
import random
import re
import ssl
import subprocess
import copy
import time
import os
from base64 import b64encode
from collections import OrderedDict

from requests.sessions import Session
from requests.compat import urlparse, urlunparse
from requests.exceptions import RequestException

try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse

__version__ = "1.9.7"
__version__ = "2.0.3"

# Built-in pool of browser User-Agent strings; used as a fallback when the
# user_agents.json file next to this module cannot be read or is empty.
DEFAULT_USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/65.0.3325.181 Chrome/65.0.3325.181 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 7.0; Moto G (5) Build/NPPS25.137-93-8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.137 Mobile Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0_4 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11B554a Safari/9537.53",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:59.0) Gecko/20100101 Firefox/59.0",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0",
]

# Location of the bundled User-Agent list, resolved relative to this module.
USER_AGENTS_PATH = os.path.join(os.path.dirname(__file__), "user_agents.json")

try:
    with open(USER_AGENTS_PATH) as f:
        user_agents = json.load(f)
except (IOError, OSError, ValueError):
    # Missing/unreadable file or malformed JSON: do not break the import,
    # fall back to the built-in list below.
    user_agents = None

if not user_agents:
    user_agents = list(DEFAULT_USER_AGENTS)

# Chosen once at import time and reused as the default User-Agent header.
DEFAULT_USER_AGENT = random.choice(user_agents)

# Headers used when the caller supplies none. They live in an OrderedDict so
# the insertion order below is the order that is kept.
DEFAULT_HEADERS = OrderedDict()
DEFAULT_HEADERS["Host"] = None
DEFAULT_HEADERS["Connection"] = "keep-alive"
DEFAULT_HEADERS["Upgrade-Insecure-Requests"] = "1"
DEFAULT_HEADERS["User-Agent"] = DEFAULT_USER_AGENT
DEFAULT_HEADERS["Accept"] = (
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
)
DEFAULT_HEADERS["Accept-Language"] = "en-US,en;q=0.9"
DEFAULT_HEADERS["Accept-Encoding"] = "gzip, deflate"

BUG_REPORT = """\
Cloudflare may have changed their technique, or there may be a bug in the script.
Expand All @@ -43,32 +56,69 @@
https://github.com/Anorov/cloudflare-scrape/issues\
"""


class CloudflareError(RequestException):
    """Base class for Cloudflare-related errors raised by this module."""


class CloudflareCaptchaError(CloudflareError):
    """Raised when Cloudflare presents a captcha challenge, which cfscrape cannot solve."""


class CloudflareScraper(Session):
def __init__(self, *args, **kwargs):
    """Create a scraper session with ordered, browser-like default headers.

    Keyword arguments consumed here (everything else is forwarded to the
    parent ``Session`` constructor):

    delay -- value stored on ``self.delay``; defaults to ``None``.
        ``solve_challenge`` uses it when truthy and otherwise picks the
        delay parsed from the challenge page, or 8 seconds.
    headers -- mapping of request headers; defaults to ``DEFAULT_HEADERS``.
        It is copied into a new ``OrderedDict`` so the caller's mapping is
        not modified.
    """
    # Pop exactly once: a second pop would discard a caller-supplied delay.
    self.delay = kwargs.pop("delay", None)

    # Use headers with a random User-Agent if no custom headers have been set
    headers = OrderedDict(kwargs.pop("headers", DEFAULT_HEADERS))

    # Set the User-Agent header if it was not provided
    headers.setdefault("User-Agent", DEFAULT_USER_AGENT)

    super(CloudflareScraper, self).__init__(*args, **kwargs)

    # Define headers to force using an OrderedDict and preserve header order
    self.headers = headers

@staticmethod
def is_cloudflare_iuam_challenge(resp):
    """Return True if ``resp`` looks like a Cloudflare IUAM JavaScript challenge.

    All of the following must hold: the status code is 503 or 429, the
    ``Server`` header starts with ``cloudflare``, and the body contains
    both the ``jschl_vc`` and ``jschl_answer`` markers.
    """
    return (
        resp.status_code in (503, 429)
        and resp.headers.get("Server", "").startswith("cloudflare")
        and b"jschl_vc" in resp.content
        and b"jschl_answer" in resp.content
    )


# Previous name of this check, kept as an alias so existing callers
# (``self.is_cloudflare_challenge(resp)``) continue to work.
is_cloudflare_challenge = is_cloudflare_iuam_challenge

@staticmethod
def is_cloudflare_captcha_challenge(resp):
    """Return True if ``resp`` looks like a Cloudflare captcha challenge page.

    All of the following must hold: the status code is 403, the ``Server``
    header starts with ``cloudflare``, and the body references the
    ``/cdn-cgi/l/chk_captcha`` endpoint.
    """
    return (
        resp.status_code == 403
        and resp.headers.get("Server", "").startswith("cloudflare")
        and b"/cdn-cgi/l/chk_captcha" in resp.content
    )

def request(self, method, url, *args, **kwargs):
    """Send a request and handle a Cloudflare challenge in the response.

    The request is first performed by the parent ``Session``. If the
    response is a captcha challenge, ``handle_captcha_challenge`` is called
    (it raises). If the response is an IUAM JavaScript challenge, the
    result of ``solve_cf_challenge`` is returned instead of the challenge
    page. Otherwise the original response is returned unchanged.
    """
    resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)

    # Check if Cloudflare captcha challenge is presented
    if self.is_cloudflare_captcha_challenge(resp):
        self.handle_captcha_challenge(resp, url)

    # Check if Cloudflare anti-bot "I'm Under Attack Mode" is enabled
    if self.is_cloudflare_iuam_challenge(resp):
        resp = self.solve_cf_challenge(resp, **kwargs)

    return resp

def handle_captcha_challenge(self, resp, url):
    """Raise ``CloudflareCaptchaError`` for a captcha challenge response.

    resp -- the challenge response; attached to the raised exception.
    url -- the requested URL; only its network location appears in the
        error message.

    When the linked OpenSSL is older than 1.1.1, an upgrade hint is
    appended to the message.
    """
    error = (
        "Cloudflare captcha challenge presented for %s (cfscrape cannot solve captchas)"
        % urlparse(url).netloc
    )
    if ssl.OPENSSL_VERSION_NUMBER < 0x10101000:
        error += ". Your OpenSSL version is lower than 1.1.1. Please upgrade your OpenSSL library and recompile Python."

    raise CloudflareCaptchaError(error, response=resp)

def solve_cf_challenge(self, resp, **original_kwargs):
start_time = time.time()

Expand All @@ -78,68 +128,136 @@ def solve_cf_challenge(self, resp, **original_kwargs):
submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain)

cloudflare_kwargs = copy.deepcopy(original_kwargs)
params = cloudflare_kwargs.setdefault("params", {})

headers = cloudflare_kwargs.setdefault("headers", {})
headers["Referer"] = resp.url

try:
params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"', body).group(1)
params["pass"] = re.search(r'name="pass" value="(.+?)"', body).group(1)
params["s"] = re.search(r'name="s"\svalue="(?P<s_value>[^"]+)', body).group('s_value')
params = cloudflare_kwargs["params"] = OrderedDict(
re.findall(r'name="(s|jschl_vc|pass)"(?: [^<>]*)? value="(.+?)"', body)
)

for k in ("jschl_vc", "pass"):
if k not in params:
raise ValueError("%s is missing from challenge form" % k)
except Exception as e:
# Something is wrong with the page.
# This may indicate Cloudflare has changed their anti-bot
# technique. If you see this and are running the latest version,
# please open a GitHub issue so I can update the code accordingly.
raise ValueError("Unable to parse Cloudflare anti-bots page: %s %s" % (e.message, BUG_REPORT))
raise ValueError(
"Unable to parse Cloudflare anti-bot IUAM page: %s %s"
% (e.message, BUG_REPORT)
)

# Solve the Javascript challenge
params["jschl_answer"] = self.solve_challenge(body, domain)
answer, delay = self.solve_challenge(body, domain)
params["jschl_answer"] = answer

# Requests transforms any request into a GET after a redirect,
# so the redirect has to be handled manually here to allow for
# performing other types of requests even as the first request.
method = resp.request.method
cloudflare_kwargs["allow_redirects"] = False

end_time = time.time()
time.sleep(self.delay - (end_time - start_time)) # Cloudflare requires a delay before solving the challenge
# Cloudflare requires a delay before solving the challenge
time.sleep(max(delay - (time.time() - start_time), 0))

# Send the challenge response and handle the redirect manually
redirect = self.request(method, submit_url, **cloudflare_kwargs)

redirect_location = urlparse(redirect.headers["Location"])

if not redirect_location.netloc:
redirect_url = "%s://%s%s" % (parsed_url.scheme, domain, redirect_location.path)
redirect_url = urlunparse(
(
parsed_url.scheme,
domain,
redirect_location.path,
redirect_location.params,
redirect_location.query,
redirect_location.fragment,
)
)
return self.request(method, redirect_url, **original_kwargs)
return self.request(method, redirect.headers["Location"], **original_kwargs)

def solve_challenge(self, body, domain):
try:
js = re.search(r"setTimeout\(function\(\){\s+(var "
"s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n", body).group(1)
challenge, ms = re.search(
r"setTimeout\(function\(\){\s*(var "
r"s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value\s*=.+?)\r?\n"
r"(?:[^{<>]*},\s*(\d{4,}))?",
body,
).groups()

# The challenge requires `document.getElementById` to get this content.
# Future proofing would require escaping newlines and double quotes
innerHTML = re.search(r"<div(?: [^<>]*)? id=\"cf-dn.*?\">([^<>]*)", body)
innerHTML = innerHTML.group(1) if innerHTML else ""

# Prefix the challenge with a fake document object.
# Interpolate the domain, div contents, and JS challenge.
# The `a.value` to be returned is tacked onto the end.
challenge = """
var document = {
createElement: function () {
return { firstChild: { href: "http://%s/" } }
},
getElementById: function () {
return {"innerHTML": "%s"};
}
};
%s; a.value
""" % (
domain,
innerHTML,
challenge,
)
# Encode the challenge for security while preserving quotes and spacing.
challenge = b64encode(challenge.encode("utf-8")).decode("ascii")
# Use the provided delay, parsed delay, or default to 8 secs
delay = self.delay or (float(ms) / float(1000) if ms else 8)
except Exception:
raise ValueError("Unable to identify Cloudflare IUAM Javascript on website. %s" % BUG_REPORT)

js = re.sub(r"a\.value = (.+ \+ t\.length(\).toFixed\(10\))?).+", r"\1", js)
js = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", js).replace("t.length", str(len(domain)))

# Strip characters that could be used to exit the string context
# These characters are not currently used in Cloudflare's arithmetic snippet
js = re.sub(r"[\n\\']", "", js)

if "toFixed" not in js:
raise ValueError("Error parsing Cloudflare IUAM Javascript challenge. %s" % BUG_REPORT)
raise ValueError(
"Unable to identify Cloudflare IUAM Javascript on website. %s"
% BUG_REPORT
)

# Use vm.runInNewContext to safely evaluate code
# The sandboxed code cannot use the Node.js standard library
js = "console.log(require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000}));" % js
js = (
"""\
var atob = Object.setPrototypeOf(function (str) {\
try {\
return Buffer.from("" + str, "base64").toString("binary");\
} catch (e) {}\
}, null);\
var challenge = atob("%s");\
var context = Object.setPrototypeOf({ atob: atob }, null);\
var options = {\
filename: "iuam-challenge.js",\
contextOrigin: "cloudflare:iuam-challenge.js",\
contextCodeGeneration: { strings: true, wasm: false },\
timeout: 5000\
};\
process.stdout.write(String(\
require("vm").runInNewContext(challenge, context, options)\
));\
"""
% challenge
)

try:
result = subprocess.check_output(["node", "-e", js]).strip()
result = subprocess.check_output(
["node", "-e", js], stdin=subprocess.PIPE, stderr=subprocess.PIPE
)
except OSError as e:
if e.errno == 2:
raise EnvironmentError("Missing Node.js runtime. Node is required and must be in the PATH (check with `node -v`). Your Node binary may be called `nodejs` rather than `node`, in which case you may need to run `apt-get install nodejs-legacy` on some Debian-based systems. (Please read the cfscrape"
" README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.")
raise EnvironmentError(
"Missing Node.js runtime. Node is required and must be in the PATH (check with `node -v`). Your Node binary may be called `nodejs` rather than `node`, in which case you may need to run `apt-get install nodejs-legacy` on some Debian-based systems. (Please read the cfscrape"
" README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies."
)
raise
except Exception:
logging.error("Error executing Cloudflare IUAM Javascript. %s" % BUG_REPORT)
Expand All @@ -148,9 +266,11 @@ def solve_challenge(self, body, domain):
try:
float(result)
except Exception:
raise ValueError("Cloudflare IUAM challenge returned unexpected answer. %s" % BUG_REPORT)
raise ValueError(
"Cloudflare IUAM challenge returned unexpected answer. %s" % BUG_REPORT
)

return result
return result, delay

@classmethod
def create_scraper(cls, sess=None, **kwargs):
Expand All @@ -160,15 +280,23 @@ def create_scraper(cls, sess=None, **kwargs):
scraper = cls(**kwargs)

if sess:
attrs = ["auth", "cert", "cookies", "headers", "hooks", "params", "proxies", "data"]
attrs = [
"auth",
"cert",
"cookies",
"headers",
"hooks",
"params",
"proxies",
"data",
]
for attr in attrs:
val = getattr(sess, attr, None)
if val:
setattr(scraper, attr, val)

return scraper


## Functions for integrating cloudflare-scrape with other applications and scripts

@classmethod
Expand All @@ -180,7 +308,7 @@ def get_tokens(cls, url, user_agent=None, **kwargs):
try:
resp = scraper.get(url, **kwargs)
resp.raise_for_status()
except Exception as e:
except Exception:
logging.error("'%s' returned an error. Could not collect tokens." % url)
raise

Expand All @@ -192,14 +320,19 @@ def get_tokens(cls, url, user_agent=None, **kwargs):
cookie_domain = d
break
else:
raise ValueError("Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM (\"I'm Under Attack Mode\") enabled?")
raise ValueError(
'Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM ("I\'m Under Attack Mode") enabled?'
)

return ({
"__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain),
"cf_clearance": scraper.cookies.get("cf_clearance", "", domain=cookie_domain)
},
scraper.headers["User-Agent"]
)
return (
{
"__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain),
"cf_clearance": scraper.cookies.get(
"cf_clearance", "", domain=cookie_domain
),
},
scraper.headers["User-Agent"],
)

@classmethod
def get_cookie_string(cls, url, user_agent=None, **kwargs):
Expand All @@ -209,6 +342,7 @@ def get_cookie_string(cls, url, user_agent=None, **kwargs):
tokens, user_agent = cls.get_tokens(url, user_agent=user_agent, **kwargs)
return "; ".join("=".join(pair) for pair in tokens.items()), user_agent


# Module-level shortcuts to the CloudflareScraper classmethods.
create_scraper = CloudflareScraper.create_scraper
get_tokens = CloudflareScraper.get_tokens
get_cookie_string = CloudflareScraper.get_cookie_string
Loading

0 comments on commit 722ad62

Please sign in to comment.