Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make chardet/charset_normalizer optional dependencies #5875

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 3 additions & 28 deletions requests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,7 @@
import warnings
from .exceptions import RequestsDependencyWarning

try:
from charset_normalizer import __version__ as charset_normalizer_version
except ImportError:
charset_normalizer_version = None

try:
from chardet import __version__ as chardet_version
except ImportError:
chardet_version = None

def check_compatibility(urllib3_version, chardet_version, charset_normalizer_version):
def check_compatibility(urllib3_version):
urllib3_version = urllib3_version.split('.')
assert urllib3_version != ['dev'] # Verify urllib3 isn't installed from git.

Expand All @@ -70,20 +60,6 @@ def check_compatibility(urllib3_version, chardet_version, charset_normalizer_ver
assert minor >= 21
assert minor <= 26

# Check charset_normalizer for compatibility.
if chardet_version:
major, minor, patch = chardet_version.split('.')[:3]
major, minor, patch = int(major), int(minor), int(patch)
# chardet_version >= 3.0.2, < 5.0.0
assert (3, 0, 2) <= (major, minor, patch) < (5, 0, 0)
elif charset_normalizer_version:
major, minor, patch = charset_normalizer_version.split('.')[:3]
major, minor, patch = int(major), int(minor), int(patch)
# charset_normalizer >= 2.0.0 < 3.0.0
assert (2, 0, 0) <= (major, minor, patch) < (3, 0, 0)
else:
raise Exception("You need either charset_normalizer or chardet installed")

def _check_cryptography(cryptography_version):
# cryptography < 1.3.4
try:
Expand All @@ -97,10 +73,9 @@ def _check_cryptography(cryptography_version):

# Check imported dependencies for compatibility.
try:
check_compatibility(urllib3.__version__, chardet_version, charset_normalizer_version)
check_compatibility(urllib3.__version__)
except (AssertionError, ValueError):
warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported "
"version!".format(urllib3.__version__, chardet_version, charset_normalizer_version),
warnings.warn("urllib3 ({}) doesn't match a supported version!".format(urllib3.__version__),
RequestsDependencyWarning)

# Attempt to enable urllib3's fallback for SNI support
Expand Down
9 changes: 8 additions & 1 deletion requests/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,14 @@
try:
import chardet
except ImportError:
import charset_normalizer as chardet
try:
import charset_normalizer as chardet
import warnings

warnings.filterwarnings('ignore', 'Trying to detect', module='charset_normalizer')
except ImportError:
chardet = None


import sys

Expand Down
22 changes: 20 additions & 2 deletions requests/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -732,7 +732,17 @@ def next(self):
@property
def apparent_encoding(self):
"""The apparent encoding, provided by the charset_normalizer or chardet libraries."""
return chardet.detect(self.content)['encoding']
# If chardet/charset_normalizer is available, use it.
if chardet:
return chardet.detect(self.content)['encoding']
# Fall back to trying simpler, dumber means.
for encoding in ("ascii", "utf-8"):
try:
self.content.decode(encoding, "strict")
return encoding
except UnicodeDecodeError:
pass
raise ContentDecodingError("Unable to detect response encoding")

def iter_content(self, chunk_size=1, decode_unicode=False):
"""Iterates over the response data. When stream=True is set on the
Expand Down Expand Up @@ -862,7 +872,15 @@ def text(self):

# Fallback to auto-detected encoding.
if self.encoding is None:
encoding = self.apparent_encoding
try:
encoding = self.apparent_encoding
except ContentDecodingError:
raise ContentDecodingError(
"Unable to automatically detect the response's encoding. "
"If you know the response's encoding, you can set it manually (`.encoding`), or "
"install either the `chardet` or `charset_normalizer` library to make automatic "
"detection smarter."
)

# Decode unicode from given encoding.
try:
Expand Down
17 changes: 6 additions & 11 deletions requests/packages.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
import sys

try:
import chardet
except ImportError:
import charset_normalizer as chardet
import warnings

warnings.filterwarnings('ignore', 'Trying to detect', module='charset_normalizer')
from requests.compat import chardet

# This code exists for backwards compatibility reasons.
# I don't like it either. Just look the other way. :)
Expand All @@ -19,8 +13,9 @@
if mod == package or mod.startswith(package + '.'):
sys.modules['requests.packages.' + mod] = sys.modules[mod]

target = chardet.__name__
for mod in list(sys.modules):
if mod == target or mod.startswith(target + '.'):
sys.modules['requests.packages.' + target.replace(target, 'chardet')] = sys.modules[mod]
# Expose whichever character-detection library was imported under the legacy
# ``requests.packages.chardet`` namespace. Skipped when no library is available.
if chardet:
    target = chardet.__name__
    for mod in list(sys.modules):
        if mod == target or mod.startswith(target + '.'):
            # Swap only the leading package name so every submodule gets its
            # own alias (``<target>.x`` -> ``requests.packages.chardet.x``).
            # The previous ``target.replace(target, 'chardet')`` always
            # evaluated to plain 'chardet', so each submodule overwrote the
            # package-level entry and no submodule alias was ever registered.
            alias = 'chardet' + mod[len(target):]
            sys.modules['requests.packages.' + alias] = sys.modules[mod]
# Kinda cool, though, right?
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ def run_tests(self):
packages = ['requests']

requires = [
'charset_normalizer~=2.0.0; python_version >= "3"',
'chardet>=3.0.2,<5; python_version < "3"',
'idna>=2.5,<3; python_version < "3"',
'idna>=2.5,<4; python_version >= "3"',
'urllib3>=1.21.1,<1.27',
Expand Down Expand Up @@ -104,7 +102,8 @@ def run_tests(self):
'security': [],
'socks': ['PySocks>=1.5.6, !=1.5.7'],
'socks:sys_platform == "win32" and python_version == "2.7"': ['win_inet_pton'],
'use_chardet_on_py3': ['chardet>=3.0.2,<5']
'chardet': ['chardet>=3.0.2,<5'],
'charset_normalizer': ['charset_normalizer~=2.0.0; python_version >= "3"'],
},
project_urls={
'Documentation': 'https://requests.readthedocs.io',
Expand Down
69 changes: 69 additions & 0 deletions tests/test_testserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,77 @@ def test_text_response(self):

assert r.status_code == 200
assert r.text == u'roflol'
assert not r.encoding
assert r.apparent_encoding == 'ascii'
assert r.headers['Content-Length'] == '6'

def test_text_response_utf_8(self, mocker):
"""
Check that `.apparent_encoding` reports UTF-8 for a UTF-8 body
when `requests.models.chardet` has been patched to None.
"""
# Replace the detection library seen by `requests.models` with None.
mocker.patch('requests.models.chardet', new=None)
response_unicode = u"Törkylempijävongahdus"
# Content-Length counts bytes, so measure the UTF-8 encoded form.
response_length = len(response_unicode.encode("utf-8"))
# `text_response_server` takes care of encoding to UTF-8 internally
server = Server.text_response_server((
u"HTTP/1.1 200 OK\r\n"
"Content-Length: {}\r\n"
"\r\n"
"{}"
).format(response_length, response_unicode))

with server as (host, port):
r = requests.get('http://{}:{}'.format(host, port))

assert r.status_code == 200
assert r.text == response_unicode
# The canned response carries only a Content-Length header, and
# `.encoding` is expected to be empty.
assert not r.encoding
assert r.apparent_encoding == 'utf-8'
assert r.headers['Content-Length'] == str(response_length)

@pytest.mark.parametrize('text, encoding', [
(u"Törkylempijävongahdus", 'utf_16_le'),
(u"Törkylempijävongahdus", 'utf_16_be'),
(u"Törkylempijävongahdus", 'latin1'),
(u"В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!", 'koi8_r'),
(u"テストテキスト", 'shift_jis'),
])
@pytest.mark.parametrize('with_chardet', (False, True))
def test_text_response_esoteric(self, mocker, encoding, text, with_chardet):
"""
Check `.apparent_encoding` and `.text` on bodies sent in the
parametrized encodings.

With a detection library available both must produce a value; with
`requests.models.chardet` patched to None both must raise
`ContentDecodingError`.
"""
if not with_chardet:
# Replace the detection library seen by `requests.models` with None.
mocker.patch('requests.models.chardet', new=None)
else:
# This branch needs a real detection library; skip if
# `requests.compat` did not provide one.
from requests.compat import chardet
if not chardet:
pytest.skip("chardet not available")
response_bytes = text.encode(encoding)
# Content-Length counts bytes of the encoded body.
response_length = len(response_bytes)
response_header = (
"HTTP/1.1 200 OK\r\n"
"Content-Length: {}\r\n"
"\r\n"
).format(response_length).encode()
# Header block followed by the body encoded with the parametrized codec.
server = Server.response_server(response_header + response_bytes)

with server as (host, port):
r = requests.get('http://{}:{}'.format(host, port))
assert r.status_code == 200
assert r.headers['Content-Length'] == str(response_length)
assert not r.encoding
if with_chardet:
assert r.apparent_encoding
assert r.text
# We can't assert that `r.text == text`, because it simply might not be
# correctly decoded by either chardet library.
else:
with pytest.raises(requests.exceptions.ContentDecodingError):
assert r.text
with pytest.raises(requests.exceptions.ContentDecodingError):
assert r.apparent_encoding

def test_basic_response(self):
"""the basic response server returns an empty http response"""
with Server.basic_response_server() as (host, port):
Expand Down
15 changes: 11 additions & 4 deletions tests/testserver/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,22 @@ def __init__(self, handler=None, host='localhost', port=0, requests_to_handle=1,
self.stop_event = threading.Event()

@classmethod
def text_response_server(cls, text, request_timeout=0.5, **kwargs):
def text_response_handler(sock):
def response_server(cls, response, request_timeout=0.5, **kwargs):
def response_handler(sock):
request_content = consume_socket_content(sock, timeout=request_timeout)
sock.send(text.encode('utf-8'))
sock.send(response)

return request_content

return Server(response_handler, **kwargs)

return Server(text_response_handler, **kwargs)
@classmethod
def text_response_server(cls, text, request_timeout=0.5, **kwargs):
    """Build a server that answers with ``text`` encoded as UTF-8.

    Delegates to ``response_server``; ``request_timeout`` and any extra
    keyword arguments are forwarded unchanged.
    """
    payload = text.encode('utf-8')
    return cls.response_server(response=payload, request_timeout=request_timeout, **kwargs)

@classmethod
def basic_response_server(cls, **kwargs):
Expand Down
14 changes: 11 additions & 3 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
[tox]
envlist = py{27,36,37,38,39}-{default,use_chardet_on_py3}
envlist =
py27-{chardet,default}
py{36,37,38,39}-{default,chardet,charset_normalizer}

[testenv]
deps = -rrequirements-dev.txt
Expand All @@ -11,8 +13,14 @@ commands =

[testenv:default]

[testenv:use_chardet_on_py3]
[testenv:chardet]
extras =
security
socks
use_chardet_on_py3
chardet

[testenv:charset_normalizer]
extras =
security
socks
charset_normalizer