From 0712b821f1a677afa0e07bcaa76db4a58c19afaa Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 15 Jul 2021 14:09:01 +0300 Subject: [PATCH 1/4] Move chardet/charset_normalizer imports to .compat and remove version checks --- requests/__init__.py | 31 +++---------------------------- requests/compat.py | 4 ++++ requests/packages.py | 17 ++++++----------- 3 files changed, 13 insertions(+), 39 deletions(-) diff --git a/requests/__init__.py b/requests/__init__.py index 0ac7713b81..737da64a2f 100644 --- a/requests/__init__.py +++ b/requests/__init__.py @@ -44,17 +44,7 @@ import warnings from .exceptions import RequestsDependencyWarning -try: - from charset_normalizer import __version__ as charset_normalizer_version -except ImportError: - charset_normalizer_version = None - -try: - from chardet import __version__ as chardet_version -except ImportError: - chardet_version = None - -def check_compatibility(urllib3_version, chardet_version, charset_normalizer_version): +def check_compatibility(urllib3_version): urllib3_version = urllib3_version.split('.') assert urllib3_version != ['dev'] # Verify urllib3 isn't installed from git. @@ -70,20 +60,6 @@ def check_compatibility(urllib3_version, chardet_version, charset_normalizer_ver assert minor >= 21 assert minor <= 26 - # Check charset_normalizer for compatibility. - if chardet_version: - major, minor, patch = chardet_version.split('.')[:3] - major, minor, patch = int(major), int(minor), int(patch) - # chardet_version >= 3.0.2, < 5.0.0 - assert (3, 0, 2) <= (major, minor, patch) < (5, 0, 0) - elif charset_normalizer_version: - major, minor, patch = charset_normalizer_version.split('.')[:3] - major, minor, patch = int(major), int(minor), int(patch) - # charset_normalizer >= 2.0.0 < 3.0.0 - assert (2, 0, 0) <= (major, minor, patch) < (3, 0, 0) - else: - raise Exception("You need either charset_normalizer or chardet installed") - def _check_cryptography(cryptography_version): # cryptography < 1.3.4 try: @@ -97,10 +73,9 @@ def _check_cryptography(cryptography_version): # Check imported dependencies for compatibility. try: - check_compatibility(urllib3.__version__, chardet_version, charset_normalizer_version) + check_compatibility(urllib3.__version__) except (AssertionError, ValueError): - warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported " - "version!".format(urllib3.__version__, chardet_version, charset_normalizer_version), + warnings.warn("urllib3 ({}) doesn't match a supported version!".format(urllib3.__version__), RequestsDependencyWarning) # Attempt to enable urllib3's fallback for SNI support diff --git a/requests/compat.py b/requests/compat.py index 0b14f5015c..ee28551b8f 100644 --- a/requests/compat.py +++ b/requests/compat.py @@ -12,6 +12,10 @@ import chardet except ImportError: import charset_normalizer as chardet + import warnings + + warnings.filterwarnings('ignore', 'Trying to detect', module='charset_normalizer') + import sys diff --git a/requests/packages.py b/requests/packages.py index 00196bff25..9d7b02cca3 100644 --- a/requests/packages.py +++ b/requests/packages.py @@ -1,12 +1,6 @@ import sys -try: - import chardet -except ImportError: - import charset_normalizer as chardet - import warnings - - warnings.filterwarnings('ignore', 'Trying to detect', module='charset_normalizer') +from requests.compat import chardet # This code exists for backwards compatibility reasons. # I don't like it either. Just look the other way. :) @@ -19,8 +13,9 @@ if mod == package or mod.startswith(package + '.'): sys.modules['requests.packages.' + mod] = sys.modules[mod] -target = chardet.__name__ -for mod in list(sys.modules): - if mod == target or mod.startswith(target + '.'): - sys.modules['requests.packages.' + target.replace(target, 'chardet')] = sys.modules[mod] +if chardet: + target = chardet.__name__ + for mod in list(sys.modules): + if mod == target or mod.startswith(target + '.'): + sys.modules['requests.packages.' + target.replace(target, 'chardet')] = sys.modules[mod] # Kinda cool, though, right? From cf3fbf0fff7066e97d23814c03cfe67f2e4be7f9 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 15 Jul 2021 14:18:30 +0300 Subject: [PATCH 2/4] Make chardet/charset_normalizer an optional dependency See #5871 --- requests/compat.py | 9 ++++++--- requests/models.py | 22 ++++++++++++++++++++-- tests/test_testserver.py | 26 ++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/requests/compat.py b/requests/compat.py index ee28551b8f..882e7abbd4 100644 --- a/requests/compat.py +++ b/requests/compat.py @@ -11,10 +11,13 @@ try: import chardet except ImportError: - import charset_normalizer as chardet - import warnings + try: + import charset_normalizer as chardet + import warnings - warnings.filterwarnings('ignore', 'Trying to detect', module='charset_normalizer') + warnings.filterwarnings('ignore', 'Trying to detect', module='charset_normalizer') + except ImportError: + chardet = None import sys diff --git a/requests/models.py b/requests/models.py index aa6fb86e4e..aaf26fdcf2 100644 --- a/requests/models.py +++ b/requests/models.py @@ -732,7 +732,17 @@ def next(self): @property def apparent_encoding(self): """The apparent encoding, provided by the charset_normalizer or chardet libraries.""" - return chardet.detect(self.content)['encoding'] + # If chardet/charset_normalizer is available, use it. + if chardet: + return chardet.detect(self.content)['encoding'] + # Fall back to trying simpler, dumber means. + for encoding in ("ascii", "utf-8"): + try: + self.content.decode(encoding, "strict") + return encoding + except UnicodeDecodeError: + pass + raise ContentDecodingError("Unable to detect response encoding") def iter_content(self, chunk_size=1, decode_unicode=False): """Iterates over the response data. When stream=True is set on the @@ -862,7 +872,15 @@ def text(self): # Fallback to auto-detected encoding. if self.encoding is None: - encoding = self.apparent_encoding + try: + encoding = self.apparent_encoding + except ContentDecodingError: + raise ContentDecodingError( + "Unable to automatically detect the response's encoding. " + "If you know the response's encoding, you can set it manually (`.encoding`), or " + "install either the `chardet` or `charset_normalizer` library to make automatic " + "detection smarter." + ) # Decode unicode from given encoding. try: diff --git a/tests/test_testserver.py b/tests/test_testserver.py index aac529261b..cd60a44d6e 100644 --- a/tests/test_testserver.py +++ b/tests/test_testserver.py @@ -54,8 +54,34 @@ def test_text_response(self): assert r.status_code == 200 assert r.text == u'roflol' + assert not r.encoding + assert r.apparent_encoding == 'ascii' assert r.headers['Content-Length'] == '6' + def test_text_response_utf_8(self, mocker): + """ + test `.apparent_encoding` is able to infer UTF-8 + """ + mocker.patch('requests.models.chardet', new=None) + response_unicode = u"Törkylempijävongahdus" + response_length = len(response_unicode.encode("utf-8")) + # `text_response_server` takes care of encoding to UTF-8 internally + server = Server.text_response_server(( + u"HTTP/1.1 200 OK\r\n" + "Content-Length: {}\r\n" + "\r\n" + "{}" + ).format(response_length, response_unicode)) + + with server as (host, port): + r = requests.get('http://{}:{}'.format(host, port)) + + assert r.status_code == 200 + assert r.text == response_unicode + assert not r.encoding + assert r.apparent_encoding == 'utf-8' + assert r.headers['Content-Length'] == str(response_length) + def test_basic_response(self): """the basic response server returns an empty http response""" with Server.basic_response_server() as (host, port): From 03ee56953ef772bad766da17b084e347c2c9db40 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 15 Jul 2021 14:10:02 +0300 Subject: [PATCH 3/4] Make chardet/charset_normalizer extras, not required dependencies --- setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index ce5e5c8009..011038547e 100755 --- a/setup.py +++ b/setup.py @@ -41,8 +41,6 @@ def run_tests(self): packages = ['requests'] requires = [ - 'charset_normalizer~=2.0.0; python_version >= "3"', - 'chardet>=3.0.2,<5; python_version < "3"', 'idna>=2.5,<3; python_version < "3"', 'idna>=2.5,<4; python_version >= "3"', 'urllib3>=1.21.1,<1.27', @@ -104,7 +102,8 @@ def run_tests(self): 'security': [], 'socks': ['PySocks>=1.5.6, !=1.5.7'], 'socks:sys_platform == "win32" and python_version == "2.7"': ['win_inet_pton'], - 'use_chardet_on_py3': ['chardet>=3.0.2,<5'] + 'chardet': ['chardet>=3.0.2,<5'], + 'charset_normalizer': ['charset_normalizer~=2.0.0; python_version >= "3"'], }, project_urls={ 'Documentation': 'https://requests.readthedocs.io', From 2d46ddbcf10e214fce925fe6fcf9cccbada976aa Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 15 Jul 2021 14:55:40 +0300 Subject: [PATCH 4/4] Add tests to check against false positives for more esoteric encodings --- tests/test_testserver.py | 43 ++++++++++++++++++++++++++++++++++++++ tests/testserver/server.py | 15 +++++++++---- tox.ini | 14 ++++++++++--- 3 files changed, 65 insertions(+), 7 deletions(-) diff --git a/tests/test_testserver.py b/tests/test_testserver.py index cd60a44d6e..ba39153dec 100644 --- a/tests/test_testserver.py +++ b/tests/test_testserver.py @@ -82,6 +82,49 @@ def test_text_response_utf_8(self, mocker): assert r.apparent_encoding == 'utf-8' assert r.headers['Content-Length'] == str(response_length) + @pytest.mark.parametrize('text, encoding', [ + (u"Törkylempijävongahdus", 'utf_16_le'), + (u"Törkylempijävongahdus", 'utf_16_be'), + (u"Törkylempijävongahdus", 'latin1'), + (u"В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!", 'koi8_r'), + (u"テストテキスト", 'shift_jis'), + ]) + @pytest.mark.parametrize('with_chardet', (False, True)) + def test_text_response_esoteric(self, mocker, encoding, text, with_chardet): + """ + test `.apparent_encoding` croaks on a more esoteric encoding when chardet is not available + """ + if not with_chardet: + mocker.patch('requests.models.chardet', new=None) + else: + from requests.compat import chardet + if not chardet: + pytest.skip("chardet not available") + response_bytes = text.encode(encoding) + response_length = len(response_bytes) + response_header = ( + "HTTP/1.1 200 OK\r\n" + "Content-Length: {}\r\n" + "\r\n" + ).format(response_length).encode() + server = Server.response_server(response_header + response_bytes) + + with server as (host, port): + r = requests.get('http://{}:{}'.format(host, port)) + assert r.status_code == 200 + assert r.headers['Content-Length'] == str(response_length) + assert not r.encoding + if with_chardet: + assert r.apparent_encoding + assert r.text + # We can't assert that `r.text == text`, because it simply might not be + # correctly decoded by either chardet library. + else: + with pytest.raises(requests.exceptions.ContentDecodingError): + assert r.text + with pytest.raises(requests.exceptions.ContentDecodingError): + assert r.apparent_encoding + def test_basic_response(self): """the basic response server returns an empty http response""" with Server.basic_response_server() as (host, port): diff --git a/tests/testserver/server.py b/tests/testserver/server.py index 132221f7c4..684b5c23da 100644 --- a/tests/testserver/server.py +++ b/tests/testserver/server.py @@ -42,15 +42,22 @@ def __init__(self, handler=None, host='localhost', port=0, requests_to_handle=1, self.stop_event = threading.Event() @classmethod - def text_response_server(cls, text, request_timeout=0.5, **kwargs): - def text_response_handler(sock): + def response_server(cls, response, request_timeout=0.5, **kwargs): + def response_handler(sock): request_content = consume_socket_content(sock, timeout=request_timeout) - sock.send(text.encode('utf-8')) + sock.send(response) return request_content + return Server(response_handler, **kwargs) - return Server(text_response_handler, **kwargs) + @classmethod + def text_response_server(cls, text, request_timeout=0.5, **kwargs): + return cls.response_server( + response=text.encode('utf-8'), + request_timeout=request_timeout, + **kwargs + ) @classmethod def basic_response_server(cls, **kwargs): diff --git a/tox.ini b/tox.ini index 5e3d53774e..30861a08ae 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,7 @@ [tox] -envlist = py{27,36,37,38,39}-{default,use_chardet_on_py3} +envlist = + py27-{chardet,default} + py{36,37,38,39}-{default,chardet,charset_normalizer} [testenv] deps = -rrequirements-dev.txt @@ -11,8 +13,14 @@ commands = [testenv:default] -[testenv:use_chardet_on_py3] +[testenv:chardet] extras = security socks - use_chardet_on_py3 + chardet + +[testenv:charset_normalizer] +extras = + security + socks + charset_normalizer