Drop chardet #1269

Merged (12 commits) on Sep 15, 2020
17 changes: 15 additions & 2 deletions docs/quickstart.md
@@ -65,14 +65,27 @@ HTTPX will automatically handle decoding the response content into Unicode text.
 '<!doctype html>\n<html>\n<head>\n<title>Example Domain</title>...'
 ```

-You can inspect what encoding has been used to decode the response.
+You can inspect what encoding will be used to decode the response.

 ```pycon
 >>> r.encoding
 'UTF-8'
 ```

-If you need to override the standard behavior and explicitly set the encoding to
+In some cases the response may not contain an explicit encoding, in which case HTTPX
+will attempt to automatically determine an encoding to use. This defaults to
+UTF-8, but also includes robust fallback behaviour for handling ascii,
+iso-8859-1 and windows 1252 encodings.
+
+```pycon
+>>> r.encoding
+None
+>>> r.text
+'<!doctype html>\n<html>\n<head>\n<title>Example Domain</title>...'
+```
+
+If you need to override the standard behaviour and explicitly set the encoding to
 use, then you can do that too.
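To illustrate the fallback described above, here is a minimal sketch using the `httpx.Response` construction style from this PR's tests (the byte values are illustrative): a body with no declared charset that is not valid UTF-8 gets decoded via the Windows 1252 fallback.

```python
import httpx

# No charset is declared, and the body is not valid UTF-8:
# 0x80 is the euro sign in Windows 1252 but an invalid start byte in UTF-8.
response = httpx.Response(
    200,
    content="Euro Currency: €".encode("cp1252"),
    headers={"Content-Type": "text/plain"},
)

print(response.encoding)  # None, since no charset was declared
print(response.text)      # 'Euro Currency: €', decoded via the cp1252 fallback
```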
94 changes: 41 additions & 53 deletions httpx/_decoders.py
@@ -7,8 +7,6 @@
 import typing
 import zlib

-import chardet
-
 try:
     import brotli
 except ImportError:  # pragma: nocover
@@ -163,62 +161,52 @@ class TextDecoder:
"""

def __init__(self, encoding: typing.Optional[str] = None):
self.decoder: typing.Optional[codecs.IncrementalDecoder] = (
None if encoding is None else codecs.getincrementaldecoder(encoding)()
)
self.detector = chardet.universaldetector.UniversalDetector()

# This buffer is only needed if 'decoder' is 'None'
# we want to trigger errors if data is getting added to
# our internal buffer for some silly reason while
# a decoder is discovered.
self.buffer: typing.Optional[bytearray] = None if self.decoder else bytearray()
self.decoder: typing.Optional[codecs.IncrementalDecoder] = None
if encoding is not None:
self.decoder = codecs.getincrementaldecoder(encoding)(errors="strict")

def decode(self, data: bytes) -> str:
try:
if self.decoder is not None:
text = self.decoder.decode(data)
else:
assert self.buffer is not None
text = ""
self.detector.feed(data)
self.buffer += data

# Should be more than enough data to process, we don't
# want to buffer too long as chardet will wait until
# detector.close() is used to give back common
# encodings like 'utf-8'.
if len(self.buffer) >= 4096:
self.decoder = codecs.getincrementaldecoder(
self._detector_result()
)()
text = self.decoder.decode(bytes(self.buffer), False)
self.buffer = None

return text
except UnicodeDecodeError as exc: # pragma: nocover
raise ValueError(str(exc))
"""
If an encoding is explicitly specified, then we use that.
Otherwise our strategy is to attempt UTF-8, and fallback to Windows 1252.

def flush(self) -> str:
try:
if self.decoder is None:
# Empty string case as chardet is guaranteed to not have a guess.
assert self.buffer is not None
if len(self.buffer) == 0:
return ""
return bytes(self.buffer).decode(self._detector_result())

return self.decoder.decode(b"", True)
except UnicodeDecodeError as exc: # pragma: nocover
raise ValueError(str(exc))
Note that UTF-8 is a strict superset of ascii, and Windows 1252 is a
superset of the non-control characters in iso-8859-1, so we essentially
end up supporting any of ascii, utf-8, iso-8859-1, cp1252.

def _detector_result(self) -> str:
self.detector.close()
result = self.detector.result["encoding"]
if not result: # pragma: nocover
raise ValueError("Unable to determine encoding of content")
Given that UTF-8 is now by *far* the most widely used encoding, this
should be a pretty robust strategy for cases where a charset has
not been explicitly included.

return result
Useful stats on the prevalence of different charsets in the wild...

* https://w3techs.com/technologies/overview/character_encoding
* https://w3techs.com/technologies/history_overview/character_encoding

The HTML5 spec also has some useful guidelines, suggesting defaults of
either UTF-8 or Windows 1252 in most cases...

* https://dev.w3.org/html5/spec-LC/Overview.html
"""
if self.decoder is None:
# If this is the first decode pass then we need to determine which
# encoding to use by attempting UTF-8 and raising any decode errors.
attempt_utf_8 = codecs.getincrementaldecoder("utf-8")(errors="strict")
try:
attempt_utf_8.decode(data)
except UnicodeDecodeError:
# Could not decode as UTF-8. Use Windows 1252.
self.decoder = codecs.getincrementaldecoder("cp1252")(errors="replace")
else:
# Can decode as UTF-8. Use UTF-8 with lenient error settings.
self.decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
Review comment from @StephenBrown2 (Contributor), Sep 9, 2020:

    Why `("utf-8")(errors="replace")` here if it passed with `("utf-8")(errors="strict")` to get here?

Reply from the PR author:

    We need strict to raise an error if it doesn't appear to decode as UTF-8, but once we've made the decision we use `errors="replace"` for the most robust behaviour possible. So e.g. if we've got a streaming response that initially appears to be UTF-8, but later has some non-UTF-8 bytes, then we're not raising a hard error on accessing `.text`.

    (We'd like it to have a failure mode that is as graceful as possible.)
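A minimal stdlib sketch of the point made in this exchange (the byte values are hypothetical): strict decoding is only used to choose the codec, after which `errors="replace"` keeps later invalid bytes from raising.

```python
import codecs

# Probe the first chunk strictly; a UnicodeDecodeError here would mean "not UTF-8".
probe = codecs.getincrementaldecoder("utf-8")(errors="strict")
probe.decode(b"looks like utf-8 so far")  # no error, so we commit to UTF-8

# From now on, decode leniently so later garbage cannot raise on .text access.
decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
print(decoder.decode(b"looks like utf-8 so far"))  # 'looks like utf-8 so far'
print(decoder.decode(b"then: \xff\xfe"))           # 'then: \ufffd\ufffd'
```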


+        return self.decoder.decode(data)

+    def flush(self) -> str:
+        if self.decoder is None:
+            return ""
+        return self.decoder.decode(b"", True)


 class LineDecoder:
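Putting the two halves together, a sketch of how the new `TextDecoder` behaves when driven chunk by chunk, the way `iter_text()` drives it (the sample bytes are illustrative, and the import is from a private module, shown only for demonstration):

```python
from httpx._decoders import TextDecoder

# cp1252 bytes, arriving in chunks, with no encoding declared up front.
chunks = [b"Euro Currency: \x80", b" ...and some more text"]

# The first chunk fails the strict UTF-8 probe, so cp1252 is selected,
# and every subsequent chunk is decoded with the same committed codec.
decoder = TextDecoder(encoding=None)
text = "".join([decoder.decode(chunk) for chunk in chunks] + [decoder.flush()])
print(text)  # 'Euro Currency: € ...and some more text'
```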
44 changes: 16 additions & 28 deletions httpx/_models.py
@@ -10,7 +10,6 @@
 from http.cookiejar import Cookie, CookieJar
 from urllib.parse import parse_qsl, quote, unquote, urlencode

-import chardet
 import rfc3986
 import rfc3986.exceptions

@@ -755,19 +754,22 @@ def text(self) -> str:
         if not content:
             self._text = ""
         else:
-            encoding = self.encoding
-            self._text = content.decode(encoding, errors="replace")
+            decoder = TextDecoder(encoding=self.encoding)
+            self._text = "".join([decoder.decode(self.content), decoder.flush()])
         return self._text

     @property
-    def encoding(self) -> str:
+    def encoding(self) -> typing.Optional[str]:
         """
         Return the encoding, which may have been set explicitly, or may have
         been specified by the Content-Type header.
         """
         if not hasattr(self, "_encoding"):
             encoding = self.charset_encoding
             if encoding is None or not is_known_encoding(encoding):
-                encoding = self.apparent_encoding
-                if encoding is None or not is_known_encoding(encoding):
-                    encoding = "utf-8"
-            self._encoding = encoding
+                self._encoding = None
+            else:
+                self._encoding = encoding
         return self._encoding

     @encoding.setter
@@ -783,25 +785,11 @@ def charset_encoding(self) -> typing.Optional[str]:
         if content_type is None:
             return None

-        parsed = cgi.parse_header(content_type)
-        media_type, params = parsed[0], parsed[-1]
-        if "charset" in params:
-            return params["charset"].strip("'\"")
-
-        # RFC 2616 specifies that 'iso-8859-1' should be used as the default
-        # for 'text/*' media types, if no charset is provided.
-        # See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-        if media_type.startswith("text/"):
-            return "iso-8859-1"
-
-        return None
+        _, params = cgi.parse_header(content_type)
+        if "charset" not in params:
+            return None

-    @property
-    def apparent_encoding(self) -> typing.Optional[str]:
-        """
-        Return the encoding, as it appears to autodetection.
-        """
-        return chardet.detect(self.content)["encoding"]
+        return params["charset"].strip("'\"")

     def _get_content_decoder(self) -> ContentDecoder:
         """
@@ -936,7 +924,7 @@ def iter_text(self) -> typing.Iterator[str]:
     that handles both gzip, deflate, etc but also detects the content's
     string encoding.
     """
-        decoder = TextDecoder(encoding=self.charset_encoding)
+        decoder = TextDecoder(encoding=self.encoding)
         with self._wrap_decoder_errors():
             for chunk in self.iter_bytes():
                 yield decoder.decode(chunk)
@@ -1020,7 +1008,7 @@ async def aiter_text(self) -> typing.AsyncIterator[str]:
     that handles both gzip, deflate, etc but also detects the content's
     string encoding.
     """
-        decoder = TextDecoder(encoding=self.charset_encoding)
+        decoder = TextDecoder(encoding=self.encoding)
         with self._wrap_decoder_errors():
             async for chunk in self.aiter_bytes():
                 yield decoder.decode(chunk)
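The simplified `charset_encoding` above leans on the stdlib's `cgi.parse_header`; for reference, its behaviour looks like this (the header values are illustrative):

```python
import cgi

# parse_header() splits a header value into the media type and its parameters.
media_type, params = cgi.parse_header("text/html; charset='ISO-8859-1'")
print(media_type)                      # 'text/html'
print(params["charset"].strip("'\""))  # 'ISO-8859-1', once stray quotes are stripped

# With no charset parameter the property now returns None, rather than
# defaulting 'text/*' responses to iso-8859-1 as the old RFC 2616 branch did.
_, params = cgi.parse_header("text/plain")
print("charset" in params)  # False
```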
1 change: 0 additions & 1 deletion setup.py
@@ -57,7 +57,6 @@ def get_packages(package):
     install_requires=[
         "certifi",
         "sniffio",
-        "chardet==3.*",
         "rfc3986[idna2008]>=1.3,<2",
         "httpcore==0.10.*",
     ],
2 changes: 1 addition & 1 deletion tests/client/test_client.py
@@ -15,7 +15,7 @@ def test_get(server):
     assert response.content == b"Hello, world!"
     assert response.text == "Hello, world!"
     assert response.http_version == "HTTP/1.1"
-    assert response.encoding == "iso-8859-1"
+    assert response.encoding is None
     assert response.request.url == url
     assert response.headers
     assert response.is_redirect is False
70 changes: 53 additions & 17 deletions tests/models/test_responses.py
@@ -81,36 +81,36 @@ def test_response_content_type_encoding():

 def test_response_autodetect_encoding():
     """
-    Autodetect encoding if there is no charset info in a Content-Type header.
+    Autodetect encoding if there is no Content-Type header.
     """
-    content = "おはようございます。".encode("EUC-JP")
+    content = "おはようございます。".encode("utf-8")
     response = httpx.Response(
         200,
         content=content,
     )
     assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
+    assert response.encoding is None


 def test_response_fallback_to_autodetect():
     """
     Fallback to autodetection if we get an invalid charset in the Content-Type header.
     """
     headers = {"Content-Type": "text-plain; charset=invalid-codec-name"}
-    content = "おはようございます。".encode("EUC-JP")
+    content = "おはようございます。".encode("utf-8")
     response = httpx.Response(
         200,
         content=content,
         headers=headers,
     )
     assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
+    assert response.encoding is None


-def test_response_default_text_encoding():
+def test_response_no_charset_with_ascii_content():
     """
-    A media type of 'text/*' with no charset should default to ISO-8859-1.
-    See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+    A response with ascii encoded content should decode correctly,
+    even with no charset specified.
     """
     content = b"Hello, world!"
     headers = {"Content-Type": "text/plain"}
@@ -120,20 +120,56 @@ def test_response_default_text_encoding():
         headers=headers,
     )
     assert response.status_code == 200
-    assert response.encoding == "iso-8859-1"
+    assert response.encoding is None
     assert response.text == "Hello, world!"


-def test_response_default_encoding():
+def test_response_no_charset_with_utf8_content():
     """
-    Default to utf-8 if all else fails.
+    A response with UTF-8 encoded content should decode correctly,
+    even with no charset specified.
     """
+    content = "Unicode Snowman: ☃".encode("utf-8")
+    headers = {"Content-Type": "text/plain"}
     response = httpx.Response(
         200,
-        content=b"",
+        content=content,
+        headers=headers,
     )
-    assert response.text == ""
-    assert response.encoding == "utf-8"
+    assert response.text == "Unicode Snowman: ☃"
+    assert response.encoding is None
+
+
+def test_response_no_charset_with_iso_8859_1_content():
+    """
+    A response with ISO 8859-1 encoded content should decode correctly,
+    even with no charset specified.
+    """
+    content = "Accented: Österreich".encode("iso-8859-1")
+    headers = {"Content-Type": "text/plain"}
+    response = httpx.Response(
+        200,
+        content=content,
+        headers=headers,
+    )
+    assert response.text == "Accented: Österreich"
+    assert response.encoding is None
+
+
+def test_response_no_charset_with_cp_1252_content():
+    """
+    A response with Windows 1252 encoded content should decode correctly,
+    even with no charset specified.
+    """
+    content = "Euro Currency: €".encode("cp1252")
+    headers = {"Content-Type": "text/plain"}
+    response = httpx.Response(
+        200,
+        content=content,
+        headers=headers,
+    )
+    assert response.text == "Euro Currency: €"
+    assert response.encoding is None


 def test_response_non_text_encoding():
@@ -147,7 +183,7 @@ def test_response_non_text_encoding():
         headers=headers,
     )
     assert response.text == "xyz"
-    assert response.encoding == "ascii"
+    assert response.encoding is None


 def test_response_set_explicit_encoding():
@@ -184,7 +220,7 @@ def test_read():

     assert response.status_code == 200
     assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
     assert response.is_closed

     content = response.read()
Expand All @@ -203,7 +239,7 @@ async def test_aread():

     assert response.status_code == 200
     assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
     assert response.is_closed

     content = await response.aread()
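For encodings outside the UTF-8/cp1252 fallback, such as the EUC-JP content these tests previously exercised via chardet, the explicit override path (see `test_response_set_explicit_encoding` above) remains available. A small sketch of that usage:

```python
import httpx

# EUC-JP cannot be identified by the UTF-8-then-cp1252 strategy, so without
# chardet the caller now declares it explicitly before accessing .text.
content = "おはようございます。".encode("EUC-JP")
response = httpx.Response(200, content=content)

# Setting .encoding before the first .text access forces the decoder choice.
response.encoding = "EUC-JP"
print(response.text)  # 'おはようございます。'
```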
12 changes: 2 additions & 10 deletions tests/test_decoders.py
@@ -177,16 +177,8 @@ def test_decoding_errors(header_value):
     [
         ((b"Hello,", b" world!"), "ascii"),
         ((b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"), "utf-8"),
-        ((b"\x83g\x83\x89\x83x\x83\x8b",) * 64, "shift-jis"),
-        ((b"\x83g\x83\x89\x83x\x83\x8b",) * 600, "shift-jis"),
-        (
-            (b"\xcb\xee\xf0\xe5\xec \xe8\xef\xf1\xf3\xec \xe4\xee\xeb\xee\xf0",) * 64,
-            "MacCyrillic",
-        ),
-        (
-            (b"\xa5\xa6\xa5\xa7\xa5\xd6\xa4\xce\xb9\xf1\xba\xdd\xb2\xbd",) * 512,
-            "euc-jp",
-        ),
+        ((b"Euro character: \x88!", b""), "cp1252"),
+        ((b"Accented: \xd6sterreich", b""), "iso-8859-1"),
     ],
 )
 @pytest.mark.asyncio
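The utf-8 case above deliberately splits multibyte characters across chunk boundaries; incremental decoders buffer the partial sequence rather than corrupting it, which is why `TextDecoder` is built on `codecs.getincrementaldecoder`. A quick stdlib sketch using the same bytes:

```python
import codecs

# 'トラ' is six UTF-8 bytes; here the first character is split across two chunks.
chunks = [b"\xe3\x83", b"\x88\xe3\x83\xa9"]

decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
parts = [decoder.decode(chunk) for chunk in chunks]
parts.append(decoder.decode(b"", True))  # final=True flushes any buffered bytes

print(parts)  # ['', 'トラ', ''] -- the split lead bytes were buffered, not mangled
```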