Skip to content

Commit

Permalink
Detect content encoding if invalid charset was specified (#2549)
Browse files Browse the repository at this point in the history
* Add the processing of invalid charsets while detecting content encoding

* Make the `aiohttp.ClientResponse.get_encoding` method public

* Add docs

* Fix tests

* Fix change description

* Update client_reference.rst
  • Loading branch information
decaz authored and asvetlov committed Nov 23, 2017
1 parent e180b12 commit 67eb1e7
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 8 deletions.
2 changes: 2 additions & 0 deletions CHANGES/2549.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Make the `aiohttp.ClientResponse.get_encoding` method public and
handle an invalid charset while detecting the content encoding.
1 change: 1 addition & 0 deletions CONTRIBUTORS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ Ludovic Gasc
Lukasz Marcin Dobrzanski
Makc Belousow
Manuel Miranda
Marat Sharafutdinov
Marco Paolini
Mariano Anaya
Martin Melka
Expand Down
12 changes: 9 additions & 3 deletions aiohttp/client_reqrep.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio
import codecs
import collections
import io
import json
Expand Down Expand Up @@ -756,11 +757,16 @@ async def read(self):

return self._content

def _get_encoding(self):
def get_encoding(self):
ctype = self.headers.get(hdrs.CONTENT_TYPE, '').lower()
mimetype = helpers.parse_mimetype(ctype)

encoding = mimetype.parameters.get('charset')
if encoding:
try:
codecs.lookup(encoding)
except LookupError:
encoding = None
if not encoding:
if mimetype.type == 'application' and mimetype.subtype == 'json':
# RFC 7159 states that the default encoding is UTF-8.
Expand All @@ -778,7 +784,7 @@ async def text(self, encoding=None, errors='strict'):
await self.read()

if encoding is None:
encoding = self._get_encoding()
encoding = self.get_encoding()

return self._content.decode(encoding, errors=errors)

Expand All @@ -803,7 +809,7 @@ async def json(self, *, encoding=None, loads=json.loads,
return None

if encoding is None:
encoding = self._get_encoding()
encoding = self.get_encoding()

return loads(stripped.decode(encoding))

Expand Down
8 changes: 8 additions & 0 deletions docs/client_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1162,6 +1162,14 @@ Response object
A namedtuple with request URL and headers from :class:`ClientRequest`
object, :class:`aiohttp.RequestInfo` instance.

.. method:: get_encoding()

Automatically detect content encoding using ``charset`` info in
``Content-Type`` HTTP header. If this info does not exist or there
is no appropriate codec for the encoding, then :term:`cchardet` /
:term:`chardet` is used.

.. versionadded:: 3.0


ClientWebSocketResponse
Expand Down
30 changes: 25 additions & 5 deletions tests/test_client_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,12 +256,12 @@ def side_effect(*args, **kwargs):
'Content-Type': 'application/json'}
content = response.content = mock.Mock()
content.read.side_effect = side_effect
response._get_encoding = mock.Mock()
response.get_encoding = mock.Mock()

res = await response.text(encoding='cp1251')
assert res == '{"тест": "пройден"}'
assert response._connection is None
assert not response._get_encoding.called
assert not response.get_encoding.called


async def test_text_detect_encoding(loop, session):
Expand All @@ -283,6 +283,26 @@ def side_effect(*args, **kwargs):
assert response._connection is None


async def test_text_detect_encoding_if_invalid_charset(loop, session):
# The Content-Type header below names a charset that is not a real codec;
# text() and get_encoding() must then fall back to detecting the
# encoding instead of failing on the unknown name.
response = ClientResponse('get', URL('http://def-cl-resp.org'))
response._post_init(loop, session)

def side_effect(*args, **kwargs):
# Stand-in for content.read(): an already-resolved future carrying
# the cp1251-encoded body.
fut = loop.create_future()
fut.set_result('{"тест": "пройден"}'.encode('cp1251'))
return fut

response.headers = {'Content-Type': 'text/plain;charset=invalid'}
content = response.content = mock.Mock()
content.read.side_effect = side_effect

await response.read()
res = await response.text()
assert res == '{"тест": "пройден"}'
assert response._connection is None
# The reported encoding is compared case-insensitively.
assert response.get_encoding().lower() == 'windows-1251'


async def test_text_after_read(loop, session):
response = ClientResponse('get', URL('http://def-cl-resp.org'))
response._post_init(loop, session)
Expand Down Expand Up @@ -372,12 +392,12 @@ def side_effect(*args, **kwargs):
'Content-Type': 'application/json;charset=utf8'}
content = response.content = mock.Mock()
content.read.side_effect = side_effect
response._get_encoding = mock.Mock()
response.get_encoding = mock.Mock()

res = await response.json(encoding='cp1251')
assert res == {'тест': 'пройден'}
assert response._connection is None
assert not response._get_encoding.called
assert not response.get_encoding.called


@pytest.mark.xfail
Expand All @@ -398,7 +418,7 @@ def test_get_encoding_unknown(loop, session):
response.headers = {'Content-Type': 'application/json'}
with mock.patch('aiohttp.client_reqrep.chardet') as m_chardet:
m_chardet.detect.return_value = {'encoding': None}
assert response._get_encoding() == 'utf-8'
assert response.get_encoding() == 'utf-8'


def test_raise_for_status_2xx():
Expand Down

0 comments on commit 67eb1e7

Please sign in to comment.