From 1cf118b69f626fae390bfe2b16c8bbd066c64d87 Mon Sep 17 00:00:00 2001 From: Georges Toth Date: Fri, 28 Oct 2022 16:56:04 +0200 Subject: [PATCH 1/2] Fix parsing bad message-id formats #79; fixes #79 --- CHANGELOG.md | 4 ++++ eml_parser/eml_parser.py | 17 ++++++++++++++++- samples/sample_gh_issue_79_1.eml | 5 +++++ samples/sample_gh_issue_79_2.eml | 6 ++++++ samples/sample_gh_issue_79_3.eml | 5 +++++ tests/test_emlparser.py | 22 ++++++++++++++++++++++ 6 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 samples/sample_gh_issue_79_1.eml create mode 100644 samples/sample_gh_issue_79_2.eml create mode 100644 samples/sample_gh_issue_79_3.eml diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fbf530..d9c3b9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v1.17.3] +### Fixes +- Fix parsing bad message-id formats #79. + ## [v1.17.2] ### Fixes - When serialising RFC822 payloads, use a custom policy which has no limits on line-lenthgs as this breaks badly encoded messages. diff --git a/eml_parser/eml_parser.py b/eml_parser/eml_parser.py index 85a4624..c1a3a6f 100644 --- a/eml_parser/eml_parser.py +++ b/eml_parser/eml_parser.py @@ -242,6 +242,21 @@ def parse_email(self) -> dict: if self.msg is None: raise ValueError('msg is not set.') + # Loop over raw header values in order to fix them and prevent the parser from failing + for k, v in self.msg._headers: # pylint: disable=protected-access + # workaround for bad message-id formats + if k.lower() == 'message-id' and not eml_parser.regexes.email_regex.match(v): + # try workaround for bad message-id formats + if m := eml_parser.regexes.email_regex.search(v): + try: + self.msg.replace_header(k, m.group(1)) + except KeyError: + # header found multiple times and previously removed + self.msg.add_header(k, m.group(1)) + else: + del self.msg[k] + logger.warning('Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.') + # parse and decode subject subject = self.msg.get('subject', '') headers_struc['subject'] = eml_parser.decode.decode_field(subject) @@ -264,7 +279,7 @@ def parse_email(self) -> dict: logger.exception('We hit bug 27257!') _from = eml_parser.decode.workaround_bug_27257(self.msg, 'from') - self.msg.__delitem__('from') + del self.msg['from'] if _from: self.msg.add_header('from', _from[0]) diff --git a/samples/sample_gh_issue_79_1.eml b/samples/sample_gh_issue_79_1.eml new file mode 100644 index 0000000..6f6c4a2 --- /dev/null +++ b/samples/sample_gh_issue_79_1.eml @@ -0,0 +1,5 @@ +From: Test +To: Test +Message-ID: <[id@domain.com]> + +Message Body \ No newline at end of file diff --git a/samples/sample_gh_issue_79_2.eml b/samples/sample_gh_issue_79_2.eml new file mode 100644 index 0000000..533c7ea --- /dev/null +++ b/samples/sample_gh_issue_79_2.eml @@ -0,0 +1,6 @@ +From: Test +To: Test +Message-ID: <[id-domain.com]> +Message-ID: <[id@domain.com]> + +Message Body \ No newline at end of file diff --git a/samples/sample_gh_issue_79_3.eml b/samples/sample_gh_issue_79_3.eml new file mode 100644 index 0000000..479d6e9 --- /dev/null +++ b/samples/sample_gh_issue_79_3.eml @@ -0,0 +1,5 @@ +From: Test +To: Test +Message-ID: <[id-domain.com]> + +Message Body \ No newline at end of file diff --git a/tests/test_emlparser.py b/tests/test_emlparser.py index ca49378..a11e2e8 100644 --- a/tests/test_emlparser.py +++ b/tests/test_emlparser.py @@ -456,3 +456,25 @@ def test_parse_email_newline_quopri(self): assert output['header']['header']['from'] == ['\n '] assert output['header']['header']['to'] == ['\n '] assert output['header']['header']['cc'] == ['\r '] + + def test_parse_email_bad_message_id(self): + """Parse bad message-id format.""" + ep = eml_parser.eml_parser.EmlParser() + sample_1 = samples_dir / 'sample_gh_issue_79_1.eml' + sample_2 = samples_dir / 'sample_gh_issue_79_2.eml' + sample_3 = samples_dir / 'sample_gh_issue_79_3.eml' + + with sample_1.open('rb') as fhdl: + output_1 = ep.decode_email_bytes(fhdl.read()) + + assert output_1['header']['header']['message-id'] == ['id@domain.com'] + + with sample_2.open('rb') as fhdl: + output_2 = ep.decode_email_bytes(fhdl.read()) + + assert output_2['header']['header']['message-id'] == ['id@domain.com'] + + with sample_3.open('rb') as fhdl: + output_3 = ep.decode_email_bytes(fhdl.read()) + + assert 'message-id' not in output_3['header']['header'] From 97619c454115c11154ed481782cf0297ae5c2cc8 Mon Sep 17 00:00:00 2001 From: Georges Toth Date: Fri, 28 Oct 2022 22:35:27 +0200 Subject: [PATCH 2/2] fix py 3.7 incompatibility --- eml_parser/eml_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eml_parser/eml_parser.py b/eml_parser/eml_parser.py index c1a3a6f..5f43c1f 100644 --- a/eml_parser/eml_parser.py +++ b/eml_parser/eml_parser.py @@ -247,7 +247,8 @@ def parse_email(self) -> dict: # workaround for bad message-id formats if k.lower() == 'message-id' and not eml_parser.regexes.email_regex.match(v): # try workaround for bad message-id formats - if m := eml_parser.regexes.email_regex.search(v): + m = eml_parser.regexes.email_regex.search(v) + if m: try: self.msg.replace_header(k, m.group(1)) except KeyError: