From 221a02b2ad8a5f4f213654969388b811c20e44a4 Mon Sep 17 00:00:00 2001 From: Georges Toth Date: Fri, 28 Oct 2022 16:56:04 +0200 Subject: [PATCH] Fix parsing bad message-id formats #79; fixes #79 --- CHANGELOG.md | 4 ++++ eml_parser/eml_parser.py | 17 ++++++++++++++++- samples/sample_gh_issue_79_1.eml | 5 +++++ samples/sample_gh_issue_79_2.eml | 6 ++++++ samples/sample_gh_issue_79_3.eml | 5 +++++ tests/test_emlparser.py | 22 ++++++++++++++++++++++ 6 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 samples/sample_gh_issue_79_1.eml create mode 100644 samples/sample_gh_issue_79_2.eml create mode 100644 samples/sample_gh_issue_79_3.eml diff --git a/CHANGELOG.md b/CHANGELOG.md index 4517545..5dd1219 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v1.17.2] +### Fixes +- Fix parsing bad message-id formats #79. + ## [v1.17.1] ### Fixes - Fix issue #76 "If a CR or LF is found in a malformed email address header fields (From/To/etc.), the ValueError breaks the parsing." (@malvidin, @cccs-rs) diff --git a/eml_parser/eml_parser.py b/eml_parser/eml_parser.py index d248b61..b81bb90 100644 --- a/eml_parser/eml_parser.py +++ b/eml_parser/eml_parser.py @@ -242,6 +242,21 @@ def parse_email(self) -> dict: if self.msg is None: raise ValueError('msg is not set.') + # Loop over raw header values in order to fix them and prevent the parser from failing + for k, v in self.msg._headers: # pylint: disable=protected-access + # workaround for bad message-id formats + if k.lower() == 'message-id' and not eml_parser.regexes.email_regex.match(v): + # try workaround for bad message-id formats + if m := eml_parser.regexes.email_regex.search(v): + try: + self.msg.replace_header(k, m.group(1)) + except KeyError: + # header found multiple times and previously removed + self.msg.add_header(k, m.group(1)) + else: + del self.msg[k] + logger.warning('Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.') + # parse and decode subject subject = self.msg.get('subject', '') headers_struc['subject'] = eml_parser.decode.decode_field(subject) @@ -264,7 +279,7 @@ def parse_email(self) -> dict: logger.exception('We hit bug 27257!') _from = eml_parser.decode.workaround_bug_27257(self.msg, 'from') - self.msg.__delitem__('from') + del self.msg['from'] if _from: self.msg.add_header('from', _from[0]) diff --git a/samples/sample_gh_issue_79_1.eml b/samples/sample_gh_issue_79_1.eml new file mode 100644 index 0000000..6f6c4a2 --- /dev/null +++ b/samples/sample_gh_issue_79_1.eml @@ -0,0 +1,5 @@ +From: Test +To: Test +Message-ID: <[id@domain.com]> + +Message Body \ No newline at end of file diff --git a/samples/sample_gh_issue_79_2.eml b/samples/sample_gh_issue_79_2.eml new file mode 100644 index 0000000..533c7ea --- /dev/null +++ b/samples/sample_gh_issue_79_2.eml @@ -0,0 +1,6 @@ +From: Test +To: Test +Message-ID: <[id-domain.com]> +Message-ID: <[id@domain.com]> + +Message Body \ No newline at end of file diff --git a/samples/sample_gh_issue_79_3.eml b/samples/sample_gh_issue_79_3.eml new file mode 100644 index 0000000..479d6e9 --- /dev/null +++ b/samples/sample_gh_issue_79_3.eml @@ -0,0 +1,5 @@ +From: Test +To: Test +Message-ID: <[id-domain.com]> + +Message Body \ No newline at end of file diff --git a/tests/test_emlparser.py b/tests/test_emlparser.py index ca49378..a11e2e8 100644 --- a/tests/test_emlparser.py +++ b/tests/test_emlparser.py @@ -456,3 +456,25 @@ def test_parse_email_newline_quopri(self): assert output['header']['header']['from'] == ['\n '] assert output['header']['header']['to'] == ['\n '] assert output['header']['header']['cc'] == ['\r '] + + def test_parse_email_bad_message_id(self): + """Parse bad message-id format.""" + ep = eml_parser.eml_parser.EmlParser() + sample_1 = samples_dir / 'sample_gh_issue_79_1.eml' + sample_2 = samples_dir / 'sample_gh_issue_79_2.eml' + sample_3 = samples_dir / 'sample_gh_issue_79_3.eml' + + with sample_1.open('rb') as fhdl: + output_1 = ep.decode_email_bytes(fhdl.read()) + + assert output_1['header']['header']['message-id'] == ['id@domain.com'] + + with sample_2.open('rb') as fhdl: + output_2 = ep.decode_email_bytes(fhdl.read()) + + assert output_2['header']['header']['message-id'] == ['id@domain.com'] + + with sample_3.open('rb') as fhdl: + output_3 = ep.decode_email_bytes(fhdl.read()) + + assert 'message-id' not in output_3['header']['header']