GOVCERT-LU · sim0nx · Oct 31, 2022 · Oct 28, 2022 · Oct 28, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file.
 
 This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v1.17.3]
+### Fixes
+- Fix parsing of malformed message-id headers (#79).
+
 ## [v1.17.2]
 ### Fixes
 - When serialising RFC822 payloads, use a custom policy which has no limits on line-lenthgs as this breaks badly encoded messages.

diff --git a/eml_parser/eml_parser.py b/eml_parser/eml_parser.py
@@ -242,6 +242,22 @@ def parse_email(self) -> dict:
         if self.msg is None:
             raise ValueError('msg is not set.')
 
+        # Loop over raw header values in order to fix them and prevent the parser from failing
+        for k, v in self.msg._headers:  # pylint: disable=protected-access
+            # workaround for bad message-id formats
+            if k.lower() == 'message-id' and not eml_parser.regexes.email_regex.match(v):
+                # try workaround for bad message-id formats
+                m = eml_parser.regexes.email_regex.search(v)
+                if m:
+                    try:
+                        self.msg.replace_header(k, m.group(1))
+                    except KeyError:
+                        # header found multiple times and previously removed
+                        self.msg.add_header(k, m.group(1))
+                else:
+                    del self.msg[k]
+                    logger.warning('Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.')
+
         # parse and decode subject
         subject = self.msg.get('subject', '')
         headers_struc['subject'] = eml_parser.decode.decode_field(subject)
@@ -264,7 +280,7 @@ def parse_email(self) -> dict:
             logger.exception('We hit bug 27257!')
 
             _from = eml_parser.decode.workaround_bug_27257(self.msg, 'from')
-            self.msg.__delitem__('from')
+            del self.msg['from']
 
             if _from:
                 self.msg.add_header('from', _from[0])

diff --git a/samples/sample_gh_issue_79_1.eml b/samples/sample_gh_issue_79_1.eml
@@ -0,0 +1,5 @@
+From: Test <test@example.com>
+To: Test <test@example.com>
+Message-ID: <[id@domain.com]>
+
+Message Body
diff --git a/samples/sample_gh_issue_79_2.eml b/samples/sample_gh_issue_79_2.eml
@@ -0,0 +1,6 @@
+From: Test <test@example.com>
+To: Test <test@example.com>
+Message-ID: <[id-domain.com]>
+Message-ID: <[id@domain.com]>
+
+Message Body
diff --git a/samples/sample_gh_issue_79_3.eml b/samples/sample_gh_issue_79_3.eml
@@ -0,0 +1,5 @@
+From: Test <test@example.com>
+To: Test <test@example.com>
+Message-ID: <[id-domain.com]>
+
+Message Body
diff --git a/tests/test_emlparser.py b/tests/test_emlparser.py
@@ -456,3 +456,25 @@ def test_parse_email_newline_quopri(self):
         assert output['header']['header']['from'] == ['\n <badname@example.com>']
         assert output['header']['header']['to'] == ['\n <badname@example.com>']
         assert output['header']['header']['cc'] == ['\r <badname@example.com>']
+
+    def test_parse_email_bad_message_id(self):
+        """Check that malformed Message-ID headers are repaired when possible and dropped otherwise."""
+        ep = eml_parser.eml_parser.EmlParser()
+        sample_1 = samples_dir / 'sample_gh_issue_79_1.eml'
+        sample_2 = samples_dir / 'sample_gh_issue_79_2.eml'
+        sample_3 = samples_dir / 'sample_gh_issue_79_3.eml'
+        # Sample 1: a single '<[id@domain.com]>' header; the embedded address is expected to be extracted.
+        with sample_1.open('rb') as fhdl:
+            output_1 = ep.decode_email_bytes(fhdl.read())
+
+        assert output_1['header']['header']['message-id'] == ['id@domain.com']
+        # Sample 2: two Message-ID headers, '<[id-domain.com]>' then '<[id@domain.com]>'; only the repaired value remains.
+        with sample_2.open('rb') as fhdl:
+            output_2 = ep.decode_email_bytes(fhdl.read())
+
+        assert output_2['header']['header']['message-id'] == ['id@domain.com']
+        # Sample 3: a single '<[id-domain.com]>' header with no address to recover; the header is expected to be dropped.
+        with sample_3.open('rb') as fhdl:
+            output_3 = ep.decode_email_bytes(fhdl.read())
+
+        assert 'message-id' not in output_3['header']['header']