From 1cf118b69f626fae390bfe2b16c8bbd066c64d87 Mon Sep 17 00:00:00 2001
From: Georges Toth <georges.toth@govcert.etat.lu>
Date: Fri, 28 Oct 2022 16:56:04 +0200
Subject: [PATCH 1/2] Fix parsing of bad message-id formats; fixes #79

---
 CHANGELOG.md                     |  4 ++++
 eml_parser/eml_parser.py         | 17 ++++++++++++++++-
 samples/sample_gh_issue_79_1.eml |  5 +++++
 samples/sample_gh_issue_79_2.eml |  6 ++++++
 samples/sample_gh_issue_79_3.eml |  5 +++++
 tests/test_emlparser.py          | 22 ++++++++++++++++++++++
 6 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 samples/sample_gh_issue_79_1.eml
 create mode 100644 samples/sample_gh_issue_79_2.eml
 create mode 100644 samples/sample_gh_issue_79_3.eml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9fbf530..d9c3b9c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file.
 
 This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v1.17.3]
+### Fixes
+- Fix parsing bad message-id formats #79.
+
 ## [v1.17.2]
 ### Fixes
 - When serialising RFC822 payloads, use a custom policy which has no limits on line-lenthgs as this breaks badly encoded messages.
diff --git a/eml_parser/eml_parser.py b/eml_parser/eml_parser.py
index 85a4624..c1a3a6f 100644
--- a/eml_parser/eml_parser.py
+++ b/eml_parser/eml_parser.py
@@ -242,6 +242,21 @@ def parse_email(self) -> dict:
         if self.msg is None:
             raise ValueError('msg is not set.')
 
+        # Loop over raw header values in order to fix them and prevent the parser from failing
+        for k, v in self.msg._headers:  # pylint: disable=protected-access
+            # workaround for bad message-id formats
+            if k.lower() == 'message-id' and not eml_parser.regexes.email_regex.match(v):
+                # try workaround for bad message-id formats
+                if m := eml_parser.regexes.email_regex.search(v):
+                    try:
+                        self.msg.replace_header(k, m.group(1))
+                    except KeyError:
+                        # header found multiple times and previously removed
+                        self.msg.add_header(k, m.group(1))
+                else:
+                    del self.msg[k]
+                    logger.warning('Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.')
+
         # parse and decode subject
         subject = self.msg.get('subject', '')
         headers_struc['subject'] = eml_parser.decode.decode_field(subject)
@@ -264,7 +279,7 @@ def parse_email(self) -> dict:
             logger.exception('We hit bug 27257!')
 
             _from = eml_parser.decode.workaround_bug_27257(self.msg, 'from')
-            self.msg.__delitem__('from')
+            del self.msg['from']
 
             if _from:
                 self.msg.add_header('from', _from[0])
diff --git a/samples/sample_gh_issue_79_1.eml b/samples/sample_gh_issue_79_1.eml
new file mode 100644
index 0000000..6f6c4a2
--- /dev/null
+++ b/samples/sample_gh_issue_79_1.eml
@@ -0,0 +1,5 @@
+From: Test <test@example.com>
+To: Test <test@example.com>
+Message-ID: <[id@domain.com]>
+
+Message Body
\ No newline at end of file
diff --git a/samples/sample_gh_issue_79_2.eml b/samples/sample_gh_issue_79_2.eml
new file mode 100644
index 0000000..533c7ea
--- /dev/null
+++ b/samples/sample_gh_issue_79_2.eml
@@ -0,0 +1,6 @@
+From: Test <test@example.com>
+To: Test <test@example.com>
+Message-ID: <[id-domain.com]>
+Message-ID: <[id@domain.com]>
+
+Message Body
\ No newline at end of file
diff --git a/samples/sample_gh_issue_79_3.eml b/samples/sample_gh_issue_79_3.eml
new file mode 100644
index 0000000..479d6e9
--- /dev/null
+++ b/samples/sample_gh_issue_79_3.eml
@@ -0,0 +1,5 @@
+From: Test <test@example.com>
+To: Test <test@example.com>
+Message-ID: <[id-domain.com]>
+
+Message Body
\ No newline at end of file
diff --git a/tests/test_emlparser.py b/tests/test_emlparser.py
index ca49378..a11e2e8 100644
--- a/tests/test_emlparser.py
+++ b/tests/test_emlparser.py
@@ -456,3 +456,25 @@ def test_parse_email_newline_quopri(self):
         assert output['header']['header']['from'] == ['\n <badname@example.com>']
         assert output['header']['header']['to'] == ['\n <badname@example.com>']
         assert output['header']['header']['cc'] == ['\r <badname@example.com>']
+
+    def test_parse_email_bad_message_id(self):
+        """Parse bad message-id format."""
+        ep = eml_parser.eml_parser.EmlParser()
+        sample_1 = samples_dir / 'sample_gh_issue_79_1.eml'
+        sample_2 = samples_dir / 'sample_gh_issue_79_2.eml'
+        sample_3 = samples_dir / 'sample_gh_issue_79_3.eml'
+
+        with sample_1.open('rb') as fhdl:
+            output_1 = ep.decode_email_bytes(fhdl.read())
+
+        assert output_1['header']['header']['message-id'] == ['id@domain.com']
+
+        with sample_2.open('rb') as fhdl:
+            output_2 = ep.decode_email_bytes(fhdl.read())
+
+        assert output_2['header']['header']['message-id'] == ['id@domain.com']
+
+        with sample_3.open('rb') as fhdl:
+            output_3 = ep.decode_email_bytes(fhdl.read())
+
+        assert 'message-id' not in output_3['header']['header']

From 97619c454115c11154ed481782cf0297ae5c2cc8 Mon Sep 17 00:00:00 2001
From: Georges Toth <georges.toth@govcert.etat.lu>
Date: Fri, 28 Oct 2022 22:35:27 +0200
Subject: [PATCH 2/2] Fix Python 3.7 incompatibility (remove walrus operator)

---
 eml_parser/eml_parser.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/eml_parser/eml_parser.py b/eml_parser/eml_parser.py
index c1a3a6f..5f43c1f 100644
--- a/eml_parser/eml_parser.py
+++ b/eml_parser/eml_parser.py
@@ -247,7 +247,8 @@ def parse_email(self) -> dict:
             # workaround for bad message-id formats
             if k.lower() == 'message-id' and not eml_parser.regexes.email_regex.match(v):
                 # try workaround for bad message-id formats
-                if m := eml_parser.regexes.email_regex.search(v):
+                m = eml_parser.regexes.email_regex.search(v)
+                if m:
                     try:
                         self.msg.replace_header(k, m.group(1))
                     except KeyError: