Skip to content

Commit

Permalink
Fix parsing bad message-id formats #79; fixes #79
Browse files Browse the repository at this point in the history
  • Loading branch information
sim0nx committed Oct 28, 2022
1 parent 014ed69 commit 221a02b
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file.

This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v1.17.2]
### Fixes
- Fix parsing bad message-id formats #79.

## [v1.17.1]
### Fixes
- Fix issue #76 "If a CR or LF is found in a malformed email address header fields (From/To/etc.), the ValueError breaks the parsing." (@malvidin, @cccs-rs)
Expand Down
17 changes: 16 additions & 1 deletion eml_parser/eml_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,21 @@ def parse_email(self) -> dict:
if self.msg is None:
raise ValueError('msg is not set.')

# Loop over raw header values in order to fix them and prevent the parser from failing
for k, v in self.msg._headers: # pylint: disable=protected-access
# workaround for bad message-id formats
if k.lower() == 'message-id' and not eml_parser.regexes.email_regex.match(v):
# try workaround for bad message-id formats
if m := eml_parser.regexes.email_regex.search(v):
try:
self.msg.replace_header(k, m.group(1))
except KeyError:
# header found multiple times and previously removed
self.msg.add_header(k, m.group(1))
else:
del self.msg[k]
logger.warning('Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.')

# parse and decode subject
subject = self.msg.get('subject', '')
headers_struc['subject'] = eml_parser.decode.decode_field(subject)
Expand All @@ -264,7 +279,7 @@ def parse_email(self) -> dict:
logger.exception('We hit bug 27257!')

_from = eml_parser.decode.workaround_bug_27257(self.msg, 'from')
self.msg.__delitem__('from')
del self.msg['from']

if _from:
self.msg.add_header('from', _from[0])
Expand Down
5 changes: 5 additions & 0 deletions samples/sample_gh_issue_79_1.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
From: Test <test@example.com>
To: Test <test@example.com>
Message-ID: <[id@domain.com]>

Message Body
6 changes: 6 additions & 0 deletions samples/sample_gh_issue_79_2.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
From: Test <test@example.com>
To: Test <test@example.com>
Message-ID: <[id-domain.com]>
Message-ID: <[id@domain.com]>

Message Body
5 changes: 5 additions & 0 deletions samples/sample_gh_issue_79_3.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
From: Test <test@example.com>
To: Test <test@example.com>
Message-ID: <[id-domain.com]>

Message Body
22 changes: 22 additions & 0 deletions tests/test_emlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,3 +456,25 @@ def test_parse_email_newline_quopri(self):
assert output['header']['header']['from'] == ['\n <badname@example.com>']
assert output['header']['header']['to'] == ['\n <badname@example.com>']
assert output['header']['header']['cc'] == ['\r <badname@example.com>']

def test_parse_email_bad_message_id(self):
    """Verify how malformed Message-ID headers show up in the parsed output.

    Three sample mails are decoded with one shared parser instance:
    for the first two the parsed header must expose 'id@domain.com' as
    the only message-id, for the third the 'message-id' key must be
    missing from the parsed header altogether.
    """
    parser = eml_parser.eml_parser.EmlParser()

    # (sample file name, expected 'message-id' list); None means the key
    # must not be present in the parsed header at all.
    cases = (
        ('sample_gh_issue_79_1.eml', ['id@domain.com']),
        ('sample_gh_issue_79_2.eml', ['id@domain.com']),
        ('sample_gh_issue_79_3.eml', None),
    )

    for sample_name, expected_ids in cases:
        with (samples_dir / sample_name).open('rb') as fhdl:
            parsed = parser.decode_email_bytes(fhdl.read())

        parsed_header = parsed['header']['header']

        if expected_ids is None:
            assert 'message-id' not in parsed_header
        else:
            assert parsed_header['message-id'] == expected_ids

0 comments on commit 221a02b

Please sign in to comment.