Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parsing bad message-id formats #79; fixes #79 #80

Merged
merged 2 commits into from
Oct 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file.

This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v1.17.3]
### Fixes
- Fix parsing bad message-id formats #79.

## [v1.17.2]
### Fixes
- When serialising RFC822 payloads, use a custom policy which has no limits on line-lengths as this breaks badly encoded messages.
Expand Down
18 changes: 17 additions & 1 deletion eml_parser/eml_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,22 @@ def parse_email(self) -> dict:
if self.msg is None:
raise ValueError('msg is not set.')

# Loop over raw header values in order to fix them and prevent the parser from failing
for k, v in self.msg._headers: # pylint: disable=protected-access
# workaround for bad message-id formats
if k.lower() == 'message-id' and not eml_parser.regexes.email_regex.match(v):
# try workaround for bad message-id formats
m = eml_parser.regexes.email_regex.search(v)
if m:
try:
self.msg.replace_header(k, m.group(1))
except KeyError:
# header found multiple times and previously removed
self.msg.add_header(k, m.group(1))
else:
del self.msg[k]
logger.warning('Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.')

# parse and decode subject
subject = self.msg.get('subject', '')
headers_struc['subject'] = eml_parser.decode.decode_field(subject)
Expand All @@ -264,7 +280,7 @@ def parse_email(self) -> dict:
logger.exception('We hit bug 27257!')

_from = eml_parser.decode.workaround_bug_27257(self.msg, 'from')
self.msg.__delitem__('from')
del self.msg['from']

if _from:
self.msg.add_header('from', _from[0])
Expand Down
5 changes: 5 additions & 0 deletions samples/sample_gh_issue_79_1.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
From: Test <test@example.com>
To: Test <test@example.com>
Message-ID: <[id@domain.com]>

Message Body
6 changes: 6 additions & 0 deletions samples/sample_gh_issue_79_2.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
From: Test <test@example.com>
To: Test <test@example.com>
Message-ID: <[id-domain.com]>
Message-ID: <[id@domain.com]>

Message Body
5 changes: 5 additions & 0 deletions samples/sample_gh_issue_79_3.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
From: Test <test@example.com>
To: Test <test@example.com>
Message-ID: <[id-domain.com]>

Message Body
22 changes: 22 additions & 0 deletions tests/test_emlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,3 +456,25 @@ def test_parse_email_newline_quopri(self):
assert output['header']['header']['from'] == ['\n <badname@example.com>']
assert output['header']['header']['to'] == ['\n <badname@example.com>']
assert output['header']['header']['cc'] == ['\r <badname@example.com>']

def test_parse_email_bad_message_id(self):
    """Verify handling of malformed Message-ID headers (GitHub issue #79).

    Three sample messages are parsed with one shared parser instance:
    a single bracket-wrapped id containing an address, a duplicated
    header where only the second value contains an address, and a single
    value that contains no address at all.
    """
    parser = eml_parser.eml_parser.EmlParser()

    def header_fields(sample_name):
        # Feed the raw bytes of the sample to the parser; the file is closed
        # again before the parsed structure is inspected.
        with (samples_dir / sample_name).open('rb') as handle:
            parsed = parser.decode_email_bytes(handle.read())

        return parsed['header']['header']

    # '<[id@domain.com]>' -> the address inside the brackets is kept.
    fields_1 = header_fields('sample_gh_issue_79_1.eml')
    assert fields_1['message-id'] == ['id@domain.com']

    # Two message-id headers, only the second one is recoverable.
    fields_2 = header_fields('sample_gh_issue_79_2.eml')
    assert fields_2['message-id'] == ['id@domain.com']

    # '<[id-domain.com]>' has no recoverable address, so no message-id remains.
    fields_3 = header_fields('sample_gh_issue_79_3.eml')
    assert 'message-id' not in fields_3