Skip to content

Commit

Permalink
custom policy (#82)
Browse files Browse the repository at this point in the history
* - Remove obsolete methods.
- Rework hashing wrapper methods.

- Add a custom e-mail parsing policy for fixing invalid values as soon as possible.
  - Currently implemented for invalid message-id and date parsing.

* fix typo
  • Loading branch information
sim0nx authored Nov 1, 2022
1 parent 0518d5e commit b68e28e
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 79 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@ All notable changes to this project will be documented in this file.

This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v1.17.5]
### Changes
- Remove obsolete methods.
- Rework hashing wrapper methods.

### Added
- Add a custom e-mail parsing policy for fixing invalid values as soon as possible.
- Currently implemented for invalid message-id and date parsing.

## [v1.17.4]
### Changes
- Renamed eml_parser.eml_parser to eml_parser.parser to make imports safer. This should not break any usage but nonetheless
Expand Down
7 changes: 4 additions & 3 deletions eml_parser/decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@

logger = logging.getLogger(__name__)

default_date = '1970-01-01T00:00:00+00:00'


def decode_field(field: str) -> str:
"""Try to get the specified field using the Header module.
Expand Down Expand Up @@ -199,11 +201,10 @@ def robust_string2date(line: str) -> datetime.datetime:
datetime.datetime: Returns a datetime.datetime object.
"""
# "." -> ":" replacement is for fixing bad clients (e.g. outlook express)
default_date = '1970-01-01T00:00:00+0000'

# if the input is empty, we return a default date
if line == '':
return dateutil.parser.parse(default_date)
return datetime.datetime.fromisoformat(default_date)

try:
date_ = email.utils.parsedate_to_datetime(line)
Expand All @@ -214,7 +215,7 @@ def robust_string2date(line: str) -> datetime.datetime:
date_ = dateutil.parser.parse(line)
except (AttributeError, ValueError, OverflowError):
# Now we are facing an invalid date.
return dateutil.parser.parse(default_date)
return datetime.datetime.fromisoformat(default_date)

if date_.tzname() is None:
return date_.replace(tzinfo=datetime.timezone.utc)
Expand Down
131 changes: 59 additions & 72 deletions eml_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import binascii
import collections
import collections.abc
import datetime
import email
import email.headerregistry
import email.message
Expand All @@ -27,7 +28,6 @@
from collections import Counter
from html import unescape

import dateutil.parser
import publicsuffixlist

import eml_parser.decode
Expand Down Expand Up @@ -82,6 +82,38 @@
__license__ = 'AGPL v3+'


class CustomPolicy(email.policy.EmailPolicy):
    """Custom parsing policy based on the default policy but relaxing some checks and early fixing invalid values."""

    def __init__(self) -> None:
        """Constructor."""
        super().__init__(max_line_length=0, refold_source='none')

    def header_fetch_parse(self, name: str, value: str) -> str:
        """Early fix parsing issues and pass the name/value to the parent header_fetch_parse method for proper parsing."""
        lowered_name = name.lower()

        if lowered_name == 'date':
            # Dates are parsed (and repaired if needed) up-front and handed back as an ISO-8601 string.
            try:
                parsed = super().header_fetch_parse(name, value)
            except TypeError:
                logger.warning('Error parsing date.', exc_info=True)
                return eml_parser.decode.default_date

            return eml_parser.decode.robust_string2date(parsed).isoformat()

        if lowered_name == 'message-id' and '[' in value and not eml_parser.regexes.email_regex.match(value):
            # try workaround for bad message-id formats
            match = eml_parser.regexes.email_regex.search(value)
            if match is not None:
                value = f'<{match.group(1)}>'
            else:
                value = ''
                logger.warning('Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.')

        return super().header_fetch_parse(name, value)


class EmlParser:
"""eml-parser class."""

Expand All @@ -90,7 +122,7 @@ def __init__(self,
include_raw_body: bool = False,
include_attachment_data: bool = False,
pconf: typing.Optional[dict] = None,
policy: email.policy.Policy = email.policy.default,
policy: typing.Optional[email.policy.Policy] = None,
ignore_bad_start: bool = False,
email_force_tld: bool = False,
domain_force_tld: bool = False,
Expand All @@ -110,8 +142,8 @@ def __init__(self,
returned structure. Default is False.
pconf (dict, optional): A dict with various optional configuration parameters,
e.g. whitelist IPs, whitelist e-mail addresses, etc.
policy (email.policy.Policy, optional): Policy to use when parsing e-mails.
Default = email.policy.default.
policy (CustomPolicy, optional): Policy to use when parsing e-mails.
Default = CustomPolicy.
ignore_bad_start (bool, optional): Ignore invalid file start. This has a considerable performance impact.
email_force_tld (bool, optional): Only match e-mail addresses with a TLD, i.e. exclude something like
john@doe. If enabled, it uses domain_force_tld and ip_force_routable settings
Expand All @@ -131,7 +163,7 @@ def __init__(self,
self.include_attachment_data = include_attachment_data
# If no pconf was specified, default to empty dict
self.pconf = pconf or {}
self.policy = policy
self.policy = policy or CustomPolicy()
self.ignore_bad_start = ignore_bad_start
self.email_force_tld = email_force_tld
self.domain_force_tld = domain_force_tld
Expand Down Expand Up @@ -229,22 +261,6 @@ def parse_email(self) -> dict:
if self.msg is None:
raise ValueError('msg is not set.')

# Loop over raw header values in order to fix them and prevent the parser from failing
for k, v in self.msg._headers: # type: ignore # pylint: disable=protected-access
# workaround for bad message-id formats
if k.lower() == 'message-id' and not eml_parser.regexes.email_regex.match(v):
# try workaround for bad message-id formats
m = eml_parser.regexes.email_regex.search(v)
if m:
try:
self.msg.replace_header(k, m.group(1))
except KeyError:
# header found multiple times and previously removed
self.msg.add_header(k, m.group(1))
else:
del self.msg[k]
logger.warning('Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.')

# parse and decode subject
subject = self.msg.get('subject', '')
headers_struc['subject'] = eml_parser.decode.decode_field(subject)
Expand Down Expand Up @@ -310,18 +326,11 @@ def parse_email(self) -> dict:
# parse and decode Date
# If date field is present
if 'date' in self.msg:
try:
msg_date = self.msg.get('date')
except TypeError:
logger.warning('Error parsing date.', exc_info=True)
headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')
self.msg.replace_header('date', headers_struc['date'])
else:
headers_struc['date'] = eml_parser.decode.robust_string2date(msg_date)

msg_date = self.msg.get('date')
headers_struc['date'] = datetime.datetime.fromisoformat(msg_date)
else:
# If date field is absent...
headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')
headers_struc['date'] = datetime.datetime.fromisoformat(eml_parser.decode.default_date)

# mail receiver path / parse any domain, e-mail
# @TODO parse case where domain is specified but in parentheses only an IP
Expand Down Expand Up @@ -510,22 +519,22 @@ def parse_email(self) -> dict:
if list_observed_urls:
bodie['uri_hash'] = []
for element in list_observed_urls:
bodie['uri_hash'].append(self.wrap_hash_sha256(element.lower()))
bodie['uri_hash'].append(self.get_hash(element.lower(), 'sha256'))
if list_observed_email:
bodie['email_hash'] = []
for element in list_observed_email:
# Email already lowered
bodie['email_hash'].append(self.wrap_hash_sha256(element))
bodie['email_hash'].append(self.get_hash(element, 'sha256'))
if list_observed_dom:
bodie['domain_hash'] = []
# for uri in list(set(list_observed_dom)):
for element in list_observed_dom:
bodie['domain_hash'].append(self.wrap_hash_sha256(element))
bodie['domain_hash'].append(self.get_hash(element, 'sha256'))
if list_observed_ip:
bodie['ip_hash'] = []
for element in list_observed_ip:
# IP (v6) already lowered
bodie['ip_hash'].append(self.wrap_hash_sha256(element))
bodie['ip_hash'].append(self.get_hash(element, 'sha256'))

# For mail without multipart we will only get the "content....something" headers
# all other headers are in "header"
Expand Down Expand Up @@ -855,29 +864,6 @@ def headeremail2list(self, header: str) -> typing.List[str]:

return return_field

# Iterator that give all position of a given pattern (no regex)
# @FIXME: Is this still required
# Error may occur when using unicode-literals or python 3 on dirty emails
# Need to check if buffer is a clean one
# may be tested with this byte code:
# -> 00000b70 61 6c 20 32 39 b0 20 6c 75 67 6c 69 6f 20 32 30 |al 29. luglio 20|
# Should crash on "B0".
@staticmethod
def findall(pat: str, data: str) -> typing.Iterator[int]:
"""Iterator that give all position of a given pattern (no regex).
Args:
pat (str): Pattern to seek
data (str): buffer
Yields:
int: Yields the next position
"""
i = data.find(pat)
while i != -1:
yield i
i = data.find(pat, i + 1)

def get_raw_body_text(self, msg: email.message.Message, boundary: typing.Optional[str] = None) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any, typing.Optional[str]]]:
"""This method recursively retrieves all e-mail body parts and returns them as a list.
Expand Down Expand Up @@ -947,29 +933,30 @@ def get_file_hash(data: bytes) -> typing.Dict[str, str]:
dict: Returns a dict with as key the hash-type and value the calculated hash.
"""
hashalgo = ['md5', 'sha1', 'sha256', 'sha512']
hash_ = {}

for k in hashalgo:
ha = getattr(hashlib, k)
h = ha()
h.update(data)
hash_[k] = h.hexdigest()

return hash_
return {k: EmlParser.get_hash(data, k) for k in hashalgo}

@staticmethod
def wrap_hash_sha256(string: str) -> str:
"""Generate a SHA256 hash for a given string.
def get_hash(value: typing.Union[str, bytes], hash_type: str) -> str:
"""Generate a hash of type *hash_type* for a given value.
Args:
string (str): String to calculate the hash on.
value: String or bytes object to calculate the hash on.
hash_type: Hash type to use, can be any of 'md5', 'sha1', 'sha256', 'sha512'.
Returns:
str: Returns the calculated hash as a string.
"""
_string = string.encode('utf-8')
if hash_type not in ('md5', 'sha1', 'sha256', 'sha512'):
raise ValueError(f'Invalid hash type requested - "{hash_type}"')

if isinstance(value, str):
_value = value.encode('utf-8')
else:
_value = value

hash_algo = getattr(hashlib, hash_type)

return hashlib.sha256(_string).hexdigest()
return hash_algo(_value).hexdigest()

def traverse_multipart(self, msg: email.message.Message, counter: int = 0) -> typing.Dict[str, typing.Any]:
"""Recursively traverses all e-mail message multi-part elements and returns in a parsed form as a dict.
Expand Down
8 changes: 4 additions & 4 deletions tests/test_emlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ def test_get_file_hash(self):
assert eml_parser.EmlParser.get_file_hash(raw_email) == pre_computed_hashes

def test_wrap_hash_sha256(self):
assert eml_parser.EmlParser.wrap_hash_sha256(
'www.example.com') == '80fc0fb9266db7b83f85850fa0e6548b6d70ee68c8b5b412f1deea6ebdef0404'
assert eml_parser.EmlParser.get_hash(
'www.example.com', 'sha256') == '80fc0fb9266db7b83f85850fa0e6548b6d70ee68c8b5b412f1deea6ebdef0404'

def test_get_uri_ondata(self):
test_urls = '''Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Expand Down Expand Up @@ -467,12 +467,12 @@ def test_parse_email_bad_message_id(self):
with sample_1.open('rb') as fhdl:
output_1 = ep.decode_email_bytes(fhdl.read())

assert output_1['header']['header']['message-id'] == ['id@domain.com']
assert output_1['header']['header']['message-id'] == ['<id@domain.com>']

with sample_2.open('rb') as fhdl:
output_2 = ep.decode_email_bytes(fhdl.read())

assert output_2['header']['header']['message-id'] == ['id@domain.com']
assert output_2['header']['header']['message-id'] == ['<id@domain.com>']

with sample_3.open('rb') as fhdl:
output_3 = ep.decode_email_bytes(fhdl.read())
Expand Down

0 comments on commit b68e28e

Please sign in to comment.