From b68e28eab9a3f563261ca296e9451682269fd42e Mon Sep 17 00:00:00 2001 From: Georges Toth Date: Tue, 1 Nov 2022 02:27:34 +0100 Subject: [PATCH] custom policy (#82) * - Remove obsolete methods. - Rework hashing wrapper methods. - Add a custom e-mail parsing policy for fixing invalid values as soon as possible. - Currently implemented for invalid message-id and date parsing. * fix typo --- CHANGELOG.md | 9 +++ eml_parser/decode.py | 7 ++- eml_parser/parser.py | 131 ++++++++++++++++++---------------------- tests/test_emlparser.py | 8 +-- 4 files changed, 76 insertions(+), 79 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a6b89a..aecea4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,15 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v1.17.5] +### Changes +- Remove obsolete methods. +- Rework hashing wrapper methods. + +### Added +- Add a custom e-mail parsing policy for fixing invalid values as soon as possible. + - Currently implemented for invalid message-id and date parsing. + ## [v1.17.4] ### Changes - Renamed eml_parser.eml_parser to eml_parser.parser to make imports safer. This should not break any usage but nonetheless diff --git a/eml_parser/decode.py b/eml_parser/decode.py index 0e0fa59..57dca6f 100644 --- a/eml_parser/decode.py +++ b/eml_parser/decode.py @@ -52,6 +52,8 @@ logger = logging.getLogger(__name__) +default_date = '1970-01-01T00:00:00+00:00' + def decode_field(field: str) -> str: """Try to get the specified field using the Header module. @@ -199,11 +201,10 @@ def robust_string2date(line: str) -> datetime.datetime: datetime.datetime: Returns a datetime.datetime object. """ # "." -> ":" replacement is for fixing bad clients (e.g. 
outlook express) - default_date = '1970-01-01T00:00:00+0000' # if the input is empty, we return a default date if line == '': - return dateutil.parser.parse(default_date) + return datetime.datetime.fromisoformat(default_date) try: date_ = email.utils.parsedate_to_datetime(line) @@ -214,7 +215,7 @@ def robust_string2date(line: str) -> datetime.datetime: date_ = dateutil.parser.parse(line) except (AttributeError, ValueError, OverflowError): # Now we are facing an invalid date. - return dateutil.parser.parse(default_date) + return datetime.datetime.fromisoformat(default_date) if date_.tzname() is None: return date_.replace(tzinfo=datetime.timezone.utc) diff --git a/eml_parser/parser.py b/eml_parser/parser.py index c33656a..7b73536 100644 --- a/eml_parser/parser.py +++ b/eml_parser/parser.py @@ -10,6 +10,7 @@ import binascii import collections import collections.abc +import datetime import email import email.headerregistry import email.message @@ -27,7 +28,6 @@ from collections import Counter from html import unescape -import dateutil.parser import publicsuffixlist import eml_parser.decode @@ -82,6 +82,38 @@ __license__ = 'AGPL v3+' +class CustomPolicy(email.policy.EmailPolicy): + """Custom parsing policy based on the default policy but relaxing some checks and early fixing invalid values.""" + + def __init__(self) -> None: + """Constructor.""" + super().__init__(max_line_length=0, refold_source='none') + + def header_fetch_parse(self, name: str, value: str) -> str: + """Early fix parsing issues and pass the name/value to the parent header_fetch_parse method for proper parsing.""" + header = name.lower() + + if header == 'message-id': + if '[' in value and not eml_parser.regexes.email_regex.match(value): + # try workaround for bad message-id formats + m = eml_parser.regexes.email_regex.search(value) + if m: + value = f'<{m.group(1)}>' + else: + value = '' + logger.warning('Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.') + 
elif header == 'date': + try: + value = super().header_fetch_parse(name, value) + except TypeError: + logger.warning('Error parsing date.', exc_info=True) + return eml_parser.decode.default_date + + return eml_parser.decode.robust_string2date(value).isoformat() + + return super().header_fetch_parse(name, value) + + class EmlParser: """eml-parser class.""" @@ -90,7 +122,7 @@ def __init__(self, include_raw_body: bool = False, include_attachment_data: bool = False, pconf: typing.Optional[dict] = None, - policy: email.policy.Policy = email.policy.default, + policy: typing.Optional[email.policy.Policy] = None, ignore_bad_start: bool = False, email_force_tld: bool = False, domain_force_tld: bool = False, @@ -110,8 +142,8 @@ def __init__(self, returned structure. Default is False. pconf (dict, optional): A dict with various optional configuration parameters, e.g. whitelist IPs, whitelist e-mail addresses, etc. - policy (email.policy.Policy, optional): Policy to use when parsing e-mails. - Default = email.policy.default. + policy (CustomPolicy, optional): Policy to use when parsing e-mails. + Default = CustomPolicy. ignore_bad_start (bool, optional): Ignore invalid file start. This has a considerable performance impact. email_force_tld (bool, optional): Only match e-mail addresses with a TLD, i.e. exclude something like john@doe. 
If enabled, it uses domain_force_tld and ip_force_routable settings @@ -131,7 +163,7 @@ def __init__(self, self.include_attachment_data = include_attachment_data # If no pconf was specified, default to empty dict self.pconf = pconf or {} - self.policy = policy + self.policy = policy or CustomPolicy() self.ignore_bad_start = ignore_bad_start self.email_force_tld = email_force_tld self.domain_force_tld = domain_force_tld @@ -229,22 +261,6 @@ def parse_email(self) -> dict: if self.msg is None: raise ValueError('msg is not set.') - # Loop over raw header values in order to fix them and prevent the parser from failing - for k, v in self.msg._headers: # type: ignore # pylint: disable=protected-access - # workaround for bad message-id formats - if k.lower() == 'message-id' and not eml_parser.regexes.email_regex.match(v): - # try workaround for bad message-id formats - m = eml_parser.regexes.email_regex.search(v) - if m: - try: - self.msg.replace_header(k, m.group(1)) - except KeyError: - # header found multiple times and previously removed - self.msg.add_header(k, m.group(1)) - else: - del self.msg[k] - logger.warning('Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.') - # parse and decode subject subject = self.msg.get('subject', '') headers_struc['subject'] = eml_parser.decode.decode_field(subject) @@ -310,18 +326,11 @@ def parse_email(self) -> dict: # parse and decode Date # If date field is present if 'date' in self.msg: - try: - msg_date = self.msg.get('date') - except TypeError: - logger.warning('Error parsing date.', exc_info=True) - headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000') - self.msg.replace_header('date', headers_struc['date']) - else: - headers_struc['date'] = eml_parser.decode.robust_string2date(msg_date) - + msg_date = self.msg.get('date') + headers_struc['date'] = datetime.datetime.fromisoformat(msg_date) else: # If date field is absent... 
- headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000') + headers_struc['date'] = datetime.datetime.fromisoformat(eml_parser.decode.default_date) # mail receiver path / parse any domain, e-mail # @TODO parse case where domain is specified but in parentheses only an IP @@ -510,22 +519,22 @@ def parse_email(self) -> dict: if list_observed_urls: bodie['uri_hash'] = [] for element in list_observed_urls: - bodie['uri_hash'].append(self.wrap_hash_sha256(element.lower())) + bodie['uri_hash'].append(self.get_hash(element.lower(), 'sha256')) if list_observed_email: bodie['email_hash'] = [] for element in list_observed_email: # Email already lowered - bodie['email_hash'].append(self.wrap_hash_sha256(element)) + bodie['email_hash'].append(self.get_hash(element, 'sha256')) if list_observed_dom: bodie['domain_hash'] = [] # for uri in list(set(list_observed_dom)): for element in list_observed_dom: - bodie['domain_hash'].append(self.wrap_hash_sha256(element)) + bodie['domain_hash'].append(self.get_hash(element, 'sha256')) if list_observed_ip: bodie['ip_hash'] = [] for element in list_observed_ip: # IP (v6) already lowered - bodie['ip_hash'].append(self.wrap_hash_sha256(element)) + bodie['ip_hash'].append(self.get_hash(element, 'sha256')) # For mail without multipart we will only get the "content....something" headers # all other headers are in "header" @@ -855,29 +864,6 @@ def headeremail2list(self, header: str) -> typing.List[str]: return return_field - # Iterator that give all position of a given pattern (no regex) - # @FIXME: Is this still required - # Error may occur when using unicode-literals or python 3 on dirty emails - # Need to check if buffer is a clean one - # may be tested with this byte code: - # -> 00000b70 61 6c 20 32 39 b0 20 6c 75 67 6c 69 6f 20 32 30 |al 29. luglio 20| - # Should crash on "B0". - @staticmethod - def findall(pat: str, data: str) -> typing.Iterator[int]: - """Iterator that give all position of a given pattern (no regex). 
- - Args: - pat (str): Pattern to seek - data (str): buffer - - Yields: - int: Yields the next position - """ - i = data.find(pat) - while i != -1: - yield i - i = data.find(pat, i + 1) - def get_raw_body_text(self, msg: email.message.Message, boundary: typing.Optional[str] = None) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any, typing.Optional[str]]]: """This method recursively retrieves all e-mail body parts and returns them as a list. @@ -947,29 +933,30 @@ def get_file_hash(data: bytes) -> typing.Dict[str, str]: dict: Returns a dict with as key the hash-type and value the calculated hash. """ hashalgo = ['md5', 'sha1', 'sha256', 'sha512'] - hash_ = {} - - for k in hashalgo: - ha = getattr(hashlib, k) - h = ha() - h.update(data) - hash_[k] = h.hexdigest() - - return hash_ + return {k: EmlParser.get_hash(data, k) for k in hashalgo} @staticmethod - def wrap_hash_sha256(string: str) -> str: - """Generate a SHA256 hash for a given string. + def get_hash(value: typing.Union[str, bytes], hash_type: str) -> str: + """Generate a hash of type *hash_type* for a given value. Args: - string (str): String to calculate the hash on. + value: String or bytes object to calculate the hash on. + hash_type: Hash type to use, can be any of 'md5', 'sha1', 'sha256', 'sha512'. Returns: str: Returns the calculated hash as a string. """ - _string = string.encode('utf-8') + if hash_type not in ('md5', 'sha1', 'sha256', 'sha512'): + raise ValueError(f'Invalid hash type requested - "{hash_type}"') + + if isinstance(value, str): + _value = value.encode('utf-8') + else: + _value = value + + hash_algo = getattr(hashlib, hash_type) - return hashlib.sha256(_string).hexdigest() + return hash_algo(_value).hexdigest() def traverse_multipart(self, msg: email.message.Message, counter: int = 0) -> typing.Dict[str, typing.Any]: """Recursively traverses all e-mail message multi-part elements and returns in a parsed form as a dict. 
diff --git a/tests/test_emlparser.py b/tests/test_emlparser.py index b6c9c7e..a5d2bce 100644 --- a/tests/test_emlparser.py +++ b/tests/test_emlparser.py @@ -97,8 +97,8 @@ def test_get_file_hash(self): assert eml_parser.EmlParser.get_file_hash(raw_email) == pre_computed_hashes def test_wrap_hash_sha256(self): - assert eml_parser.EmlParser.wrap_hash_sha256( - 'www.example.com') == '80fc0fb9266db7b83f85850fa0e6548b6d70ee68c8b5b412f1deea6ebdef0404' + assert eml_parser.EmlParser.get_hash( + 'www.example.com', 'sha256') == '80fc0fb9266db7b83f85850fa0e6548b6d70ee68c8b5b412f1deea6ebdef0404' def test_get_uri_ondata(self): test_urls = '''Lorem ipsum dolor sit amet, consectetur adipiscing elit. @@ -467,12 +467,12 @@ def test_parse_email_bad_message_id(self): with sample_1.open('rb') as fhdl: output_1 = ep.decode_email_bytes(fhdl.read()) - assert output_1['header']['header']['message-id'] == ['id@domain.com'] + assert output_1['header']['header']['message-id'] == [''] with sample_2.open('rb') as fhdl: output_2 = ep.decode_email_bytes(fhdl.read()) - assert output_2['header']['header']['message-id'] == ['id@domain.com'] + assert output_2['header']['header']['message-id'] == [''] with sample_3.open('rb') as fhdl: output_3 = ep.decode_email_bytes(fhdl.read())