From b68e28eab9a3f563261ca296e9451682269fd42e Mon Sep 17 00:00:00 2001
From: Georges Toth <georges.toth@govcert.etat.lu>
Date: Tue, 1 Nov 2022 02:27:34 +0100
Subject: [PATCH] custom policy (#82)

* - Remove obsolete methods.
- Rework hashing wrapper methods.

- Add a custom e-mail parsing policy for fixing invalid values as soon as possible.
  - Currently implemented for invalid message-id and date parsing.

* fix typo
---
 CHANGELOG.md            |   9 +++
 eml_parser/decode.py    |   7 ++-
 eml_parser/parser.py    | 131 ++++++++++++++++++----------------------
 tests/test_emlparser.py |   8 +--
 4 files changed, 76 insertions(+), 79 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a6b89a..aecea4d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,15 @@ All notable changes to this project will be documented in this file.
 
 This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v1.17.5]
+### Changes
+- Remove obsolete methods.
+- Rework hashing wrapper methods.
+
+### Added
+- Add a custom e-mail parsing policy for fixing invalid values as soon as possible.
+  - Currently implemented for invalid message-id and date parsing.
+
 ## [v1.17.4]
 ### Changes
 - Renamed eml_parser.eml_parser to eml_parser.parser to make imports safer. This should not break any usage but nonetheless
diff --git a/eml_parser/decode.py b/eml_parser/decode.py
index 0e0fa59..57dca6f 100644
--- a/eml_parser/decode.py
+++ b/eml_parser/decode.py
@@ -52,6 +52,8 @@
 
 logger = logging.getLogger(__name__)
 
+default_date = '1970-01-01T00:00:00+00:00'
+
 
 def decode_field(field: str) -> str:
     """Try to get the specified field using the Header module.
@@ -199,11 +201,10 @@ def robust_string2date(line: str) -> datetime.datetime:
         datetime.datetime: Returns a datetime.datetime object.
     """
     # "." -> ":" replacement is for fixing bad clients (e.g. outlook express)
-    default_date = '1970-01-01T00:00:00+0000'
 
     # if the input is empty, we return a default date
     if line == '':
-        return dateutil.parser.parse(default_date)
+        return datetime.datetime.fromisoformat(default_date)
 
     try:
         date_ = email.utils.parsedate_to_datetime(line)
@@ -214,7 +215,7 @@ def robust_string2date(line: str) -> datetime.datetime:
             date_ = dateutil.parser.parse(line)
         except (AttributeError, ValueError, OverflowError):
             # Now we are facing an invalid date.
-            return dateutil.parser.parse(default_date)
+            return datetime.datetime.fromisoformat(default_date)
 
     if date_.tzname() is None:
         return date_.replace(tzinfo=datetime.timezone.utc)
diff --git a/eml_parser/parser.py b/eml_parser/parser.py
index c33656a..7b73536 100644
--- a/eml_parser/parser.py
+++ b/eml_parser/parser.py
@@ -10,6 +10,7 @@
 import binascii
 import collections
 import collections.abc
+import datetime
 import email
 import email.headerregistry
 import email.message
@@ -27,7 +28,6 @@
 from collections import Counter
 from html import unescape
 
-import dateutil.parser
 import publicsuffixlist
 
 import eml_parser.decode
@@ -82,6 +82,38 @@
 __license__ = 'AGPL v3+'
 
 
+class CustomPolicy(email.policy.EmailPolicy):
+    """Custom parsing policy based on the default policy but relaxing some checks and early fixing invalid values."""
+
+    def __init__(self) -> None:
+        """Constructor."""
+        super().__init__(max_line_length=0, refold_source='none')
+
+    def header_fetch_parse(self, name: str, value: str) -> str:
+        """Early fix parsing issues and pass the name/value to the parent header_fetch_parse method for proper parsing."""
+        header = name.lower()
+
+        if header == 'message-id':
+            if '[' in value and not eml_parser.regexes.email_regex.match(value):
+                # try workaround for bad message-id formats
+                m = eml_parser.regexes.email_regex.search(value)
+                if m:
+                    value = f'<{m.group(1)}>'
+                else:
+                    value = ''
+                    logger.warning('Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.')
+        elif header == 'date':
+            try:
+                value = super().header_fetch_parse(name, value)
+            except TypeError:
+                logger.warning('Error parsing date.', exc_info=True)
+                return eml_parser.decode.default_date
+
+            return eml_parser.decode.robust_string2date(value).isoformat()
+
+        return super().header_fetch_parse(name, value)
+
+
 class EmlParser:
     """eml-parser class."""
 
@@ -90,7 +122,7 @@ def __init__(self,
                  include_raw_body: bool = False,
                  include_attachment_data: bool = False,
                  pconf: typing.Optional[dict] = None,
-                 policy: email.policy.Policy = email.policy.default,
+                 policy: typing.Optional[email.policy.Policy] = None,
                  ignore_bad_start: bool = False,
                  email_force_tld: bool = False,
                  domain_force_tld: bool = False,
@@ -110,8 +142,8 @@ def __init__(self,
                                                       returned structure. Default is False.
             pconf (dict, optional): A dict with various optional configuration parameters,
                                     e.g. whitelist IPs, whitelist e-mail addresses, etc.
-            policy (email.policy.Policy, optional): Policy to use when parsing e-mails.
-                                                    Default = email.policy.default.
+            policy (email.policy.Policy, optional): Policy to use when parsing e-mails.
+                                                    Default = CustomPolicy.
             ignore_bad_start (bool, optional): Ignore invalid file start. This has a considerable performance impact.
             email_force_tld (bool, optional): Only match e-mail addresses with a TLD, i.e. exclude something like
                                               john@doe. If enabled, it uses domain_force_tld and ip_force_routable settings
@@ -131,7 +163,7 @@ def __init__(self,
         self.include_attachment_data = include_attachment_data
         # If no pconf was specified, default to empty dict
         self.pconf = pconf or {}
-        self.policy = policy
+        self.policy = policy or CustomPolicy()
         self.ignore_bad_start = ignore_bad_start
         self.email_force_tld = email_force_tld
         self.domain_force_tld = domain_force_tld
@@ -229,22 +261,6 @@ def parse_email(self) -> dict:
         if self.msg is None:
             raise ValueError('msg is not set.')
 
-        # Loop over raw header values in order to fix them and prevent the parser from failing
-        for k, v in self.msg._headers:  # type: ignore # pylint: disable=protected-access
-            # workaround for bad message-id formats
-            if k.lower() == 'message-id' and not eml_parser.regexes.email_regex.match(v):
-                # try workaround for bad message-id formats
-                m = eml_parser.regexes.email_regex.search(v)
-                if m:
-                    try:
-                        self.msg.replace_header(k, m.group(1))
-                    except KeyError:
-                        # header found multiple times and previously removed
-                        self.msg.add_header(k, m.group(1))
-                else:
-                    del self.msg[k]
-                    logger.warning('Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.')
-
         # parse and decode subject
         subject = self.msg.get('subject', '')
         headers_struc['subject'] = eml_parser.decode.decode_field(subject)
@@ -310,18 +326,11 @@ def parse_email(self) -> dict:
         # parse and decode Date
         # If date field is present
         if 'date' in self.msg:
-            try:
-                msg_date = self.msg.get('date')
-            except TypeError:
-                logger.warning('Error parsing date.', exc_info=True)
-                headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')
-                self.msg.replace_header('date', headers_struc['date'])
-            else:
-                headers_struc['date'] = eml_parser.decode.robust_string2date(msg_date)
-
+            msg_date = self.msg.get('date')
+            headers_struc['date'] = datetime.datetime.fromisoformat(msg_date)
         else:
             # If date field is absent...
-            headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')
+            headers_struc['date'] = datetime.datetime.fromisoformat(eml_parser.decode.default_date)
 
         # mail receiver path / parse any domain, e-mail
         # @TODO parse case where domain is specified but in parentheses only an IP
@@ -510,22 +519,22 @@ def parse_email(self) -> dict:
                 if list_observed_urls:
                     bodie['uri_hash'] = []
                     for element in list_observed_urls:
-                        bodie['uri_hash'].append(self.wrap_hash_sha256(element.lower()))
+                        bodie['uri_hash'].append(self.get_hash(element.lower(), 'sha256'))
                 if list_observed_email:
                     bodie['email_hash'] = []
                     for element in list_observed_email:
                         # Email already lowered
-                        bodie['email_hash'].append(self.wrap_hash_sha256(element))
+                        bodie['email_hash'].append(self.get_hash(element, 'sha256'))
                 if list_observed_dom:
                     bodie['domain_hash'] = []
                     # for uri in list(set(list_observed_dom)):
                     for element in list_observed_dom:
-                        bodie['domain_hash'].append(self.wrap_hash_sha256(element))
+                        bodie['domain_hash'].append(self.get_hash(element, 'sha256'))
                 if list_observed_ip:
                     bodie['ip_hash'] = []
                     for element in list_observed_ip:
                         # IP (v6) already lowered
-                        bodie['ip_hash'].append(self.wrap_hash_sha256(element))
+                        bodie['ip_hash'].append(self.get_hash(element, 'sha256'))
 
             # For mail without multipart we will only get the "content....something" headers
             # all other headers are in "header"
@@ -855,29 +864,6 @@ def headeremail2list(self, header: str) -> typing.List[str]:
 
         return return_field
 
-    # Iterator that give all position of a given pattern (no regex)
-    # @FIXME: Is this still required
-    # Error may occur when using unicode-literals or python 3 on dirty emails
-    # Need to check if buffer is a clean one
-    # may be tested with this byte code:
-    # -> 00000b70  61 6c 20 32 39 b0 20 6c  75 67 6c 69 6f 20 32 30  |al 29. luglio 20|
-    # Should crash on "B0".
-    @staticmethod
-    def findall(pat: str, data: str) -> typing.Iterator[int]:
-        """Iterator that give all position of a given pattern (no regex).
-
-        Args:
-            pat (str): Pattern to seek
-            data (str): buffer
-
-        Yields:
-            int: Yields the next position
-        """
-        i = data.find(pat)
-        while i != -1:
-            yield i
-            i = data.find(pat, i + 1)
-
     def get_raw_body_text(self, msg: email.message.Message, boundary: typing.Optional[str] = None) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any, typing.Optional[str]]]:
         """This method recursively retrieves all e-mail body parts and returns them as a list.
 
@@ -947,29 +933,30 @@ def get_file_hash(data: bytes) -> typing.Dict[str, str]:
           dict: Returns a dict with as key the hash-type and value the calculated hash.
         """
         hashalgo = ['md5', 'sha1', 'sha256', 'sha512']
-        hash_ = {}
-
-        for k in hashalgo:
-            ha = getattr(hashlib, k)
-            h = ha()
-            h.update(data)
-            hash_[k] = h.hexdigest()
-
-        return hash_
+        return {k: EmlParser.get_hash(data, k) for k in hashalgo}
 
     @staticmethod
-    def wrap_hash_sha256(string: str) -> str:
-        """Generate a SHA256 hash for a given string.
+    def get_hash(value: typing.Union[str, bytes], hash_type: str) -> str:
+        """Generate a hash of type *hash_type* for a given value.
 
         Args:
-            string (str): String to calculate the hash on.
+            value: String or bytes object to calculate the hash on.
+            hash_type: Hash type to use, can be any of 'md5', 'sha1', 'sha256', 'sha512'.
 
         Returns:
             str: Returns the calculated hash as a string.
         """
-        _string = string.encode('utf-8')
+        if hash_type not in ('md5', 'sha1', 'sha256', 'sha512'):
+            raise ValueError(f'Invalid hash type requested - "{hash_type}"')
+
+        if isinstance(value, str):
+            _value = value.encode('utf-8')
+        else:
+            _value = value
+
+        hash_algo = getattr(hashlib, hash_type)
 
-        return hashlib.sha256(_string).hexdigest()
+        return hash_algo(_value).hexdigest()
 
     def traverse_multipart(self, msg: email.message.Message, counter: int = 0) -> typing.Dict[str, typing.Any]:
         """Recursively traverses all e-mail message multi-part elements and returns in a parsed form as a dict.
diff --git a/tests/test_emlparser.py b/tests/test_emlparser.py
index b6c9c7e..a5d2bce 100644
--- a/tests/test_emlparser.py
+++ b/tests/test_emlparser.py
@@ -97,8 +97,8 @@ def test_get_file_hash(self):
         assert eml_parser.EmlParser.get_file_hash(raw_email) == pre_computed_hashes
 
     def test_wrap_hash_sha256(self):
-        assert eml_parser.EmlParser.wrap_hash_sha256(
-            'www.example.com') == '80fc0fb9266db7b83f85850fa0e6548b6d70ee68c8b5b412f1deea6ebdef0404'
+        assert eml_parser.EmlParser.get_hash(
+            'www.example.com', 'sha256') == '80fc0fb9266db7b83f85850fa0e6548b6d70ee68c8b5b412f1deea6ebdef0404'
 
     def test_get_uri_ondata(self):
         test_urls = '''Lorem ipsum dolor sit amet, consectetur adipiscing elit.
@@ -467,12 +467,12 @@ def test_parse_email_bad_message_id(self):
         with sample_1.open('rb') as fhdl:
             output_1 = ep.decode_email_bytes(fhdl.read())
 
-        assert output_1['header']['header']['message-id'] == ['id@domain.com']
+        assert output_1['header']['header']['message-id'] == ['<id@domain.com>']
 
         with sample_2.open('rb') as fhdl:
             output_2 = ep.decode_email_bytes(fhdl.read())
 
-        assert output_2['header']['header']['message-id'] == ['id@domain.com']
+        assert output_2['header']['header']['message-id'] == ['<id@domain.com>']
 
         with sample_3.open('rb') as fhdl:
             output_3 = ep.decode_email_bytes(fhdl.read())