From 12f5d25ce43a8123073280f06811308589a9eedc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Orze=C5=82?= <kuba.orzel@hotmail.com>
Date: Fri, 12 Apr 2024 12:21:39 +0200
Subject: [PATCH 1/6] =?UTF-8?q?Zast=C4=85pienie=20biblioteki=20'talon',=20?=
 =?UTF-8?q?jedynie=20niezb=C4=99dnymi=20funkcjami=20(z=20w/w=20biblioteki)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                             |   2 +
 extract_raw_content/__init__.py        |   0
 extract_raw_content/constants.py       | 154 ++++++++++++++
 extract_raw_content/html.py            | 280 +++++++++++++++++++++++++
 extract_raw_content/html_quotations.py | 224 ++++++++++++++++++++
 extract_raw_content/text.py            | 131 ++++++++++++
 extract_raw_content/utils.py           |  84 ++++++++
 mail_parser.py                         |  13 +-
 requirements.txt                       |  37 +++-
 9 files changed, 914 insertions(+), 11 deletions(-)
 create mode 100644 extract_raw_content/__init__.py
 create mode 100644 extract_raw_content/constants.py
 create mode 100644 extract_raw_content/html.py
 create mode 100644 extract_raw_content/html_quotations.py
 create mode 100644 extract_raw_content/text.py
 create mode 100644 extract_raw_content/utils.py

diff --git a/.gitignore b/.gitignore
index 4922fa4..7cd7ba0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 __pycache__/
 .env
 .idea
+.vscode/
+venv/
diff --git a/extract_raw_content/__init__.py b/extract_raw_content/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/extract_raw_content/constants.py b/extract_raw_content/constants.py
new file mode 100644
index 0000000..f490ed7
--- /dev/null
+++ b/extract_raw_content/constants.py
@@ -0,0 +1,154 @@
+import regex as re
+
+MAX_LINES_COUNT = 1000  # line limit applied by the text and html extractors
+SPLITTER_MAX_LINES = 6  # a splitter is searched for in a window of this many lines
+_MAX_TAGS_COUNT = 419  # html with more '<' characters than this is not parsed
+_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
+_HARDBREAKS = ['br', 'hr', 'tr']
+_RE_EXCESSIVE_NEWLINES = re.compile(r"\n{2,10}")
+
+QUOT_PATTERN = re.compile('^>+ ?')
+RE_PARENTHESIS_LINK = re.compile(r"\(https?://")
+RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')
+RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
+RE_DELIMITER = re.compile(r'\r?\n')
+RE_LINK = re.compile('<(http://[^>]*)>')
+RE_ON_DATE_SMB_WROTE = re.compile(
+    r'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+        # Beginning of the line
+        u'|'.join((
+            # English
+            'On',
+            # French
+            'Le',
+            # Polish
+            'W dniu',
+            # Dutch
+            'Op',
+            # German
+            'Am',
+            # Norwegian
+            u'På',
+            # Swedish, Danish
+            'Den',
+            # Vietnamese
+            u'Vào',
+        )),
+        # Date and sender separator
+        u'|'.join((
+            # most languages separate date and sender address by comma
+            ',',
+            # polish date and sender address separator
+            u'użytkownik'
+        )),
+        # Ending of the line
+        u'|'.join((
+            # English
+            'wrote', 'sent',
+            # French
+            u'a écrit',
+            # Polish
+            u'napisał',
+            # Dutch
+            'schreef','verzond','geschreven',
+            # German
+            'schrieb',
+            # Norwegian, Swedish
+            'skrev',
+            # Vietnamese
+            u'đã viết',
+        ))
+    ))
+RE_ORIGINAL_MESSAGE = re.compile(r'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
+    u'|'.join((
+        # English
+        'Original Message', 'Reply Message',
+        # German
+        u'Ursprüngliche Nachricht', 'Antwort Nachricht',
+        # Danish
+        'Oprindelig meddelelse',
+    ))), re.I)
+RE_ON_DATE_WROTE_SMB = re.compile(
+    r'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
+        # Beginning of the line
+        u'|'.join((
+            'Op',
+            # German
+            'Am'
+        )),
+        # Ending of the line
+        u'|'.join((
+            # Dutch
+            'schreef','verzond','geschreven',
+            # German
+            'schrieb'
+        ))
+    )
+    )
+
+RE_FROM_COLON_OR_DATE_COLON = re.compile(r'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format(
+    u'|'.join((
+        # "From" in different languages.
+        'From', 'Van', 'De', 'Von', 'Fra', u'Från',
+        # "Date" in different languages.
+        'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
+    ))), re.I)
+RE_ANDROID_WROTE = re.compile(r'[\s]*[-]+.*({})[ ]*[-]+'.format(
+    u'|'.join((
+        # English
+        'wrote',
+    ))), re.I)
+RE_POLYMAIL = re.compile(r'On.*\s{2}<\smailto:.*\s> wrote:', re.I)
+RE_QUOTATION = re.compile(
+    r'''
+    (
+        # quotation border: splitter line or a number of quotation marker lines
+        (?:
+            s
+            |
+            (?:me*){2,}
+        )
+
+        # quotation lines could be marked as splitter or text, etc.
+        .*
+
+        # but we expect it to end with a quotation marker line
+        me*
+    )
+
+    # after quotations should be text only or nothing at all
+    [te]*$
+    ''', re.VERBOSE)
+RE_EMPTY_QUOTATION = re.compile(
+    r'''
+    (
+        # quotation border: splitter line or a number of quotation marker lines
+        (?:
+            (?:se*)+
+            |
+            (?:me*){2,}
+        )
+    )
+    e*
+    ''', re.VERBOSE)
+
+SPLITTER_PATTERNS = [
+    RE_ORIGINAL_MESSAGE,
+    RE_ON_DATE_SMB_WROTE,
+    RE_ON_DATE_WROTE_SMB,
+    RE_FROM_COLON_OR_DATE_COLON,
+    # 02.04.2012 14:20 пользователь "bob@example.com" <
+    # bob@xxx.mailgun.org> написал:
+    re.compile(r"(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
+    # 2014-10-17 11:28 GMT+03:00 Bob <
+    # bob@example.com>:
+    re.compile(r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
+    # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
+    re.compile(r'\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
+               r'( \S+){3,6}@\S+:'),
+    # Sent from Samsung MobileName <address@example.com> wrote:
+    re.compile('Sent from Samsung .*@.*> wrote'),
+    RE_ANDROID_WROTE,
+    RE_POLYMAIL
+    ]
+
diff --git a/extract_raw_content/html.py b/extract_raw_content/html.py
new file mode 100644
index 0000000..10ccd93
--- /dev/null
+++ b/extract_raw_content/html.py
@@ -0,0 +1,280 @@
+import regex as re
+import six
+import html5lib
+from lxml import html
+from lxml.html import html5parser
+from lxml.cssselect import CSSSelector
+from copy import deepcopy
+
+from . import constants as const
+from . import utils
+from . import html_quotations
+
+
+def process_marked_lines(lines, markers, return_flags=None):
+    """Strip quotations from a message using its line markers.
+
+    Returns only the lines of the last message, e.g.
+    >>> process_marked_lines(['Hello', 'From: foo@bar.com', '', '> Hi'], 'tsem')
+    ['Hello']
+
+    If given, return_flags is overwritten in place with
+    [were_lines_deleted, first_deleted_line, last_deleted_line].
+    """
+    return_flags = [] if return_flags is None else return_flags
+    markers = ''.join(markers)
+    # if there are no splitter there should be no markers
+    if 's' not in markers and not re.search('(me*){3}', markers):
+        markers = markers.replace('m', 't')
+    if re.match('[te]*f', markers):
+        return_flags[:] = [False, -1, -1]
+        return lines
+    # inlined reply
+    # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
+    # both 't' entries should be found
+    for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
+        # long links could break sequence of quotation lines but they shouldn't
+        # be considered an inline reply
+        links = (
+            const.RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1]) or
+            const.RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip()))
+        if not links:
+            return_flags[:] = [False, -1, -1]
+            return lines
+
+    # cut out text lines coming after splitter if there are no markers there
+    quotation = re.search('(se*)+((t|f)+e*)+', markers)
+    if quotation:
+        return_flags[:] = [True, quotation.start(), len(lines)]
+        return lines[:quotation.start()]
+
+    # handle the case with markers
+    quotation = (const.RE_QUOTATION.search(markers) or
+                 const.RE_EMPTY_QUOTATION.search(markers))
+    if quotation:
+        return_flags[:] = True, quotation.start(1), quotation.end(1)
+        return lines[:quotation.start(1)] + lines[quotation.end(1):]
+
+    return_flags[:] = [False, -1, -1]
+    return lines
+
+
+def mark_message_lines(lines):
+    """Mark message lines with markers to distinguish quotation lines.
+
+    Markers:
+    * e - empty line
+    * f - forwarded message delimiter line
+    * m - line that starts with quotation marker '>'
+    * s - splitter line
+    * t - presumably lines from the last message in the conversation
+
+    >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question'])
+    'tsem'
+    """
+    markers = ['e' for _ in lines]
+    i = 0
+    while i < len(lines):
+        if not lines[i].strip():
+            markers[i] = 'e'  # empty line
+        elif const.QUOT_PATTERN.match(lines[i]):
+            markers[i] = 'm'  # line with quotation marker
+        elif const.RE_FWD.match(lines[i]):
+            markers[i] = 'f'  # ---- Forwarded message ----
+        else:
+            # in case splitter is spread across several lines
+            splitter = utils.is_splitter('\n'.join(lines[i:i + const.SPLITTER_MAX_LINES]))
+
+            if splitter:
+                # append as many splitter markers as lines in splitter
+                splitter_lines = splitter.group().splitlines()
+                for j in range(len(splitter_lines)):
+                    markers[i + j] = 's'
+
+                # skip splitter lines
+                i += len(splitter_lines) - 1
+            else:
+                # probably the line from the last message in the conversation
+                markers[i] = 't'
+        i += 1
+    return ''.join(markers)  # one marker character per input line
+
+
+def _html5lib_parser():
+    """
+    Build an html5lib parser that produces lxml trees.
+
+    html5lib is a pure-python library that conforms to the WHATWG HTML spec
+    and is not vulnerable to certain attacks common for XML libraries
+    """
+    lxml_builder = html5lib.treebuilders.getTreeBuilder("lxml")
+
+    # remove namespace value from inside lxml.html.html5parser element tag
+    # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
+    # instead of "div", throwing the algo off
+    return html5lib.HTMLParser(lxml_builder, namespaceHTMLElements=False)
+
+
+def _rm_excessive_newlines(s):
+    """Collapse newline runs (often caused by tons of divs) to one blank line and strip."""
+    collapsed = const._RE_EXCESSIVE_NEWLINES.sub("\n\n", s)
+    return collapsed.strip()
+
+
+def _encode_utf8(s):
+    """Return s as UTF-8 bytes when it is text; pass anything else through."""
+    is_text = isinstance(s, six.text_type)
+    return s.encode('utf-8') if is_text else s
+
+
+def html_too_big(s):
+    """Tell whether s holds more '<' characters than const._MAX_TAGS_COUNT."""
+    raw = s.encode('utf8') if isinstance(s, six.text_type) else s
+    return raw.count(b'<') > const._MAX_TAGS_COUNT
+
+
+def html_document_fromstring(s):
+    """Parse html tree from string.
+
+    Return None if the string is too big (html_too_big) or can't be parsed.
+    """
+    payload = s.encode('utf8') if isinstance(s, six.text_type) else s
+    try:
+        if html_too_big(payload):
+            return None
+        return html5parser.document_fromstring(payload, parser=_html5lib_parser())
+    except Exception:  # best effort: any failure means there is no tree
+        return None
+
+
+def html_tree_to_text(tree):
+    """Render an html tree as UTF-8 encoded text; drops styles and comments."""
+    # drop style elements and comments from the tree before collecting text
+    for style_el in CSSSelector('style')(tree):
+        style_el.getparent().remove(style_el)
+    for comment in tree.xpath('//comment()'):
+        owner = comment.getparent()
+        # comment with no parent does not impact produced text
+        if owner is not None:
+            owner.remove(comment)
+    text = ""
+    for node in tree.iter():
+        content = (node.text or '') + (node.tail or '')
+        if len(content) > 1:
+            if node.tag in const._BLOCKTAGS:
+                text += "\n"
+            if node.tag == 'li':
+                text += "  * "
+            text += content.strip() + " "
+
+            # add href to the output
+            target = node.attrib.get('href')
+            if target:
+                text += "(%s) " % target
+        if node.tag in const._HARDBREAKS and text and not text.endswith("\n"):
+            text += "\n"
+    return _encode_utf8(_rm_excessive_newlines(text))
+
+
+def _readable_text_empty(html_tree):  # True when the tree renders to no text
+    return len(html_tree_to_text(html_tree).strip()) == 0
+
+
+def _extract_from_html(msg_body):
+    """
+    Extract not quoted message from provided html message body (bytes)
+    using tags and plain text algorithm.
+
+    Cut out the 'blockquote', 'gmail_quote' tags.
+    Cut Microsoft quotations.
+    Then use plain text algorithm to cut out splitter or
+    leftover quotation.
+    This works by adding checkpoint text to all html tags,
+    then converting html to text,
+    then extracting quotations from text,
+    then checking deleted checkpoints,
+    then deleting necessary tags.
+    msg_body is returned unchanged whenever nothing can or should be cut.
+    """
+    if msg_body.strip() == b'':
+        return msg_body
+
+    msg_body = msg_body.replace(b'\r\n', b'\n')
+    html_tree = html_document_fromstring(msg_body)
+
+    if html_tree is None:
+        return msg_body  # too big or could not be parsed
+    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
+                      html_quotations.cut_zimbra_quote(html_tree) or
+                      html_quotations.cut_blockquote(html_tree) or
+                      html_quotations.cut_microsoft_quote(html_tree) or
+                      html_quotations.cut_by_id(html_tree) or
+                      html_quotations.cut_from_block(html_tree)
+                      )
+    html_tree_copy = deepcopy(html_tree)  # snapshot taken before checkpoints are added
+
+    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
+    quotation_checkpoints = [False] * number_of_checkpoints
+    plain_text = html_tree_to_text(html_tree)
+    plain_text = utils.preprocess(plain_text, '\n', content_type='text/html')
+    lines = plain_text.splitlines()
+    # Don't process too long messages
+    if len(lines) > const.MAX_LINES_COUNT:
+        return msg_body
+    # Collect checkpoints on each line
+    line_checkpoints = [
+        [int(i[4:-4])  # Only checkpoint number
+            for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)]
+        for line in lines]
+    # Remove checkpoints
+    lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line)
+             for line in lines]
+
+    # Use plain text quotation extracting algorithm
+    markers = mark_message_lines(lines)
+    return_flags = []  # filled in place by process_marked_lines
+    process_marked_lines(lines, markers, return_flags)
+    lines_were_deleted, first_deleted, last_deleted = return_flags
+
+    if not lines_were_deleted and not cut_quotations:
+        return msg_body
+    if lines_were_deleted:
+        #collect checkpoints from deleted lines
+        for i in range(first_deleted, last_deleted):
+            for checkpoint in line_checkpoints[i]:
+                quotation_checkpoints[checkpoint] = True
+        # Remove tags with quotation checkpoints
+        html_quotations.delete_quotation_tags(
+            html_tree_copy, 0, quotation_checkpoints
+        )
+    if _readable_text_empty(html_tree_copy):
+        return msg_body  # cutting would leave no readable text
+    return html.tostring(html_tree_copy)
+
+def extract_from_html(msg_body):
+    """
+    Extract not quoted message from provided html message body
+    using tags and plain text algorithm.
+
+    Accepts text (sent on as UTF-8), bytes, or any other object that has
+    an encode() method (encoded as ASCII).
+
+    Known quotation tags ('blockquote', 'gmail_quote', Microsoft
+    quotations) are cut out first; the plain text algorithm then cuts
+    out a splitter or leftover quotation.  It works by adding checkpoint
+    text to all html tags, converting html to text, extracting
+    quotations from text, checking deleted checkpoints and deleting the
+    necessary tags.
+
+    Bytes produced by the extraction are decoded as UTF-8 before being
+    returned.
+    """
+    if not isinstance(msg_body, bytes):
+        # text is sent as UTF-8, any other encodable object as ASCII
+        is_text = isinstance(msg_body, six.text_type)
+        msg_body = msg_body.encode('utf8' if is_text else 'ascii')
+
+    extracted = _extract_from_html(msg_body)
+    if not isinstance(extracted, bytes):
+        return extracted
+    return extracted.decode('utf8')
diff --git a/extract_raw_content/html_quotations.py b/extract_raw_content/html_quotations.py
new file mode 100644
index 0000000..08ce578
--- /dev/null
+++ b/extract_raw_content/html_quotations.py
@@ -0,0 +1,224 @@
+import regex as re
+from lxml.cssselect import CSSSelector
+
+
+CHECKPOINT_PREFIX = '#!%!'  # text placed before a checkpoint number
+CHECKPOINT_SUFFIX = '!%!#'  # text placed after a checkpoint number
+CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + r'\d+' + CHECKPOINT_SUFFIX)
+# HTML quote indicators (tag ids)
+QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
+RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
+
+
+def cssselect(expr, tree):  # apply the CSS selector expr to tree
+    return CSSSelector(expr)(tree)
+
+
+def add_checkpoint(html_note, counter):
+    """Recursively adds checkpoints to html tree.
+
+    A checkpoint (CHECKPOINT_PREFIX + number + CHECKPOINT_SUFFIX) is appended
+    to the text and to the tail of html_note and of each descendant, numbered
+    consecutively starting at counter.
+
+    Returns the first number that was not used.
+    """
+    def checkpoint(number):
+        return CHECKPOINT_PREFIX + str(number) + CHECKPOINT_SUFFIX
+
+    # checkpoint closing this node's own text
+    html_note.text = (html_note.text or '') + checkpoint(counter)
+    counter += 1
+
+    for descendant in html_note.iterchildren():
+        counter = add_checkpoint(descendant, counter)
+
+    # checkpoint closing the text that follows this node
+    html_note.tail = (html_note.tail or '') + checkpoint(counter)
+    return counter + 1
+
+
+def delete_quotation_tags(html_note, counter, quotation_checkpoints):
+    """Deletes tags with quotation checkpoints from html tree.
+
+    counter is the checkpoint number of html_note's text; numbering follows
+    the order used by add_checkpoint (text, children, tail).
+
+    Returns (next_counter, tag_in_quotation); a tag is in quotation when both
+    its text and its tail checkpoints are flagged in quotation_checkpoints.
+    """
+    text_quoted = quotation_checkpoints[counter]
+    if text_quoted:
+        html_note.text = ''
+    counter += 1
+
+    # Children tags which are in quotation.
+    quoted_children = []
+    for child in html_note.iterchildren():
+        counter, child_quoted = delete_quotation_tags(
+            child, counter, quotation_checkpoints)
+        if child_quoted:
+            quoted_children.append(child)
+
+    tail_quoted = quotation_checkpoints[counter]
+    if tail_quoted:
+        html_note.tail = ''
+    counter += 1
+
+    tag_in_quotation = bool(text_quoted and tail_quoted)
+    if not tag_in_quotation:
+        # Remove quotation children.
+        for child in quoted_children:
+            html_note.remove(child)
+    return counter, tag_in_quotation
+
+
+def cut_gmail_quote(html_message):
+    ''' Cuts the first div.gmail_quote unless its text is a forwarded-message line. '''
+    matches = cssselect('div.gmail_quote', html_message)
+    if matches and not RE_FWD.match(matches[0].text or ''):
+        matches[0].getparent().remove(matches[0])
+        return True
+
+
+def cut_microsoft_quote(html_message):
+    ''' Cuts splitter block and all following sibling blocks; returns True if cut. '''
+    splitter = html_message.xpath(
+        #outlook 2007, 2010 (international)
+        "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
+        "padding:3.0pt 0cm 0cm 0cm']|"
+        #outlook 2007, 2010 (american)
+        "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
+        "padding:3.0pt 0in 0in 0in']|"
+        #outlook 2013 (international)
+        "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;"
+        "padding:3.0pt 0cm 0cm 0cm']|"
+        #outlook 2013 (american)
+        "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;"
+        "padding:3.0pt 0in 0in 0in']|"
+        #windows mail
+        "//div[@style='padding-top: 5px; "
+        "border-top-color: rgb(229, 229, 229); "
+        "border-top-width: 1px; border-top-style: solid;']"
+    )
+
+    if splitter:
+        splitter = splitter[0]
+        #outlook 2010
+        if splitter == splitter.getparent().getchildren()[0]:
+            splitter = splitter.getparent()  # first child: cut from the parent instead
+    else:
+        #outlook 2003
+        splitter = html_message.xpath(
+            "//div"
+            "/div[@class='MsoNormal' and @align='center' "
+            "and @style='text-align:center']"
+            "/font"
+            "/span"
+            "/hr[@size='3' and @width='100%' and @align='center' "
+            "and @tabindex='-1']"
+        )
+        if len(splitter):
+            splitter = splitter[0]
+            splitter = splitter.getparent().getparent()  # hr -> span -> font
+            splitter = splitter.getparent().getparent()  # -> div.MsoNormal -> outer div
+
+    # NOTE(review): for an lxml element len() presumably counts children, so a
+    # childless splitter would not be cut here - confirm this is intended
+    if len(splitter):
+        parent = splitter.getparent()
+        after_splitter = splitter.getnext()
+        while after_splitter is not None:
+            parent.remove(after_splitter)
+            after_splitter = splitter.getnext()
+        parent.remove(splitter)
+        return True
+
+
+def cut_by_id(html_message):
+    """Remove the first element matching each id in QUOTE_IDS; True if any removed."""
+    removed = False
+    for quote_id in QUOTE_IDS:
+        for node in cssselect('#{}'.format(quote_id), html_message)[:1]:
+            removed = True
+            node.getparent().remove(node)
+    return removed
+
+
+def cut_blockquote(html_message):
+    ''' Cuts the last non-nested blockquote with wrapping elements.'''
+    candidates = html_message.xpath(
+        '(.//blockquote)'
+        '[not(@class="gmail_quote") and not(ancestor::blockquote)]'
+        '[last()]')
+    if not candidates:
+        return None
+    blockquote = candidates[0]
+    blockquote.getparent().remove(blockquote)
+    return True
+
+
+def cut_from_block(html_message):
+    """Cuts div tag which wraps block starting with "From:" (or "Date:")."""
+    # handle the case when From: block is enclosed in some tag
+    block = html_message.xpath(
+        ("//*[starts-with(mg:text_content(), 'From:')]|"
+         "//*[starts-with(mg:text_content(), 'Date:')]"))
+
+    if block:
+        block = block[-1]  # last element returned by the xpath
+        parent_div = None
+        while block.getparent() is not None:  # climb to the nearest enclosing div
+            if block.tag == 'div':
+                parent_div = block
+                break
+            block = block.getparent()
+        if parent_div is not None:
+            maybe_body = parent_div.getparent()
+            # In cases where removing this enclosing div will remove all
+            # content, we should assume the quote is not enclosed in a tag.
+            parent_div_is_all_content = (
+                maybe_body is not None and maybe_body.tag == 'body' and
+                len(maybe_body.getchildren()) == 1)
+
+            if not parent_div_is_all_content:
+                parent = block.getparent()
+                next_sibling = block.getnext()
+
+                # remove all tags after found From block
+                # (From block and quoted message are in separate divs)
+                while next_sibling is not None:
+                    parent.remove(block)
+                    block = next_sibling
+                    next_sibling = block.getnext()
+
+                # remove the last sibling (or the
+                # From block if no siblings)
+                if block is not None:
+                    parent.remove(block)
+
+                return True
+        else:
+            return False  # no enclosing div: the tail-based lookup below is skipped
+    # handle the case when From: block goes right after e.g. <hr>
+    # and not enclosed in some tag
+    block = html_message.xpath(
+        ("//*[starts-with(mg:tail(), 'From:')]|"
+         "//*[starts-with(mg:tail(), 'Date:')]"))
+    if block:
+        block = block[0]  # first element returned by the xpath
+
+        if RE_FWD.match(block.getparent().text or ''):  # forwarded message: keep it
+            return False
+        
+        while(block.getnext() is not None):
+            block.getparent().remove(block.getnext())
+        block.getparent().remove(block)
+        return True
+
+
+def cut_zimbra_quote(html_message):
+    """Remove the first hr whose data-marker is __DIVIDER__; True when one was found."""
+    for divider in html_message.xpath('//hr[@data-marker="__DIVIDER__"]')[:1]:
+        divider.getparent().remove(divider)
+        return True
diff --git a/extract_raw_content/text.py b/extract_raw_content/text.py
new file mode 100644
index 0000000..6023368
--- /dev/null
+++ b/extract_raw_content/text.py
@@ -0,0 +1,131 @@
+import regex as re
+
+from . import constants as const
+from . import utils
+
+
+def get_delimiter(msg_body):
+    """Return the first line break (LF or CRLF) found in msg_body.
+
+    Falls back to LF when msg_body contains no line break.
+    """
+    line_break = const.RE_DELIMITER.search(msg_body)
+    return line_break.group() if line_break else '\n'
+
+
+def mark_message_lines(lines):
+    """Label every message line with a one-character marker.
+
+    Markers:
+    * e - empty line
+    * f - forwarded message delimiter line
+    * m - line that starts with quotation marker '>'
+    * s - splitter line
+    * t - presumably lines from the last message in the conversation
+
+    Returns the markers joined into a single string.
+
+    >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question'])
+    'tsem'
+    """
+    markers = ['e'] * len(lines)
+    position = 0
+    while position < len(lines):
+        line = lines[position]
+        consumed = 1
+        if not line.strip():
+            markers[position] = 'e'  # empty line
+        elif const.QUOT_PATTERN.match(line):
+            markers[position] = 'm'  # line with quotation marker
+        elif const.RE_FWD.match(line):
+            markers[position] = 'f'  # ---- Forwarded message ----
+        else:
+            # the splitter may be spread across several lines
+            window = lines[position:position + const.SPLITTER_MAX_LINES]
+            splitter = utils.is_splitter('\n'.join(window))
+            if splitter is None:
+                # probably the line from the last message in the conversation
+                markers[position] = 't'
+            else:
+                # one splitter marker per line of the matched splitter
+                consumed = len(splitter.group().splitlines())
+                for offset in range(consumed):
+                    markers[position + offset] = 's'
+        position += consumed
+    return ''.join(markers)
+
+
+def process_marked_lines(lines, markers, return_flags=None):
+    """Strip quotations from a message using its line markers.
+
+    Returns only the lines of the last message, e.g.
+    >>> process_marked_lines(['Hello', 'From: foo@bar.com', '', '> Hi'], 'tsem')
+    ['Hello']
+
+    If given, return_flags is overwritten in place with
+    [were_lines_deleted, first_deleted_line, last_deleted_line].
+    """
+    return_flags = [] if return_flags is None else return_flags
+    markers = ''.join(markers)
+    # if there are no splitter there should be no markers
+    if 's' not in markers and not re.search('(me*){3}', markers):
+        markers = markers.replace('m', 't')
+
+    if re.match('[te]*f', markers):
+        return_flags[:] = [False, -1, -1]
+        return lines
+
+    # inlined reply
+    # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
+    # both 't' entries should be found
+    for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
+        # long links could break sequence of quotation lines but they shouldn't
+        # be considered an inline reply
+        links = (
+            const.RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1]) or
+            const.RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip()))
+        if not links:
+            return_flags[:] = [False, -1, -1]
+            return lines
+
+    # cut out text lines coming after splitter if there are no markers there
+    quotation = re.search('(se*)+((t|f)+e*)+', markers)
+    if quotation:
+        return_flags[:] = [True, quotation.start(), len(lines)]
+        return lines[:quotation.start()]
+
+    # handle the case with markers
+    quotation = (const.RE_QUOTATION.search(markers) or
+                 const.RE_EMPTY_QUOTATION.search(markers))
+
+    if quotation:
+        return_flags[:] = True, quotation.start(1), quotation.end(1)
+        return lines[:quotation.start(1)] + lines[quotation.end(1):]
+
+    return_flags[:] = [False, -1, -1]
+    return lines
+
+
+def postprocess(msg_body):
+    """Undo the link normalization done while preprocessing the message.
+
+    Turns '@@link@@' back into '<link>' and strips surrounding whitespace.
+    """
+    return const.RE_NORMALIZED_LINK.sub(r'<\1>', msg_body).strip()
+
+
+def extract_from_plain(msg_body):
+    """Extracts a non quoted message from provided plain text.
+
+    Only the first const.MAX_LINES_COUNT lines of msg_body are processed
+    and returned.
+    """
+    delimiter = get_delimiter(msg_body)
+    msg_body = utils.preprocess(msg_body, delimiter)
+    # don't process too long messages
+    lines = msg_body.splitlines()[:const.MAX_LINES_COUNT]
+    markers = mark_message_lines(lines)
+    lines = process_marked_lines(lines, markers)
+
+    # concatenate lines, change links back, strip and return
+    return postprocess(delimiter.join(lines))
diff --git a/extract_raw_content/utils.py b/extract_raw_content/utils.py
new file mode 100644
index 0000000..cabc18a
--- /dev/null
+++ b/extract_raw_content/utils.py
@@ -0,0 +1,84 @@
+import regex as re
+from lxml import etree
+
+from . import constants as const
+
+
+def text_content(context):
+    """XPath extension function: the context node's string() value, stripped."""
+    return context.context_node.xpath("string()").strip()
+
+
+def tail(context):
+    """XPath extension function: the context node's tail text, '' if it has none."""
+    return context.context_node.tail or ''
+
+
+def register_xpath_extensions():
+    """Register text_content() and tail() as XPath functions under prefix 'mg'."""
+    mailgun_ns = etree.FunctionNamespace("http://mailgun.net")
+    mailgun_ns.prefix = 'mg'
+    mailgun_ns['text_content'], mailgun_ns['tail'] = text_content, tail
+    
+    
+def _replace_link_brackets(msg_body):
+    """
+    Rewrite '<http://...>' links as '@@http://...@@' so that the closing
+    '>' of a link cannot be mistaken for a quotation marker.
+
+    Links on a line that starts with '>' are left as they are.
+    Bytes input is decoded as UTF-8 first.
+    """
+    if isinstance(msg_body, bytes):
+        msg_body = msg_body.decode('utf8')
+
+    def link_wrapper(link):
+        # index of the first character of the line holding the link
+        line_start = msg_body[:link.start()].rfind("\n") + 1
+        if msg_body[line_start] == ">":
+            return link.group()
+        return "@@%s@@" % link.group(1)
+
+    return re.sub(const.RE_LINK, link_wrapper, msg_body)
+
+
+def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
+    """
+    Puts 'On <date> <person> wrote:' splitters on a line of their own by
+    inserting delimiter before any that follows other text ('text/plain' only).
+    """
+    if content_type != 'text/plain':
+        return msg_body
+
+    def splitter_wrapper(splitter):
+        begin = splitter.start()
+        if not begin or msg_body[begin - 1] == '\n':
+            return splitter.group()
+        return '%s%s' % (delimiter, splitter.group())
+
+    return re.sub(const.RE_ON_DATE_SMB_WROTE, splitter_wrapper, msg_body)
+
+
+def preprocess(msg_body, delimiter, content_type='text/plain'):
+    """Prepares msg_body for being stripped.
+
+    First replaces link brackets so that they can't be taken for a
+    quotation marker (this also converts msg_body into unicode), then
+    splits a line in two when an 'On <date> <person> wrote:' splitter is
+    preceded by some text on the same line.
+
+    Returns the prepared message.
+    """
+    unbracketed = _replace_link_brackets(msg_body)
+    return _wrap_splitter_with_newline(unbracketed, delimiter, content_type)
+
+
+def is_splitter(line):
+    '''
+    Returns the match object of the first pattern in
+    const.SPLITTER_PATTERNS that matches at the start of the provided
+    string, or None when no pattern matches.
+    '''
+    attempts = (re.match(pattern, line) for pattern in const.SPLITTER_PATTERNS)
+    found = (matcher for matcher in attempts if matcher)
+    return next(found, None)
diff --git a/mail_parser.py b/mail_parser.py
index d1fbcd3..b63a97a 100644
--- a/mail_parser.py
+++ b/mail_parser.py
@@ -6,12 +6,13 @@
 import re
 import uuid
 from io import BytesIO
-
 import mailparser
-import talon
 from html2text import html2text
 
-talon.init()
+from extract_raw_content.text import extract_from_plain
+from extract_raw_content.html import extract_from_html
+from extract_raw_content.utils import register_xpath_extensions
+
 
 decoder_map = {
     "base64": base64.b64decode,
@@ -26,6 +27,8 @@
 EML_MIME = "message/rfc822"
 BINARY_MIME = "application/octet-stream"
 
+register_xpath_extensions()
+
 
 def get_text(mail):
     raw_content, html_content, plain_content, html_quote, plain_quote = (
@@ -38,13 +41,13 @@ def get_text(mail):
 
     if mail.text_html:
         raw_content = "".join(mail.text_html).replace("\r\n", "\n")
-        html_content = talon.quotations.extract_from_html(raw_content)
+        html_content = extract_from_html(raw_content)
         html_quote = raw_content.replace(html_content, "")
         plain_content = html2text(html_content)
 
     if mail.text_plain or not plain_content:
         raw_content = "".join(mail.text_plain)
-        plain_content = talon.quotations.extract_from_plain(raw_content)
+        plain_content = extract_from_plain(raw_content)
         plain_quote = raw_content.replace(plain_content, "")
 
     # 'content' item holds plain_content and 'quote' item holds plain_quote
diff --git a/requirements.txt b/requirements.txt
index 3b7f9a8..cf56c79 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,34 @@
-requests==2.31.0
-mail-parser==3.15.0
+asttokens==2.4.1
+certifi==2024.2.2
+charset-normalizer==3.3.2
+cssselect==1.2.0
+decorator==5.1.1
+exceptiongroup==1.2.0
+executing==2.0.1
 html2text==2024.2.26
-# pypi talon to be replaced - last update 2017
-talon==1.4.4
-# forked talon - scikit version does not match training data
-# git+https://github.com/PiotrIw/talon.git@master
+html5lib==1.1
+idna==3.7
 ipdb==0.13.13
+ipython==8.23.0
+jedi==0.19.1
+lxml==5.2.1
+mail-parser==3.15.0
+matplotlib-inline==0.1.6
+parso==0.8.4
+pexpect==4.9.0
+prompt-toolkit==3.0.43
+ptyprocess==0.7.0
+pure-eval==0.2.2
+Pygments==2.17.2
+regex==2023.12.25
+requests==2.31.0
 sentry-sdk==1.43.0
+simplejson==3.19.2
+six==1.16.0
+stack-data==0.6.3
+tomli==2.0.1
+traitlets==5.14.2
+typing_extensions==4.11.0
+urllib3==2.2.1
+wcwidth==0.2.13
+webencodings==0.5.1

From 0ea7e7d01ce02e5d359d5ddab60ebc3b36a959d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Orze=C5=82?= <kuba.orzel@hotmail.com>
Date: Sat, 20 Apr 2024 19:26:39 +0200
Subject: [PATCH 2/6] Added lint-all changes

---
 extract_raw_content/constants.py       | 244 +++++++++++++++----------
 extract_raw_content/html.py            | 137 +++++++-------
 extract_raw_content/html_quotations.py | 125 +++++++------
 extract_raw_content/text.py            |  55 +++---
 extract_raw_content/utils.py           |  34 ++--
 mail_parser.py                         |   4 +-
 6 files changed, 332 insertions(+), 267 deletions(-)

diff --git a/extract_raw_content/constants.py b/extract_raw_content/constants.py
index f490ed7..3e07a94 100644
--- a/extract_raw_content/constants.py
+++ b/extract_raw_content/constants.py
@@ -1,106 +1,152 @@
-import regex as re
+import re
+
 
 MAX_LINES_COUNT = 1000
 SPLITTER_MAX_LINES = 6
 _MAX_TAGS_COUNT = 419
-_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
-_HARDBREAKS = ['br', 'hr', 'tr']
+_BLOCKTAGS = ["div", "p", "ul", "li", "h1", "h2", "h3"]
+_HARDBREAKS = ["br", "hr", "tr"]
 _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
 
-QUOT_PATTERN = re.compile('^>+ ?')
-RE_PARENTHESIS_LINK = re.compile("\(https?://")
-RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')
+QUOT_PATTERN = re.compile("^>+ ?")
+RE_PARENTHESIS_LINK = re.compile(r"\(https?://")
+RE_NORMALIZED_LINK = re.compile("@@(http://[^>@]*)@@")
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
-RE_DELIMITER = re.compile('\r?\n')
-RE_LINK = re.compile('<(http://[^>]*)>')
+RE_DELIMITER = re.compile("\r?\n")
+RE_LINK = re.compile("<(http://[^>]*)>")
 RE_ON_DATE_SMB_WROTE = re.compile(
-    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+    "(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)".format(
         # Beginning of the line
-        u'|'.join((
-            # English
-            'On',
-            # French
-            'Le',
-            # Polish
-            'W dniu',
-            # Dutch
-            'Op',
-            # German
-            'Am',
-            # Norwegian
-            u'På',
-            # Swedish, Danish
-            'Den',
-            # Vietnamese
-            u'Vào',
-        )),
+        "|".join(
+            (
+                # English
+                "On",
+                # French
+                "Le",
+                # Polish
+                "W dniu",
+                # Dutch
+                "Op",
+                # German
+                "Am",
+                # Norwegian
+                "På",
+                # Swedish, Danish
+                "Den",
+                # Vietnamese
+                "Vào",
+            )
+        ),
         # Date and sender separator
-        u'|'.join((
-            # most languages separate date and sender address by comma
-            ',',
-            # polish date and sender address separator
-            u'użytkownik'
-        )),
+        "|".join(
+            (
+                # most languages separate date and sender address by comma
+                ",",
+                # polish date and sender address separator
+                "użytkownik",
+            )
+        ),
         # Ending of the line
-        u'|'.join((
-            # English
-            'wrote', 'sent',
-            # French
-            u'a écrit',
-            # Polish
-            u'napisał',
-            # Dutch
-            'schreef','verzond','geschreven',
-            # German
-            'schrieb',
-            # Norwegian, Swedish
-            'skrev',
-            # Vietnamese
-            u'đã viết',
-        ))
-    ))
-RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
-    u'|'.join((
-        # English
-        'Original Message', 'Reply Message',
-        # German
-        u'Ursprüngliche Nachricht', 'Antwort Nachricht',
-        # Danish
-        'Oprindelig meddelelse',
-    ))), re.I)
+        "|".join(
+            (
+                # English
+                "wrote",
+                "sent",
+                # French
+                "a écrit",
+                # Polish
+                "napisał",
+                # Dutch
+                "schreef",
+                "verzond",
+                "geschreven",
+                # German
+                "schrieb",
+                # Norwegian, Swedish
+                "skrev",
+                # Vietnamese
+                "đã viết",
+            )
+        ),
+    )
+)
+RE_ORIGINAL_MESSAGE = re.compile(
+    r"[\s]*[-]+[ ]*({})[ ]*[-]+".format(
+        "|".join(
+            (
+                # English
+                "Original Message",
+                "Reply Message",
+                # German
+                "Ursprüngliche Nachricht",
+                "Antwort Nachricht",
+                # Danish
+                "Oprindelig meddelelse",
+            )
+        )
+    ),
+    re.I,
+)
 RE_ON_DATE_WROTE_SMB = re.compile(
-    u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
+    "(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)".format(
         # Beginning of the line
-        u'|'.join((
-        	'Op',
-        	#German
-        	'Am'
-        )),
+        "|".join(
+            (
+                "Op",
+                # German
+                "Am",
+            )
+        ),
         # Ending of the line
-        u'|'.join((
-            # Dutch
-            'schreef','verzond','geschreven',
-            # German
-            'schrieb'
-        ))
-    )
+        "|".join(
+            (
+                # Dutch
+                "schreef",
+                "verzond",
+                "geschreven",
+                # German
+                "schrieb",
+            )
+        ),
     )
+)
 
-RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format(
-    u'|'.join((
-        # "From" in different languages.
-        'From', 'Van', 'De', 'Von', 'Fra', u'Från',
-        # "Date" in different languages.
-        'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
-    ))), re.I)
-RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format(
-    u'|'.join((
-        # English
-        'wrote',
-    ))), re.I)
-RE_POLYMAIL = re.compile('On.*\s{2}<\smailto:.*\s> wrote:', re.I)
+RE_FROM_COLON_OR_DATE_COLON = re.compile(
+    "(_+\r?\n)?[\\s]*(:?[*]?{})[\\s]?:[*]?.*".format(
+        "|".join(
+            (
+                # "From" in different languages.
+                "From",
+                "Van",
+                "De",
+                "Von",
+                "Fra",
+                "Från",
+                # "Date" in different languages.
+                "Date",
+                "Datum",
+                "Envoyé",
+                "Skickat",
+                "Sendt",
+            )
+        )
+    ),
+    re.I,
+)
+RE_ANDROID_WROTE = re.compile(
+    r"[\s]*[-]+.*({})[ ]*[-]+".format(
+        "|".join(
+            (
+                # English
+                "wrote",
+            )
+        )
+    ),
+    re.I,
+)
+RE_POLYMAIL = re.compile(r"On.*\s{2}<\smailto:.*\s> wrote:", re.I)
 RE_QUOTATION = re.compile(
-    r'''
+    r"""
     (
         # quotation border: splitter line or a number of quotation marker lines
         (?:
@@ -118,9 +164,11 @@
 
     # after quotations should be text only or nothing at all
     [te]*$
-    ''', re.VERBOSE)
+    """,
+    re.VERBOSE,
+)
 RE_EMPTY_QUOTATION = re.compile(
-    r'''
+    r"""
     (
         # quotation border: splitter line or a number of quotation marker lines
         (?:
@@ -130,7 +178,9 @@
         )
     )
     e*
-    ''', re.VERBOSE)
+    """,
+    re.VERBOSE,
+)
 
 SPLITTER_PATTERNS = [
     RE_ORIGINAL_MESSAGE,
@@ -139,16 +189,16 @@
     RE_FROM_COLON_OR_DATE_COLON,
     # 02.04.2012 14:20 пользователь "bob@example.com" <
     # bob@xxx.mailgun.org> написал:
-    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
+    re.compile(r"(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
     # 2014-10-17 11:28 GMT+03:00 Bob <
     # bob@example.com>:
-    re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
+    re.compile(r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
     # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
-    re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
-               '( \S+){3,6}@\S+:'),
+    re.compile(
+        r"\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?" r"( \S+){3,6}@\S+:"
+    ),
     # Sent from Samsung MobileName <address@example.com> wrote:
-    re.compile('Sent from Samsung .*@.*> wrote'),
+    re.compile("Sent from Samsung .*@.*> wrote"),
     RE_ANDROID_WROTE,
-    RE_POLYMAIL
-    ]
-
+    RE_POLYMAIL,
+]
diff --git a/extract_raw_content/html.py b/extract_raw_content/html.py
index 10ccd93..bc1f4b6 100644
--- a/extract_raw_content/html.py
+++ b/extract_raw_content/html.py
@@ -1,14 +1,13 @@
-import regex as re
-import six
+from copy import deepcopy
+
 import html5lib
+import re
 from lxml import html
-from lxml.html import html5parser
 from lxml.cssselect import CSSSelector
-from copy import deepcopy
+from lxml.html import html5parser
 
 from . import constants as const
-from . import utils
-from . import html_quotations
+from . import html_quotations, utils
 
 
 def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
@@ -22,38 +21,39 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
     return_flags = [were_lines_deleted, first_deleted_line,
                     last_deleted_line]
     """
-    markers = ''.join(markers)
+    markers = "".join(markers)
     # if there are no splitter there should be no markers
-    if 's' not in markers and not re.search('(me*){3}', markers):
-        markers = markers.replace('m', 't')
-    if re.match('[te]*f', markers):
+    if "s" not in markers and not re.search("(me*){3}", markers):
+        markers = markers.replace("m", "t")
+    if re.match("[te]*f", markers):
         return_flags[:] = [False, -1, -1]
         return lines
     # inlined reply
     # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
     # both 't' entries should be found
-    for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
+    for inline_reply in re.finditer("(?<=m)e*((?:t+e*)+)m", markers):
         # long links could break sequence of quotation lines but they shouldn't
         # be considered an inline reply
-        links = (
-            const.RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1]) or
-            const.RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip()))
+        links = const.RE_PARENTHESIS_LINK.search(
+            lines[inline_reply.start() - 1]
+        ) or const.RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip())
         if not links:
             return_flags[:] = [False, -1, -1]
             return lines
 
     # cut out text lines coming after splitter if there are no markers there
-    quotation = re.search('(se*)+((t|f)+e*)+', markers)
+    quotation = re.search("(se*)+((t|f)+e*)+", markers)
     if quotation:
         return_flags[:] = [True, quotation.start(), len(lines)]
-        return lines[:quotation.start()]
+        return lines[: quotation.start()]
 
     # handle the case with markers
-    quotation = (const.RE_QUOTATION.search(markers) or
-                 const.RE_EMPTY_QUOTATION.search(markers))
+    quotation = const.RE_QUOTATION.search(markers) or const.RE_EMPTY_QUOTATION.search(
+        markers
+    )
     if quotation:
         return_flags[:] = True, quotation.start(1), quotation.end(1)
-        return lines[:quotation.start(1)] + lines[quotation.end(1):]
+        return lines[: quotation.start(1)] + lines[quotation.end(1) :]
 
     return_flags[:] = [False, -1, -1]
     return lines
@@ -72,32 +72,34 @@ def mark_message_lines(lines):
     >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question'])
     'tsem'
     """
-    markers = ['e' for _ in lines]
+    markers = ["e" for _ in lines]
     i = 0
     while i < len(lines):
         if not lines[i].strip():
-            markers[i] = 'e'  # empty line
+            markers[i] = "e"  # empty line
         elif const.QUOT_PATTERN.match(lines[i]):
-            markers[i] = 'm'  # line with quotation marker
+            markers[i] = "m"  # line with quotation marker
         elif const.RE_FWD.match(lines[i]):
-            markers[i] = 'f'  # ---- Forwarded message ----
+            markers[i] = "f"  # ---- Forwarded message ----
         else:
             # in case splitter is spread across several lines
-            splitter = utils.is_splitter('\n'.join(lines[i:i + const.SPLITTER_MAX_LINES]))
+            splitter = utils.is_splitter(
+                "\n".join(lines[i : i + const.SPLITTER_MAX_LINES])
+            )
 
             if splitter:
                 # append as many splitter markers as lines in splitter
                 splitter_lines = splitter.group().splitlines()
                 for j in range(len(splitter_lines)):
-                    markers[i + j] = 's'
+                    markers[i + j] = "s"
 
                 # skip splitter lines
                 i += len(splitter_lines) - 1
             else:
                 # probably the line from the last message in the conversation
-                markers[i] = 't'
+                markers[i] = "t"
         i += 1
-    return ''.join(markers)
+    return "".join(markers)
 
 
 def _html5lib_parser():
@@ -111,33 +113,30 @@ def _html5lib_parser():
         # remove namespace value from inside lxml.html.html5paser element tag
         # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
         # instead of "div", throwing the algo off
-        namespaceHTMLElements=False
+        namespaceHTMLElements=False,
     )
 
 
 def _rm_excessive_newlines(s):
-    """Remove excessive newlines that often happen due to tons of divs
-    """
+    """Remove excessive newlines that often happen due to tons of divs"""
     return const._RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip()
 
 
 def _encode_utf8(s):
-    """Encode in 'utf-8' if unicode
-    """
-    return s.encode('utf-8') if isinstance(s, six.text_type) else s
+    """Encode in 'utf-8' if unicode"""
+    return s.encode("utf-8") if isinstance(s, str) else s
 
 
 def html_too_big(s):
-    if isinstance(s, six.text_type):
-        s = s.encode('utf8')
-    return s.count(b'<') > const._MAX_TAGS_COUNT
+    if isinstance(s, str):
+        s = s.encode("utf8")
+    return s.count(b"<") > const._MAX_TAGS_COUNT
 
 
 def html_document_fromstring(s):
-    """Parse html tree from string. Return None if the string can't be parsed.
-    """
-    if isinstance(s, six.text_type):
-        s = s.encode('utf8')
+    """Parse html tree from string. Return None if the string can't be parsed."""
+    if isinstance(s, str):
+        s = s.encode("utf8")
     try:
         if html_too_big(s):
             return None
@@ -148,9 +147,9 @@ def html_document_fromstring(s):
 
 
 def html_tree_to_text(tree):
-    for style in CSSSelector('style')(tree):
+    for style in CSSSelector("style")(tree):
         style.getparent().remove(style)
-    for c in tree.xpath('//comment()'):
+    for c in tree.xpath("//comment()"):
         parent = c.getparent()
         # comment with no parent does not impact produced text
         if parent is None:
@@ -158,16 +157,16 @@ def html_tree_to_text(tree):
         parent.remove(c)
     text = ""
     for el in tree.iter():
-        el_text = (el.text or '') + (el.tail or '')
+        el_text = (el.text or "") + (el.tail or "")
         if len(el_text) > 1:
             if el.tag in const._BLOCKTAGS:
                 text += "\n"
-            if el.tag == 'li':
+            if el.tag == "li":
                 text += "  * "
             text += el_text.strip() + " "
 
             # add href to the output
-            href = el.attrib.get('href')
+            href = el.attrib.get("href")
             if href:
                 text += "(%s) " % href
         if el.tag in const._HARDBREAKS and text and not text.endswith("\n"):
@@ -196,39 +195,42 @@ def _extract_from_html(msg_body):
     then checking deleted checkpoints,
     then deleting necessary tags.
     """
-    if msg_body.strip() == b'':
+    if msg_body.strip() == b"":
         return msg_body
 
-    msg_body = msg_body.replace(b'\r\n', b'\n')
+    msg_body = msg_body.replace(b"\r\n", b"\n")
     html_tree = html_document_fromstring(msg_body)
 
     if html_tree is None:
         return msg_body
-    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
-                      html_quotations.cut_zimbra_quote(html_tree) or
-                      html_quotations.cut_blockquote(html_tree) or
-                      html_quotations.cut_microsoft_quote(html_tree) or
-                      html_quotations.cut_by_id(html_tree) or
-                      html_quotations.cut_from_block(html_tree)
-                      )
+    cut_quotations = (
+        html_quotations.cut_gmail_quote(html_tree)
+        or html_quotations.cut_zimbra_quote(html_tree)
+        or html_quotations.cut_blockquote(html_tree)
+        or html_quotations.cut_microsoft_quote(html_tree)
+        or html_quotations.cut_by_id(html_tree)
+        or html_quotations.cut_from_block(html_tree)
+    )
     html_tree_copy = deepcopy(html_tree)
 
     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
     quotation_checkpoints = [False] * number_of_checkpoints
     plain_text = html_tree_to_text(html_tree)
-    plain_text = utils.preprocess(plain_text, '\n', content_type='text/html')
+    plain_text = utils.preprocess(plain_text, "\n", content_type="text/html")
     lines = plain_text.splitlines()
     # Don't process too long messages
     if len(lines) > const.MAX_LINES_COUNT:
         return msg_body
     # Collect checkpoints on each line
     line_checkpoints = [
-        [int(i[4:-4])  # Only checkpoint number
-            for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)]
-        for line in lines]
+        [
+            int(i[4:-4])  # Only checkpoint number
+            for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)
+        ]
+        for line in lines
+    ]
     # Remove checkpoints
-    lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line)
-             for line in lines]
+    lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, "", line) for line in lines]
 
     # Use plain text quotation extracting algorithm
     markers = mark_message_lines(lines)
@@ -239,18 +241,17 @@ def _extract_from_html(msg_body):
     if not lines_were_deleted and not cut_quotations:
         return msg_body
     if lines_were_deleted:
-        #collect checkpoints from deleted lines
+        # collect checkpoints from deleted lines
         for i in range(first_deleted, last_deleted):
             for checkpoint in line_checkpoints[i]:
                 quotation_checkpoints[checkpoint] = True
         # Remove tags with quotation checkpoints
-        html_quotations.delete_quotation_tags(
-            html_tree_copy, 0, quotation_checkpoints
-        )
+        html_quotations.delete_quotation_tags(html_tree_copy, 0, quotation_checkpoints)
     if _readable_text_empty(html_tree_copy):
         return msg_body
     return html.tostring(html_tree_copy)
 
+
 def extract_from_html(msg_body):
     """
     Extract not quoted message from provided html message body
@@ -269,12 +270,12 @@ def extract_from_html(msg_body):
 
     Returns a unicode string.
     """
-    if isinstance(msg_body, six.text_type):
-        msg_body = msg_body.encode('utf8')
+    if isinstance(msg_body, str):
+        msg_body = msg_body.encode("utf8")
     elif not isinstance(msg_body, bytes):
-        msg_body = msg_body.encode('ascii')
+        msg_body = msg_body.encode("ascii")
 
     result = _extract_from_html(msg_body)
     if isinstance(result, bytes):
-        result = result.decode('utf8')
+        result = result.decode("utf8")
     return result
diff --git a/extract_raw_content/html_quotations.py b/extract_raw_content/html_quotations.py
index 08ce578..b361661 100644
--- a/extract_raw_content/html_quotations.py
+++ b/extract_raw_content/html_quotations.py
@@ -1,12 +1,11 @@
-import regex as re
+import re
 from lxml.cssselect import CSSSelector
 
-
-CHECKPOINT_PREFIX = '#!%!'
-CHECKPOINT_SUFFIX = '!%!#'
-CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)
+CHECKPOINT_PREFIX = "#!%!"
+CHECKPOINT_SUFFIX = "!%!#"
+CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + r"\d+" + CHECKPOINT_SUFFIX)
 # HTML quote indicators (tag ids)
-QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
+QUOTE_IDS = ["OLK_SRC_BODY_SECTION"]
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
 
 
@@ -15,36 +14,34 @@ def cssselect(expr, tree):
 
 
 def add_checkpoint(html_note, counter):
-    """Recursively adds checkpoints to html tree.
-    """
+    """Recursively adds checkpoints to html tree."""
     if html_note.text:
-        html_note.text = (html_note.text + CHECKPOINT_PREFIX +
-                          str(counter) + CHECKPOINT_SUFFIX)
+        html_note.text = (
+            html_note.text + CHECKPOINT_PREFIX + str(counter) + CHECKPOINT_SUFFIX
+        )
     else:
-        html_note.text = (CHECKPOINT_PREFIX + str(counter) +
-                          CHECKPOINT_SUFFIX)
+        html_note.text = CHECKPOINT_PREFIX + str(counter) + CHECKPOINT_SUFFIX
     counter += 1
 
     for child in html_note.iterchildren():
         counter = add_checkpoint(child, counter)
 
     if html_note.tail:
-        html_note.tail = (html_note.tail + CHECKPOINT_PREFIX +
-                          str(counter) + CHECKPOINT_SUFFIX)
+        html_note.tail = (
+            html_note.tail + CHECKPOINT_PREFIX + str(counter) + CHECKPOINT_SUFFIX
+        )
     else:
-        html_note.tail = (CHECKPOINT_PREFIX + str(counter) +
-                          CHECKPOINT_SUFFIX)
+        html_note.tail = CHECKPOINT_PREFIX + str(counter) + CHECKPOINT_SUFFIX
     counter += 1
     return counter
 
 
 def delete_quotation_tags(html_note, counter, quotation_checkpoints):
-    """Deletes tags with quotation checkpoints from html tree.
-    """
+    """Deletes tags with quotation checkpoints from html tree."""
     tag_in_quotation = True
 
     if quotation_checkpoints[counter]:
-        html_note.text = ''
+        html_note.text = ""
     else:
         tag_in_quotation = False
     counter += 1
@@ -52,14 +49,13 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
     quotation_children = []  # Children tags which are in quotation.
     for child in html_note.iterchildren():
         counter, child_tag_in_quotation = delete_quotation_tags(
-            child, counter,
-            quotation_checkpoints
+            child, counter, quotation_checkpoints
         )
         if child_tag_in_quotation:
             quotation_children.append(child)
 
     if quotation_checkpoints[counter]:
-        html_note.tail = ''
+        html_note.tail = ""
     else:
         tag_in_quotation = False
     counter += 1
@@ -74,29 +70,31 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
 
 
 def cut_gmail_quote(html_message):
-    ''' Cuts the outermost block element with class gmail_quote. '''
-    gmail_quote = cssselect('div.gmail_quote', html_message)
-    if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
+    """Cuts the outermost block element with class gmail_quote."""
+    gmail_quote = cssselect("div.gmail_quote", html_message)
+    if gmail_quote and (
+        gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)
+    ):
         gmail_quote[0].getparent().remove(gmail_quote[0])
         return True
 
 
 def cut_microsoft_quote(html_message):
-    ''' Cuts splitter block and all following blocks. '''
+    """Cuts splitter block and all following blocks."""
     splitter = html_message.xpath(
-        #outlook 2007, 2010 (international)
+        # outlook 2007, 2010 (international)
         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
         "padding:3.0pt 0cm 0cm 0cm']|"
-        #outlook 2007, 2010 (american)
+        # outlook 2007, 2010 (american)
         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
         "padding:3.0pt 0in 0in 0in']|"
-        #outlook 2013 (international)
+        # outlook 2013 (international)
         "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;"
         "padding:3.0pt 0cm 0cm 0cm']|"
-        #outlook 2013 (american)
+        # outlook 2013 (american)
         "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;"
         "padding:3.0pt 0in 0in 0in']|"
-        #windows mail
+        # windows mail
         "//div[@style='padding-top: 5px; "
         "border-top-color: rgb(229, 229, 229); "
         "border-top-width: 1px; border-top-style: solid;']"
@@ -104,11 +102,11 @@ def cut_microsoft_quote(html_message):
 
     if splitter:
         splitter = splitter[0]
-        #outlook 2010
+        # outlook 2010
         if splitter == splitter.getparent().getchildren()[0]:
             splitter = splitter.getparent()
     else:
-        #outlook 2003
+        # outlook 2003
         splitter = html_message.xpath(
             "//div"
             "/div[@class='MsoNormal' and @align='center' "
@@ -138,7 +136,7 @@ def cut_microsoft_quote(html_message):
 def cut_by_id(html_message):
     found = False
     for quote_id in QUOTE_IDS:
-        quote = cssselect('#{}'.format(quote_id), html_message)
+        quote = cssselect("#{}".format(quote_id), html_message)
         if quote:
             found = True
             quote[0].getparent().remove(quote[0])
@@ -146,11 +144,12 @@ def cut_by_id(html_message):
 
 
 def cut_blockquote(html_message):
-    ''' Cuts the last non-nested blockquote with wrapping elements.'''
+    """Cuts the last non-nested blockquote with wrapping elements."""
     quote = html_message.xpath(
-        '(.//blockquote)'
+        "(.//blockquote)"
         '[not(@class="gmail_quote") and not(ancestor::blockquote)]'
-        '[last()]')
+        "[last()]"
+    )
 
     if quote:
         quote = quote[0]
@@ -158,18 +157,12 @@ def cut_blockquote(html_message):
         return True
 
 
-def cut_from_block(html_message):
-    """Cuts div tag which wraps block starting with "From:"."""
-    # handle the case when From: block is enclosed in some tag
-    block = html_message.xpath(
-        ("//*[starts-with(mg:text_content(), 'From:')]|"
-         "//*[starts-with(mg:text_content(), 'Date:')]"))
-
+def block_cut_content(block):
     if block:
         block = block[-1]
         parent_div = None
         while block.getparent() is not None:
-            if block.tag == 'div':
+            if block.tag == "div":
                 parent_div = block
                 break
             block = block.getparent()
@@ -178,9 +171,10 @@ def cut_from_block(html_message):
             # In cases where removing this enclosing div will remove all
             # content, we should assume the quote is not enclosed in a tag.
             parent_div_is_all_content = (
-                maybe_body is not None and maybe_body.tag == 'body' and
-                len(maybe_body.getchildren()) == 1)
-
+                maybe_body is not None
+                and maybe_body.tag == "body"
+                and len(maybe_body.getchildren()) == 1
+            )
             if not parent_div_is_all_content:
                 parent = block.getparent()
                 next_sibling = block.getnext()
@@ -196,27 +190,44 @@ def cut_from_block(html_message):
                 # From block if no siblings)
                 if block is not None:
                     parent.remove(block)
-
                 return True
         else:
             return False
-    # handle the case when From: block goes right after e.g. <hr>
-    # and not enclosed in some tag
-    block = html_message.xpath(
-        ("//*[starts-with(mg:tail(), 'From:')]|"
-         "//*[starts-with(mg:tail(), 'Date:')]"))
+
+
+def block_cut_tail(block):
     if block:
         block = block[0]
 
-        if RE_FWD.match(block.getparent().text or ''):
+        if RE_FWD.match(block.getparent().text or ""):
             return False
-        
-        while(block.getnext() is not None):
+
+        while block.getnext() is not None:
             block.getparent().remove(block.getnext())
         block.getparent().remove(block)
         return True
 
 
+def cut_from_block(html_message):
+    """Cuts div tag which wraps block starting with "From:"."""
+    # handle the case when From: block is enclosed in some tag
+    block = html_message.xpath(
+        "//*[starts-with(mg:text_content(), 'From:')]|"
+        "//*[starts-with(mg:text_content(), 'Date:')]"
+    )
+    block_content = block_cut_content(block)
+    if isinstance(block_content, bool):
+        return block_content
+    # handle the case when From: block goes right after e.g. <hr>
+    # and not enclosed in some tag
+    block = html_message.xpath(
+        "//*[starts-with(mg:tail(), 'From:')]|" "//*[starts-with(mg:tail(), 'Date:')]"
+    )
+    block_tail = block_cut_tail(block)
+    if isinstance(block_tail, bool):
+        return block_tail
+
+
 def cut_zimbra_quote(html_message):
     zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]')
     if zDivider:
diff --git a/extract_raw_content/text.py b/extract_raw_content/text.py
index 6023368..e179a49 100644
--- a/extract_raw_content/text.py
+++ b/extract_raw_content/text.py
@@ -1,4 +1,4 @@
-import regex as re
+import re
 
 from . import constants as const
 from . import utils
@@ -9,7 +9,7 @@ def get_delimiter(msg_body):
     if delimiter:
         delimiter = delimiter.group()
     else:
-        delimiter = '\n'
+        delimiter = "\n"
     return delimiter
 
 
@@ -26,33 +26,35 @@ def mark_message_lines(lines):
     >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question'])
     'tsem'
     """
-    markers = ['e' for _ in lines]
+    markers = ["e" for _ in lines]
     i = 0
     while i < len(lines):
         if not lines[i].strip():
-            markers[i] = 'e'  # empty line
+            markers[i] = "e"  # empty line
         elif const.QUOT_PATTERN.match(lines[i]):
-            markers[i] = 'm'  # line with quotation marker
+            markers[i] = "m"  # line with quotation marker
         elif const.RE_FWD.match(lines[i]):
-            markers[i] = 'f'  # ---- Forwarded message ----
+            markers[i] = "f"  # ---- Forwarded message ----
         else:
             # in case splitter is spread across several lines
-            splitter = utils.is_splitter('\n'.join(lines[i:i + const.SPLITTER_MAX_LINES]))
+            splitter = utils.is_splitter(
+                "\n".join(lines[i : i + const.SPLITTER_MAX_LINES])
+            )
 
             if splitter:
                 # append as many splitter markers as lines in splitter
                 splitter_lines = splitter.group().splitlines()
                 for j in range(len(splitter_lines)):
-                    markers[i + j] = 's'
+                    markers[i + j] = "s"
 
                 # skip splitter lines
                 i += len(splitter_lines) - 1
             else:
                 # probably the line from the last message in the conversation
-                markers[i] = 't'
+                markers[i] = "t"
         i += 1
 
-    return ''.join(markers)
+    return "".join(markers)
 
 
 def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
@@ -66,41 +68,42 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
     return_flags = [were_lines_deleted, first_deleted_line,
                     last_deleted_line]
     """
-    markers = ''.join(markers)
+    markers = "".join(markers)
     # if there are no splitter there should be no markers
-    if 's' not in markers and not re.search('(me*){3}', markers):
-        markers = markers.replace('m', 't')
+    if "s" not in markers and not re.search("(me*){3}", markers):
+        markers = markers.replace("m", "t")
 
-    if re.match('[te]*f', markers):
+    if re.match("[te]*f", markers):
         return_flags[:] = [False, -1, -1]
         return lines
 
     # inlined reply
     # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
     # both 't' entries should be found
-    for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
+    for inline_reply in re.finditer("(?<=m)e*((?:t+e*)+)m", markers):
         # long links could break sequence of quotation lines but they shouldn't
         # be considered an inline reply
-        links = (
-            const.RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1]) or
-            const.RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip()))
+        links = const.RE_PARENTHESIS_LINK.search(
+            lines[inline_reply.start() - 1]
+        ) or const.RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip())
         if not links:
             return_flags[:] = [False, -1, -1]
             return lines
 
     # cut out text lines coming after splitter if there are no markers there
-    quotation = re.search('(se*)+((t|f)+e*)+', markers)
+    quotation = re.search("(se*)+((t|f)+e*)+", markers)
     if quotation:
         return_flags[:] = [True, quotation.start(), len(lines)]
-        return lines[:quotation.start()]
+        return lines[: quotation.start()]
 
     # handle the case with markers
-    quotation = (const.RE_QUOTATION.search(markers) or
-                 const.RE_EMPTY_QUOTATION.search(markers))
+    quotation = const.RE_QUOTATION.search(markers) or const.RE_EMPTY_QUOTATION.search(
+        markers
+    )
 
     if quotation:
         return_flags[:] = True, quotation.start(1), quotation.end(1)
-        return lines[:quotation.start(1)] + lines[quotation.end(1):]
+        return lines[: quotation.start(1)] + lines[quotation.end(1) :]
 
     return_flags[:] = [False, -1, -1]
     return lines
@@ -111,17 +114,15 @@ def postprocess(msg_body):
 
     Replace link brackets back to '<' and '>'.
     """
-    return re.sub(const.RE_NORMALIZED_LINK, r'<\1>', msg_body).strip()
+    return re.sub(const.RE_NORMALIZED_LINK, r"<\1>", msg_body).strip()
 
 
 def extract_from_plain(msg_body):
     """Extracts a non quoted message from provided plain text."""
-    stripped_text = msg_body
-
     delimiter = get_delimiter(msg_body)
     msg_body = utils.preprocess(msg_body, delimiter)
     # don't process too long messages
-    lines = msg_body.splitlines()[:const.MAX_LINES_COUNT]
+    lines = msg_body.splitlines()[: const.MAX_LINES_COUNT]
     markers = mark_message_lines(lines)
     lines = process_marked_lines(lines, markers)
 
diff --git a/extract_raw_content/utils.py b/extract_raw_content/utils.py
index cabc18a..5cce3e4 100644
--- a/extract_raw_content/utils.py
+++ b/extract_raw_content/utils.py
@@ -1,4 +1,4 @@
-import regex as re
+import re
 from lxml import etree
 
 from . import constants as const
@@ -11,16 +11,16 @@ def text_content(context):
 
 def tail(context):
     """XPath Extension function to return a node tail text."""
-    return context.context_node.tail or ''
+    return context.context_node.tail or ""
 
 
 def register_xpath_extensions():
     ns = etree.FunctionNamespace("http://mailgun.net")
-    ns.prefix = 'mg'
-    ns['text_content'] = text_content
-    ns['tail'] = tail
-    
-    
+    ns.prefix = "mg"
+    ns["text_content"] = text_content
+    ns["tail"] = tail
+
+
 def _replace_link_brackets(msg_body):
     """
     Normalize links i.e. replace '<', '>' wrapping the link with some symbols
@@ -30,36 +30,38 @@ def _replace_link_brackets(msg_body):
     Converts msg_body into a unicode
     """
     if isinstance(msg_body, bytes):
-        msg_body = msg_body.decode('utf8')
+        msg_body = msg_body.decode("utf8")
 
     def link_wrapper(link):
-        newline_index = msg_body[:link.start()].rfind("\n")
+        newline_index = msg_body[: link.start()].rfind("\n")
         if msg_body[newline_index + 1] == ">":
             return link.group()
         else:
             return "@@%s@@" % link.group(1)
+
     msg_body = re.sub(const.RE_LINK, link_wrapper, msg_body)
     return msg_body
 
 
-def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
+def _wrap_splitter_with_newline(msg_body, delimiter, content_type="text/plain"):
     """
     Splits line in two if splitter pattern preceded by some text on the same
     line (done only for 'On <date> <person> wrote:' pattern.
     """
+
     def splitter_wrapper(splitter):
         """Wraps splitter with new line"""
-        if splitter.start() and msg_body[splitter.start() - 1] != '\n':
-            return '%s%s' % (delimiter, splitter.group())
+        if splitter.start() and msg_body[splitter.start() - 1] != "\n":
+            return "{}{}".format(delimiter, splitter.group())
         else:
             return splitter.group()
 
-    if content_type == 'text/plain':
+    if content_type == "text/plain":
         msg_body = re.sub(const.RE_ON_DATE_SMB_WROTE, splitter_wrapper, msg_body)
     return msg_body
 
 
-def preprocess(msg_body, delimiter, content_type='text/plain'):
+def preprocess(msg_body, delimiter, content_type="text/plain"):
     """Prepares msg_body for being stripped.
 
     Replaces link brackets so that they couldn't be taken for quotation marker.
@@ -74,10 +76,10 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
 
 
 def is_splitter(line):
-    '''
+    """
     Returns Matcher object if provided string is a splitter and
     None otherwise.
-    '''
+    """
     for pattern in const.SPLITTER_PATTERNS:
         matcher = re.match(pattern, line)
         if matcher:
diff --git a/mail_parser.py b/mail_parser.py
index b63a97a..432a345 100644
--- a/mail_parser.py
+++ b/mail_parser.py
@@ -6,14 +6,14 @@
 import re
 import uuid
 from io import BytesIO
+
 import mailparser
 from html2text import html2text
 
-from extract_raw_content.text import extract_from_plain
 from extract_raw_content.html import extract_from_html
+from extract_raw_content.text import extract_from_plain
 from extract_raw_content.utils import register_xpath_extensions
 
-
 decoder_map = {
     "base64": base64.b64decode,
     "": lambda payload: payload.encode("utf-8"),

From 754ab2884be3ff8c3a422d14ee2f211b2eff373a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Orze=C5=82?= <kuba.orzel@hotmail.com>
Date: Tue, 23 Apr 2024 11:59:35 +0200
Subject: [PATCH 3/6] Aktualizacja requirements.txt i zmiana wersji
 (version.py)

---
 requirements.txt | 34 ++++------------------------------
 version.py       |  2 +-
 2 files changed, 5 insertions(+), 31 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index cf56c79..89e97a6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,34 +1,8 @@
-asttokens==2.4.1
-certifi==2024.2.2
-charset-normalizer==3.3.2
-cssselect==1.2.0
-decorator==5.1.1
-exceptiongroup==1.2.0
-executing==2.0.1
 html2text==2024.2.26
-html5lib==1.1
-idna==3.7
-ipdb==0.13.13
-ipython==8.23.0
-jedi==0.19.1
-lxml==5.2.1
 mail-parser==3.15.0
-matplotlib-inline==0.1.6
-parso==0.8.4
-pexpect==4.9.0
-prompt-toolkit==3.0.43
-ptyprocess==0.7.0
-pure-eval==0.2.2
-Pygments==2.17.2
-regex==2023.12.25
 requests==2.31.0
 sentry-sdk==1.43.0
-simplejson==3.19.2
-six==1.16.0
-stack-data==0.6.3
-tomli==2.0.1
-traitlets==5.14.2
-typing_extensions==4.11.0
-urllib3==2.2.1
-wcwidth==0.2.13
-webencodings==0.5.1
+ipdb==0.13.13
+html5lib==1.1
+lxml==5.2.1
+cssselect==1.2.0
\ No newline at end of file
diff --git a/version.py b/version.py
index 38e7dcc..1cd440f 100644
--- a/version.py
+++ b/version.py
@@ -1,4 +1,4 @@
-__version__ = "1.0.02"
+__version__ = "1.1.0"
 
 if __name__ == "__main__":
     print(f"v{__version__}")

From 247b138dcec7399150e90bf5162644b711aff382 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Orze=C5=82?= <kuba.orzel@hotmail.com>
Date: Tue, 23 Apr 2024 15:10:43 +0200
Subject: [PATCH 4/6] Przeniesienie funkcji _html5lib_parser do pliku utils

---
 extract_raw_content/html.py  |  21 +-------
 extract_raw_content/utils.py | 100 +++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 19 deletions(-)

diff --git a/extract_raw_content/html.py b/extract_raw_content/html.py
index bc1f4b6..a1379d1 100644
--- a/extract_raw_content/html.py
+++ b/extract_raw_content/html.py
@@ -1,7 +1,6 @@
+import re
 from copy import deepcopy
 
-import html5lib
-import re
 from lxml import html
 from lxml.cssselect import CSSSelector
 from lxml.html import html5parser
@@ -102,21 +101,6 @@ def mark_message_lines(lines):
     return "".join(markers)
 
 
-def _html5lib_parser():
-    """
-    html5lib is a pure-python library that conforms to the WHATWG HTML spec
-    and is not vulnarable to certain attacks common for XML libraries
-    """
-    return html5lib.HTMLParser(
-        # build lxml tree
-        html5lib.treebuilders.getTreeBuilder("lxml"),
-        # remove namespace value from inside lxml.html.html5paser element tag
-        # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
-        # instead of "div", throwing the algo off
-        namespaceHTMLElements=False,
-    )
-
-
 def _rm_excessive_newlines(s):
     """Remove excessive newlines that often happen due to tons of divs"""
     return const._RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip()
@@ -140,8 +124,7 @@ def html_document_fromstring(s):
     try:
         if html_too_big(s):
             return None
-
-        return html5parser.document_fromstring(s, parser=_html5lib_parser())
+        return html5parser.document_fromstring(s, parser=utils._html5lib_parser())
     except Exception:
         pass
 
diff --git a/extract_raw_content/utils.py b/extract_raw_content/utils.py
index 5cce3e4..eb96578 100644
--- a/extract_raw_content/utils.py
+++ b/extract_raw_content/utils.py
@@ -1,8 +1,18 @@
 import re
+
+import html5lib
 from lxml import etree
+from lxml.cssselect import CSSSelector
+from lxml.html import html5parser
 
 from . import constants as const
 
+_UTF8_DECLARATION = (
+    '<meta http-equiv="Content-Type" content="text/html;' 'charset=utf-8">'
+)
+_BLOCKTAGS = ["div", "p", "ul", "li", "h1", "h2", "h3"]
+_HARDBREAKS = ["br", "hr", "tr"]
+
 
 def text_content(context):
     """XPath Extension function to return a node text content."""
@@ -84,3 +94,93 @@ def is_splitter(line):
         matcher = re.match(pattern, line)
         if matcher:
             return matcher
+
+
+def _html5lib_parser():
+    """
+    html5lib is a pure-python library that conforms to the WHATWG HTML spec
+    and is not vulnerable to certain attacks common for XML libraries
+    """
+    return html5lib.HTMLParser(
+        # build lxml tree
+        html5lib.treebuilders.getTreeBuilder("lxml"),
+        # remove namespace value from inside lxml.html.html5parser element tag
+        # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
+        # instead of "div", throwing the algo off
+        namespaceHTMLElements=False,
+    )
+
+
+def _contains_charset_spec(s: str) -> str:
+    """Return True if the first 4KB contain charset spec"""
+    return s.lower().find("html; charset=", 0, 4096) != -1
+
+
+def _rm_excessive_newlines(s: str) -> str:
+    """Remove excessive newlines that often happen due to tons of divs"""
+    return const._RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip()
+
+
+def _prepend_utf8_declaration(s: str) -> str:
+    """Prepend 'utf-8' encoding declaration if the first 4KB don't have any"""
+    return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s
+
+
+def html_fromstring(s: str) -> etree._Element:
+    """Parse html tree from string. Return None if the string can't be parsed."""
+    return html5parser.fromstring(s, parser=_html5lib_parser())
+
+
+def html_tree_to_text(tree: etree._Element) -> str:
+    for style in CSSSelector("style")(tree):
+        style.getparent().remove(style)
+
+    for c in tree.xpath("//comment()"):
+        parent = c.getparent()
+
+        # comment with no parent does not impact produced text
+        if parent is None:
+            continue
+
+        parent.remove(c)
+
+    text = ""
+    for el in tree.iter():
+        el_text = (el.text or "") + (el.tail or "")
+        if len(el_text) > 1:
+            if el.tag in _BLOCKTAGS + _HARDBREAKS:
+                text += "\n"
+            if el.tag == "li":
+                text += "  * "
+            text += el_text.strip() + " "
+
+            # add href to the output
+            href = el.attrib.get("href")
+            if href:
+                text += "(%s) " % href
+
+        if el.tag in _HARDBREAKS and text and not text.endswith("\n") and not el_text:
+            text += "\n"
+
+    text = _rm_excessive_newlines(text)
+    return text
+
+
+def html_to_text(s: str) -> str | None:
+    """
+    Dead-simple HTML-to-text converter:
+        >>> html_to_text("one<br>two<br>three")
+        <<< "one\ntwo\nthree"
+
+    NOTES:
+        1. the string is expected to contain UTF-8 encoded HTML!
+        2. if html can't be parsed returns None
+    """
+    s = _prepend_utf8_declaration(s)
+    s = s.replace("\n", "")
+    tree = html_fromstring(s)
+
+    if tree is None:
+        return None
+
+    return html_tree_to_text(tree)

From 83f47c36b98f0d3581d806791df7e4bbaaac0b02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Orze=C5=82?= <kuba.orzel@hotmail.com>
Date: Tue, 23 Apr 2024 15:11:23 +0200
Subject: [PATCH 5/6] =?UTF-8?q?Dodanie=20test=C3=B3w=20do=20funkcji=20prze?=
 =?UTF-8?q?niesionych=20z=20biblioteki=20talon?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 extract_raw_content/text.py               |    2 +-
 mails/OLK_SRC_BODY_SECTION.html           |   16 +
 mails/html_replies/gmail.html             |    6 +
 mails/html_replies/hotmail.html           |   18 +
 mails/html_replies/mail_ru.html           |   57 +
 mails/html_replies/ms_outlook_2003.html   |  134 +++
 mails/html_replies/ms_outlook_2007.html   |   42 +
 mails/html_replies/ms_outlook_2010.html   |   87 ++
 mails/html_replies/thunderbird.html       |   32 +
 mails/html_replies/windows_mail.html      |   33 +
 mails/html_replies/yandex_ru.html         |    1 +
 mails/reply-quotations-share-block.eml    |   22 +
 mails/reply-separated-by-hr.html          |   21 +
 mails/standard_replies/android.eml        |   24 +
 mails/standard_replies/aol.eml            |   65 ++
 mails/standard_replies/apple_mail.eml     |   15 +
 mails/standard_replies/apple_mail_2.eml   |   19 +
 mails/standard_replies/comcast.eml        |   33 +
 mails/standard_replies/gmail.eml          |   31 +
 mails/standard_replies/hotmail.eml        |   50 +
 mails/standard_replies/iphone.eml         |   19 +
 mails/standard_replies/iphone_reply_text  |    3 +
 mails/standard_replies/outlook.eml        |   85 ++
 mails/standard_replies/sparrow.eml        |   61 +
 mails/standard_replies/sparrow_reply_text |    5 +
 mails/standard_replies/thunderbird.eml    |   15 +
 mails/standard_replies/yahoo.eml          |   22 +
 test.py                                   | 1223 +++++++++++++++++++++
 28 files changed, 2140 insertions(+), 1 deletion(-)
 create mode 100644 mails/OLK_SRC_BODY_SECTION.html
 create mode 100644 mails/html_replies/gmail.html
 create mode 100644 mails/html_replies/hotmail.html
 create mode 100644 mails/html_replies/mail_ru.html
 create mode 100644 mails/html_replies/ms_outlook_2003.html
 create mode 100644 mails/html_replies/ms_outlook_2007.html
 create mode 100644 mails/html_replies/ms_outlook_2010.html
 create mode 100644 mails/html_replies/thunderbird.html
 create mode 100644 mails/html_replies/windows_mail.html
 create mode 100644 mails/html_replies/yandex_ru.html
 create mode 100644 mails/reply-quotations-share-block.eml
 create mode 100644 mails/reply-separated-by-hr.html
 create mode 100644 mails/standard_replies/android.eml
 create mode 100644 mails/standard_replies/aol.eml
 create mode 100644 mails/standard_replies/apple_mail.eml
 create mode 100644 mails/standard_replies/apple_mail_2.eml
 create mode 100644 mails/standard_replies/comcast.eml
 create mode 100644 mails/standard_replies/gmail.eml
 create mode 100644 mails/standard_replies/hotmail.eml
 create mode 100644 mails/standard_replies/iphone.eml
 create mode 100644 mails/standard_replies/iphone_reply_text
 create mode 100644 mails/standard_replies/outlook.eml
 create mode 100644 mails/standard_replies/sparrow.eml
 create mode 100644 mails/standard_replies/sparrow_reply_text
 create mode 100644 mails/standard_replies/thunderbird.eml
 create mode 100644 mails/standard_replies/yahoo.eml

diff --git a/extract_raw_content/text.py b/extract_raw_content/text.py
index e179a49..4e4e4a1 100644
--- a/extract_raw_content/text.py
+++ b/extract_raw_content/text.py
@@ -80,7 +80,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
     # inlined reply
     # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
     # both 't' entries should be found
-    for inline_reply in re.finditer("(?<=m)e*((?:t+e*)+)m", markers):
+    for inline_reply in re.finditer("(?<=m)e*(t[te]*)m", markers):
         # long links could break sequence of quotation lines but they shouldn't
         # be considered an inline reply
         links = const.RE_PARENTHESIS_LINK.search(
diff --git a/mails/OLK_SRC_BODY_SECTION.html b/mails/OLK_SRC_BODY_SECTION.html
new file mode 100644
index 0000000..82f3689
--- /dev/null
+++ b/mails/OLK_SRC_BODY_SECTION.html
@@ -0,0 +1,16 @@
+<html>
+  <body>
+    <div>Reply</div>
+    <span id="OLK_SRC_BODY_SECTION">
+      <div>
+	<span>From: </span>Bob &lt;<a href="mailto:bob@example.com">bob@example.com</a>&gt;<br>
+	<span>Date: </span>Tue, 01 Nov 2011 18:54:39 -0700<br>
+	<span>To: </span>Rob &lt;<a href="mailto:rob@example.com">rob@example.com</a>&gt;<br>
+	<span>Subject: </span>Test<br>
+      </div>
+      <div>
+	Hi
+      </div>
+    </span>
+  </body>
+</html>
diff --git a/mails/html_replies/gmail.html b/mails/html_replies/gmail.html
new file mode 100644
index 0000000..7bc7cf5
--- /dev/null
+++ b/mails/html_replies/gmail.html
@@ -0,0 +1,6 @@
+<div dir="ltr"><div class="gmail_default"><div class="gmail_default" style>Hi. I am fine.</div><div class="gmail_default" style><br></div><div class="gmail_default" style>Thanks,</div><div class="gmail_default" style>Alex</div>
+</div></div><div class="gmail_extra"><br><br><div class="gmail_quote">On Thu, Jun 26, 2014 at 2:14 PM, Alexander L <span dir="ltr">&lt;<a href="mailto:abc@example.com" target="_blank">a@example.com</a>&gt;</span> wrote:<br>
+<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div dir="ltr"><div class="gmail_default" style="font-size:small"><div class="gmail_default" style="font-family:arial,sans-serif">
+Hello! How are you?</div><div class="gmail_default" style="font-family:arial,sans-serif"><br>
+</div><div class="gmail_default" style="font-family:arial,sans-serif">Thanks,</div><div class="gmail_default" style="font-family:arial,sans-serif">Sasha.</div></div></div>
+</blockquote></div><br></div>
diff --git a/mails/html_replies/hotmail.html b/mails/html_replies/hotmail.html
new file mode 100644
index 0000000..0257b2e
--- /dev/null
+++ b/mails/html_replies/hotmail.html
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<html>
+<head>
+<style><!--
+.hmmessage P
+{
+margin:0px;
+padding:0px
+}
+body.hmmessage
+{
+font-size: 12pt;
+font-family:Calibri
+}
+--></style></head>
+<body class='hmmessage'><div dir='ltr'>Hi. I am fine.<div><br></div><div>Thanks,</div><div>Alex<br><br><div><hr id="stopSpelling">Date: Thu, 26 Jun 2014 13:53:45 +0400<br>Subject: Test message<br>From: abc@example.com<br>To: alex.l@example.com<br><br><div dir="ltr"><div class="ecxgmail_default" style="font-size:small;">Hello! How are you?</div><div class="ecxgmail_default" style="font-size:small;"><br></div><div class="ecxgmail_default" style="font-size:small;">Thanks,</div><div class="ecxgmail_default" style="font-size:small;">
+Sasha.</div></div></div></div> 		 	   		  </div></body>
+</html>
diff --git a/mails/html_replies/mail_ru.html b/mails/html_replies/mail_ru.html
new file mode 100644
index 0000000..52d7039
--- /dev/null
+++ b/mails/html_replies/mail_ru.html
@@ -0,0 +1,57 @@
+
+<HTML><BODY><p>Hi. I am fine.</p><p>Thanks,<br>Alex</p><br><br><br>Thu, 26 Jun 2014 14:00:51 +0400 от Alexander L &lt;abc@example.com&gt;:<br>
+<blockquote style="border-left:1px solid #0857A6; margin:10px; padding:0 0 0 10px;">
+	<div id="">
+	
+
+
+
+    
+
+
+
+
+
+
+
+
+
+	
+	
+
+
+	
+	
+	
+	
+	
+
+	
+	
+
+	
+	
+
+
+
+<div class="js-helper js-readmsg-msg">
+	<style type="text/css"></style>
+ 	<div>
+		<base target="_self" href="https://e.mail.ru/">
+		
+			<div id="style_14037768550000001020_BODY"><div dir="ltr"><div style="font-size:small"><div style="font-family:arial,sans-serif">Hello! How are you?</div><div style="font-family:arial,sans-serif"><br>
+</div><div style="font-family:arial,sans-serif">Thanks,</div><div style="font-family:arial,sans-serif">Sasha.</div></div></div>
+
+</div>
+			
+		
+		<base target="_self" href="https://e.mail.ru/">
+	</div>
+
+	
+</div>
+
+
+</div>
+</blockquote>
+<br></BODY></HTML>
diff --git a/mails/html_replies/ms_outlook_2003.html b/mails/html_replies/ms_outlook_2003.html
new file mode 100644
index 0000000..027c525
--- /dev/null
+++ b/mails/html_replies/ms_outlook_2003.html
@@ -0,0 +1,134 @@
+<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns="http://www.w3.org/TR/REC-html40">
+
+<head>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii">
+<meta name=Generator content="Microsoft Word 11 (filtered medium)">
+<!--[if !mso]>
+<style>
+v\:* {behavior:url(#default#VML);}
+o\:* {behavior:url(#default#VML);}
+w\:* {behavior:url(#default#VML);}
+.shape {behavior:url(#default#VML);}
+</style>
+<![endif]-->
+<style>
+<!--
+ /* Font Definitions */
+ @font-face
+	{font-family:Tahoma;
+	panose-1:2 11 6 4 3 5 4 4 2 4;}
+ /* Style Definitions */
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
+	{margin:0cm;
+	margin-bottom:.0001pt;
+	font-size:12.0pt;
+	font-family:"Times New Roman";}
+a:link, span.MsoHyperlink
+	{color:blue;
+	text-decoration:underline;}
+a:visited, span.MsoHyperlinkFollowed
+	{color:purple;
+	text-decoration:underline;}
+span.EmailStyle17
+	{mso-style-type:personal-reply;
+	font-family:Arial;
+	color:navy;}
+@page Section1
+	{size:595.3pt 841.9pt;
+	margin:2.0cm 42.5pt 2.0cm 3.0cm;}
+div.Section1
+	{page:Section1;}
+-->
+</style>
+<!--[if gte mso 9]><xml>
+ <o:shapedefaults v:ext="edit" spidmax="1026" />
+</xml><![endif]--><!--[if gte mso 9]><xml>
+ <o:shapelayout v:ext="edit">
+  <o:idmap v:ext="edit" data="1" />
+ </o:shapelayout></xml><![endif]-->
+</head>
+
+<body lang=RU link=blue vlink=purple>
+
+<div class=Section1>
+
+<p class=MsoNormal><font size=2 color=navy face=Arial><span lang=EN-US
+style='font-size:10.0pt;font-family:Arial;color:navy'>Hi. I am fine.<o:p></o:p></span></font></p>
+
+<p class=MsoNormal><font size=2 color=navy face=Arial><span lang=EN-US
+style='font-size:10.0pt;font-family:Arial;color:navy'><o:p>&nbsp;</o:p></span></font></p>
+
+<p class=MsoNormal><font size=2 color=navy face=Arial><span lang=EN-US
+style='font-size:10.0pt;font-family:Arial;color:navy'>Thanks,<o:p></o:p></span></font></p>
+
+<p class=MsoNormal><font size=2 color=navy face=Arial><span lang=EN-US
+style='font-size:10.0pt;font-family:Arial;color:navy'>Alex<o:p></o:p></span></font></p>
+
+<p class=MsoNormal><font size=2 color=navy face=Arial><span style='font-size:
+10.0pt;font-family:Arial;color:navy'><o:p>&nbsp;</o:p></span></font></p>
+
+<div>
+
+<div class=MsoNormal align=center style='text-align:center'><font size=3
+face="Times New Roman"><span lang=EN-US style='font-size:12.0pt'>
+
+<hr size=3 width="100%" align=center tabindex=-1>
+
+</span></font></div>
+
+<p class=MsoNormal><b><font size=2 face=Tahoma><span lang=EN-US
+style='font-size:10.0pt;font-family:Tahoma;font-weight:bold'>From:</span></font></b><font
+size=2 face=Tahoma><span lang=EN-US style='font-size:10.0pt;font-family:Tahoma'>
+Alexander L [mailto:abc@example.com] <br>
+<b><span style='font-weight:bold'>Sent:</span></b> Friday, June 27, 2014 12:06
+PM<br>
+<b><span style='font-weight:bold'>To:</span></b> Alexander<br>
+<b><span style='font-weight:bold'>Subject:</span></b> Test message</span></font><span
+lang=EN-US><o:p></o:p></span></p>
+
+</div>
+
+<p class=MsoNormal><font size=3 face="Times New Roman"><span style='font-size:
+12.0pt'><o:p>&nbsp;</o:p></span></font></p>
+
+<div>
+
+<div>
+
+<div>
+
+<p class=MsoNormal><font size=3 face=Arial><span style='font-size:12.0pt;
+font-family:Arial'>Hello! How are you?<o:p></o:p></span></font></p>
+
+</div>
+
+<div>
+
+<p class=MsoNormal><font size=3 face=Arial><span style='font-size:12.0pt;
+font-family:Arial'><o:p>&nbsp;</o:p></span></font></p>
+
+</div>
+
+<div>
+
+<p class=MsoNormal><font size=3 face=Arial><span style='font-size:12.0pt;
+font-family:Arial'>Thanks,<o:p></o:p></span></font></p>
+
+</div>
+
+<div>
+
+<p class=MsoNormal><font size=3 face=Arial><span style='font-size:12.0pt;
+font-family:Arial'>Sasha.<o:p></o:p></span></font></p>
+
+</div>
+
+</div>
+
+</div>
+
+</div>
+
+</body>
+
+</html>
diff --git a/mails/html_replies/ms_outlook_2007.html b/mails/html_replies/ms_outlook_2007.html
new file mode 100644
index 0000000..84f7ede
--- /dev/null
+++ b/mails/html_replies/ms_outlook_2007.html
@@ -0,0 +1,42 @@
+<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><meta http-equiv=Content-Type content="text/html; charset=utf-8"><meta name=Generator content="Microsoft Word 12 (filtered medium)"><style><!--
+/* Font Definitions */
+@font-face
+	{font-family:"Cambria Math";
+	panose-1:2 4 5 3 5 4 6 3 2 4;}
+@font-face
+	{font-family:Calibri;
+	panose-1:2 15 5 2 2 2 4 3 2 4;}
+@font-face
+	{font-family:Tahoma;
+	panose-1:2 11 6 4 3 5 4 4 2 4;}
+/* Style Definitions */
+p.MsoNormal, li.MsoNormal, div.MsoNormal
+	{margin:0cm;
+	margin-bottom:.0001pt;
+	font-size:12.0pt;
+	font-family:"Times New Roman","serif";}
+a:link, span.MsoHyperlink
+	{mso-style-priority:99;
+	color:blue;
+	text-decoration:underline;}
+a:visited, span.MsoHyperlinkFollowed
+	{mso-style-priority:99;
+	color:purple;
+	text-decoration:underline;}
+span.EmailStyle17
+	{mso-style-type:personal-reply;
+	font-family:"Calibri","sans-serif";
+	color:#1F497D;}
+.MsoChpDefault
+	{mso-style-type:export-only;}
+@page WordSection1
+	{size:612.0pt 792.0pt;
+	margin:2.0cm 42.5pt 2.0cm 3.0cm;}
+div.WordSection1
+	{page:WordSection1;}
+--></style><!--[if gte mso 9]><xml>
+<o:shapedefaults v:ext="edit" spidmax="1026" />
+</xml><![endif]--><!--[if gte mso 9]><xml>
+<o:shapelayout v:ext="edit">
+<o:idmap v:ext="edit" data="1" />
+</o:shapelayout></xml><![endif]--></head><body lang=EN-US link=blue vlink=purple><div class=WordSection1><p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'>Hi. I am fine.<o:p></o:p></span></p><p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'> <o:p></o:p></span></p><p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'>Thanks,<o:p></o:p></span></p><p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'>Alex<o:p></o:p></span></p><p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'><o:p>&nbsp;</o:p></span></p><div style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm'><p class=MsoNormal><b><span lang=RU style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'>From:</span></b><span lang=RU style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'> Alexander L [mailto:abc@example.com] <br><b>Sent:</b> Thursday, July 03, 2014 3:50 PM<br><b>To:</b> alex.l@example.com<br><b>Subject:</b> Test message<o:p></o:p></span></p></div><p class=MsoNormal><o:p>&nbsp;</o:p></p><div><div><div><p class=MsoNormal><span style='font-family:"Arial","sans-serif"'>Hello! How are you?<o:p></o:p></span></p></div><div><p class=MsoNormal><span style='font-family:"Arial","sans-serif"'><o:p>&nbsp;</o:p></span></p></div><div><p class=MsoNormal><span style='font-family:"Arial","sans-serif"'>Thanks,<o:p></o:p></span></p></div><div><p class=MsoNormal><span style='font-family:"Arial","sans-serif"'>Sasha.<o:p></o:p></span></p></div></div></div></div></body></html>
diff --git a/mails/html_replies/ms_outlook_2010.html b/mails/html_replies/ms_outlook_2010.html
new file mode 100644
index 0000000..9d26d0e
--- /dev/null
+++ b/mails/html_replies/ms_outlook_2010.html
@@ -0,0 +1,87 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-2022-jp">
+<meta name="Generator" content="Microsoft Word 14 (filtered medium)">
+<style><!--
+/* Font Definitions */
+@font-face
+	{font-family:Calibri;
+	panose-1:2 15 5 2 2 2 4 3 2 4;}
+@font-face
+	{font-family:Tahoma;
+	panose-1:2 11 6 4 3 5 4 4 2 4;}
+/* Style Definitions */
+p.MsoNormal, li.MsoNormal, div.MsoNormal
+	{margin:0in;
+	margin-bottom:.0001pt;
+	font-size:12.0pt;
+	font-family:"Times New Roman","serif";}
+h3
+	{mso-style-priority:9;
+	mso-style-link:"Heading 3 Char";
+	mso-margin-top-alt:auto;
+	margin-right:0in;
+	mso-margin-bottom-alt:auto;
+	margin-left:0in;
+	font-size:13.5pt;
+	font-family:"Times New Roman","serif";
+	font-weight:bold;}
+a:link, span.MsoHyperlink
+	{mso-style-priority:99;
+	color:blue;
+	text-decoration:underline;}
+a:visited, span.MsoHyperlinkFollowed
+	{mso-style-priority:99;
+	color:purple;
+	text-decoration:underline;}
+p
+	{mso-style-priority:99;
+	mso-margin-top-alt:auto;
+	margin-right:0in;
+	mso-margin-bottom-alt:auto;
+	margin-left:0in;
+	font-size:12.0pt;
+	font-family:"Times New Roman","serif";}
+span.Heading3Char
+	{mso-style-name:"Heading 3 Char";
+	mso-style-priority:9;
+	mso-style-link:"Heading 3";
+	font-family:"Cambria","serif";
+	color:#4F81BD;
+	font-weight:bold;}
+span.EmailStyle19
+	{mso-style-type:personal-reply;
+	font-family:"Calibri","sans-serif";
+	color:#1F497D;}
+.MsoChpDefault
+	{mso-style-type:export-only;
+	font-family:"Calibri","sans-serif";}
+@page WordSection1
+	{size:8.5in 11.0in;
+	margin:1.0in 1.0in 1.0in 1.0in;}
+div.WordSection1
+	{page:WordSection1;}
+--></style><!--[if gte mso 9]><xml>
+<o:shapedefaults v:ext="edit" spidmax="1026" />
+</xml><![endif]--><!--[if gte mso 9]><xml>
+<o:shapelayout v:ext="edit">
+<o:idmap v:ext="edit" data="1" />
+</o:shapelayout></xml><![endif]-->
+</head>
+<body lang="EN-US" link="blue" vlink="purple">
+<div class="WordSection1">
+<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Hi. I am fine.<o:p></o:p></span></p>
+<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Thanks,<o:p></o:p></span></p>
+<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Alex<o:p></o:p></span></p>
+<p class="MsoNormal"><b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;">From:</span></b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;"> Foo [mailto:foo@bar.com]
+<b>On Behalf Of </b>baz@bar.com<br>
+<b>Sent:</b> Monday, January 01, 2000 12:00 AM<br>
+<b>To:</b> john@bar.com<br>
+<b>Cc:</b> jane@bar.io<br>
+<b>Subject:</b> Conversation<o:p></o:p></span></p>
+<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
+<p>Hello! How are you?<o:p></o:p></p>
+<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
+</div>
+</body>
+</html>
diff --git a/mails/html_replies/thunderbird.html b/mails/html_replies/thunderbird.html
new file mode 100644
index 0000000..9d871cb
--- /dev/null
+++ b/mails/html_replies/thunderbird.html
@@ -0,0 +1,32 @@
+<html>
+  <head>
+    <meta content="text/html; charset=UTF-8" http-equiv="Content-Type">
+  </head>
+  <body bgcolor="#FFFFFF" text="#000000">
+    Hi. I am fine.<br>
+    <br>
+    Thanks,<br>
+    Alex<br>
+    <div class="moz-cite-prefix">On 26.06.2014 14:41, Alexander L
+      wrote:<br>
+    </div>
+    <blockquote
+cite="mid:CA+jEWTKBU6qc4OnH5m=-0sfwkAzZhcy0rd+ean2W6bFUVXaO7A@mail.gmail.com"
+      type="cite">
+      <div dir="ltr">
+        <div class="gmail_default" style="font-size:small">
+          <div class="gmail_default"
+            style="font-family:arial,sans-serif">Hello! How are you?</div>
+          <div class="gmail_default"
+            style="font-family:arial,sans-serif"><br>
+          </div>
+          <div class="gmail_default"
+            style="font-family:arial,sans-serif">Thanks,</div>
+          <div class="gmail_default"
+            style="font-family:arial,sans-serif">Sasha.</div>
+        </div>
+      </div>
+    </blockquote>
+    <br>
+  </body>
+</html>
diff --git a/mails/html_replies/windows_mail.html b/mails/html_replies/windows_mail.html
new file mode 100644
index 0000000..9100ea1
--- /dev/null
+++ b/mails/html_replies/windows_mail.html
@@ -0,0 +1,33 @@
+
+<html>
+<head>
+<meta name="generator" content="Windows Mail 17.5.9600.20498">
+<style data-externalstyle="true"><!--
+p.MsoListParagraph, li.MsoListParagraph, div.MsoListParagraph {
+margin-top:0in;
+margin-right:0in;
+margin-bottom:0in;
+margin-left:.5in;
+margin-bottom:.0001pt;
+}
+p.MsoNormal, li.MsoNormal, div.MsoNormal {
+margin:0in;
+margin-bottom:.0001pt;
+}
+p.MsoListParagraphCxSpFirst, li.MsoListParagraphCxSpFirst, div.MsoListParagraphCxSpFirst, 
+p.MsoListParagraphCxSpMiddle, li.MsoListParagraphCxSpMiddle, div.MsoListParagraphCxSpMiddle, 
+p.MsoListParagraphCxSpLast, li.MsoListParagraphCxSpLast, div.MsoListParagraphCxSpLast {
+margin-top:0in;
+margin-right:0in;
+margin-bottom:0in;
+margin-left:.5in;
+margin-bottom:.0001pt;
+line-height:115%;
+}
+--></style></head>
+<body dir="ltr">
+<div data-externalstyle="false" dir="ltr" style="font-family: 'Calibri', 'Segoe UI', 'Meiryo', 'Microsoft YaHei UI', 'Microsoft JhengHei UI', 'Malgun Gothic', 'sans-serif';font-size:12pt;"><div>Hi. I am fine.</div><div><br></div><div>Thanks,</div><div>Alex<br></div><div data-signatureblock="true"><div><br></div><div><br></div></div><div style="padding-top: 5px; border-top-color: rgb(229, 229, 229); border-top-width: 1px; border-top-style: solid;"><div><font face=" 'Calibri', 'Segoe UI', 'Meiryo', 'Microsoft YaHei UI', 'Microsoft JhengHei UI', 'Malgun Gothic', 'sans-serif'" style='line-height: 15pt; letter-spacing: 0.02em; font-family: "Calibri", "Segoe UI", "Meiryo", "Microsoft YaHei UI", "Microsoft JhengHei UI", "Malgun Gothic", "sans-serif"; font-size: 12pt;'><b>От:</b>&nbsp;<a href="mailto:abc@example.com" target="_parent">Alexander L</a><br><b>Отправлено:</b>&nbsp;‎четверг‎, ‎26‎ ‎июня‎ ‎2014‎ г. ‎15‎:‎05<br><b>Кому:</b>&nbsp;<a href="mailto:alex-ninja@example.com" target="_parent">Alex</a></font></div></div><div><br></div><div dir=""><div dir="ltr"><div class="gmail_default" style="font-size: small;"><div class="gmail_default" style="font-family: arial,sans-serif;">Hello! How are you?</div><div class="gmail_default" style="font-family: arial,sans-serif;"><br>
+</div><div class="gmail_default" style="font-family: arial,sans-serif;">Thanks,</div><div class="gmail_default" style="font-family: arial,sans-serif;">Sasha.</div></div></div>
+</div></div>
+</body>
+</html>
diff --git a/mails/html_replies/yandex_ru.html b/mails/html_replies/yandex_ru.html
new file mode 100644
index 0000000..3847fb9
--- /dev/null
+++ b/mails/html_replies/yandex_ru.html
@@ -0,0 +1 @@
+<p>Hi. I am fine.<br /><br />Thanks,<br />Alex<br /><br />26.06.2014, 14:41, "Alexander L" &lt;<a href="mailto:abc@example.com">abc@example.com</a>&gt;:</p><blockquote> Hello! How are you?<br /><br /> Thanks,<br /> Sasha.</blockquote>
diff --git a/mails/reply-quotations-share-block.eml b/mails/reply-quotations-share-block.eml
new file mode 100644
index 0000000..1b9a21a
--- /dev/null
+++ b/mails/reply-quotations-share-block.eml
@@ -0,0 +1,22 @@
+Content-Type: multipart/alternative;
+ boundary="===============6853056845739363347=="
+MIME-Version: 1.0
+Date: Wed, 4 Apr 2012 22:22:42 -0700 (PDT)
+From: Joe Doe <xxx@example.com>
+Subject: Re: You've got a new booking inquiry!
+
+--===============6853056845739363347==
+MIME-Version: 1.0
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: base64
+
+SGkgS2F0aGFyaW5lLsKgIFNvdW5kcyBncmVhdC7CoCBBcmUgdGhlcmUgYW5kIGRpZXRyeSByZXN0cmljdGlvbnMgb3IgdGhpbmdzIHlvdXIgaHVzYmFuZCBkb2VzL2RvZXNuJ3QgbGlrZSB0byBlYXQ/wqAgV291bGQgeW91IGxpa2UgdG8gZG8gYSBmZXcgaG9ycyBkIG9ldXZyZXMgYW5kIHRoZW4gYcKgMyBvciA0wqBjb3Vyc2UgZGlubmVyP8KgIExldCBtZSBrbm93IHdoYXQgeW91IHRoaW5rIHdpbGwgd29yayBiZXN0IGFuZCBJIHdpbGwgc3RhcnQgd29ya2luZyBvbiBhIG1lbnUgYW5kIHByb3Bvc2FsLsKgIFRoYW5rcyBzbyBtdWNoIGFuZCBsb29rIGZvcndhcmQgdG8gaGVhcmluZyBmcm9tIHlvdSBzb29uLgrCoApKb2UgWFhYCgotLS0gT24gV2VkLCA0LzQvMTIsIHh4eEBleGFtcGxlLmNvbSA8eHh4QGV4YW1wbGUuY29tPiB3cm90ZToKCgpGcm9tOiB4eHhAZXhhbXBsZS5jb20gPHh4eEBleGFtcGxlLmNvbT4KU3ViamVjdDogWW91J3ZlIGdvdCBhIG5ldyBib29raW5nIGlucXVpcnkhClRvOiB4eHhAeWFob28uY29tCkRhdGU6IFdlZG5lc2RheSwgQXByaWwgNCwgMjAxMiwgMTA6MjMgUE0KCk5ldyBCb29raW5nIElucXVpcnkKCg==
+
+--===============6853056845739363347==
+MIME-Version: 1.0
+Content-Type: text/html; charset="utf-8"
+Content-Transfer-Encoding: base64
+
+PHRhYmxlPjx0cj48dGQ+PERJVj5IaSBLYXRoYXJpbmUuJm5ic3A7IFNvdW5kcyBncmVhdC4mbmJzcDsgQXJlIHRoZXJlIGFuZCBkaWV0cnkgcmVzdHJpY3Rpb25zIG9yIHRoaW5ncyB5b3VyIGh1c2JhbmQgZG9lcy9kb2Vzbid0IGxpa2UgdG8gZWF0PyZuYnNwOyBXb3VsZCB5b3UgbGlrZSB0byBkbyBhIGZldyBob3JzIGQgb2V1dnJlcyBhbmQgdGhlbiBhJm5ic3A7MyBvciA0Jm5ic3A7Y291cnNlIGRpbm5lcj8mbmJzcDsgTGV0IG1lIGtub3cgd2hhdCB5b3UgdGhpbmsgd2lsbCB3b3JrIGJlc3QgYW5kIEkgd2lsbCBzdGFydCB3b3JraW5nIG9uIGEgbWVudSBhbmQgcHJvcG9zYWwuJm5ic3A7IFRoYW5rcyBzbyBtdWNoIGFuZCBsb29rIGZvcndhcmQgdG8gaGVhcmluZyBmcm9tIHlvdSBzb29uLjwvRElWPgo8RElWPiZuYnNwOzwvRElWPgo8RElWPkpob24gRG9lPEJSPjxCUj4tLS0gT24gPEI+V2VkLCA0LzQvMTIsIHh4eEBleGFtcGxlLmNvbSA8ST4mbHQ7eHh4QGV4YW1wbGUuY29tJmd0OzwvST48L0I+IHdyb3RlOjxCUj48L0RJVj4KPEJMT0NLUVVPVEU+PEJSPkZyb206IHh4eEBleGFtcGxlLmNvbSAmbHQ7eHh4QGV4YW1wbGUuY29tJmd0OzxCUj5TdWJqZWN0OiBZb3UndmUgZ290IGEgbmV3IGJvb2tpbmcgaW5xdWlyeSE8QlI+VG86IHh4eEB5YWhvby5jb208QlI+RGF0ZTogV2VkbmVzZGF5LCBBcHJpbCA0LCAyMDEyLCAxMDoyMyBQTTxCUj48QlI+CjxESVY+CjxESVY+CjxDRU5URVI+CjxUQUJMRT4KPFRCT0RZPgo8VFI+CjxURD4KPFRBQkxFPgo8VEJPRFk+CjxUUj4KPFREPgo8VEFCTEU+CjxUQk9EWT4KPFRSPgo8VEQ+CjxESVY+TmV3IEJvb2tpbmcgSW5xdWlyeSA8L0RJVj48L1REPgo8VEQ+CjxESVY+WW91ciBwbGFjZSBpcyB0aGUgaG9tZSBvZiBiZXNwb2tlIGRpbmluZyA8L0RJVj48L1REPjwvVFI+PC9UQk9EWT48L1RBQkxFPjwvVEQ+PC9UUj48L1RCT0RZPjwvVEFCTEU+CjxUQUJMRT4KPFRCT0RZPgo8VFI+CjxURD4KPFRBQkxFPgo8VEJPRFk+CjxUUj4KPFREPiA8L1REPjwvVFI+PC9UQk9EWT48L1RBQkxFPjwvVEQ+PC9UUj4KPFRSPgo8VEQ+CjxUQUJMRT4KPFRCT0RZPgo8VFI+CjxURD4KPFRBQkxFPgo8VEJPRFk+CjxUUj4KPFREPgo8RElWPjxCUj5Hb29kIE5ld3MhPEJSPjxCUj4KPFA+RXZlbnQgRGV0YWlsczwvRElWPkRhdGU6IEFwcmlsIDI4LCAyMDEyPEJSPkxvY2F0aW9uOiB4eHg8QlI+SGVhZGNvdW50OiA2IHRvIDg8QlI+VGFyZ2V0IEJ1ZGdldDogJDUwIHBlciBwZXJzb248QlI+PEJSPkJlc3QgRGVzY3JpcHRpb24gb2YgVGFyZ2V0IEJ1ZGdldDogSSdkIGxvdmUgdG8gaGVhciB3aGF0IHRoZSBjaGVmIHRoaW5rcyBpcyBiZXN0IGZvciBteSBldmVudCwgcHJvdmlkZWQgd2Ugc3RheSBjbG9zZSB0byB0aGlzIGJ1ZGdldCA8QlI+PEJSPkV2ZW50IERlc2NyaXB0aW9uOiBJIGFtIHdhbnRpbmcgdG8gc3VycHJpc2UgbXkgaHVzYmFuZCB3aXRoIGEgY2FzdWFsIGRpbm5lciBwYXJ0eSBpbiBvdXIgaG9tZSBpbiB4eHg
uIFdlIGhhdmUgYW4gYW1hemluZyBraXRjaGVuICh0aGF0IEkgZG9uJ3QgZG8ganVzdGljZSB0byBidXQgSSBiZXQgeW91IGNvdWxkISksIGFuZCBhIHJlYWxseSBuaWNlIGdhcmRlbiBmb3IgZGluaW5nLiBJIGFtIGZseWluZyBzb21lIG9mIGhpcyBiZXN0IGZyaWVuZHMgaW4gdG8gY2VsZWJyYXRlIGhpbS4gV2UgaGF2ZSBzbWFsbCBraWRzICh3aG8gd2lsbCBiZSBzbGVlcGluZyEpLCBzbyBJJ20gaG9waW5nIGZvciBhIGNhc3VhbCBidXQgcm9tYW50aWMgZGlubmVyIHBhcnR5LiA8QlI+PEJSPlZpZXcgbW9yZSBpbnF1aXJ5IGRldGFpbHMgb24geW91ciBFdmVudCBEYXNoYm9hcmQuIElmIHlvdSBsaWtlIHdoYXQgeW91IHNlZSwgcGxlYXNlIGNyZWF0ZSBhIHByb3Bvc2FsIGZvciB0aGUgZXZlbnQuIDxCUj48QlI+SWYgeW91IGRvIG5vdCBoYXZlIHRoZSB0aW1lIHRvIG1ha2UgYSBmdWxsIHByb3Bvc2FsIHJpZ2h0IG5vdywgd2UgZW5jb3VyYWdlIHlvdSB0byBhdCBsZWFzdCByZXNwb25kIHRvIHRoZSBob3N0IHdpdGggYSBxdWljayBtZXNzYWdlIHRvIGNvbmZpcm0gdGhhdCB5b3UndmUgZ290dGVuIHRoaXMgaW5xdWlyeSBhbmQgaGF2ZSBiZWd1biB0aGlua2luZyBhYm91dCB0aGUgZXZlbnQuIDxCUj48QlI+PFNUUk9ORz5Zb3UgY2FuIHJlcGx5IGRpcmVjdGx5IHRvIHRoaXMgZW1haWwgYW5kIHlvdXIgbWVzc2FnZSB3aWxsIGdvIHRvIHRoZSBob3N0IG9uIHRoZSBldmVudCBkYXNoYm9hcmQuPC9TVFJPTkc+IDxCUj48QlI+UmVtZW1iZXIsIHlvdSBoYXZlIGV4Y2x1c2l2ZSBhY2Nlc3MgdG8gdGhpcyBpbnF1aXJ5IGZvciB0aGUgbmV4dCAyNCBob3Vycy4gUGxlYXNlIG1ha2UgYSBwcm9wb3NhbCBvciBzZW5kIGEgbWVzc2FnZSB0byB0aGUgaG9zdCBpbiB0aGF0IHRpbWUuIElmIHRoZSBob3N0IGhhcyBub3QgaGVhcmQgYW55dGhpbmcgZnJvbSB5b3UgaW4gMjQgaG91cnMsIHdlIHdpbGwKIGZvcndhcmQgdGhlIGhvc3RzIGlucXVpcnkgdG8gYSBzbWFsbCBudW1iZXIgb2YgYWRkaXRpb25hbCBjaGVmcywgYW5kIHRoZXkgd2lsbCBoYXZlIHRoZSBvcHBvcnR1bml0eSB0byBtYWtlIGEgcHJvcG9zYWwuIFdlIGRvIHRoaXMgYXMgYSBjb3VydGVzeSB0byB0aGUgaG9zdHMuIDxCUj48QlI+SWYgeW91IGNhbm5vdCBhY2NlcHQgdGhpcyBib29raW5nIG9yIGRvIG5vdCB3YW50IHRvIGZvciBhbnkgcmVhc29uLCBwbGVhc2UgdGFrZSB0aGUgdGltZSB0byBkZWNsaW5lIG9uIHRoZSBFdmVudCBEYXNoYm9hcmQuIDxCUj48QlI+VGltZSB0byBnZXQgY29va2luJyA8QlI+PEJSPjwvRElWPjwvVEQ+PC9UUj48L1RCT0RZPjwvVEFCTEU+PC9URD48L1RSPjwvVEJPRFk+PC9UQUJMRT48L1REPjwvVFI+CjxUUj4KPFREPgo8VEFCTEU+CjxUQk9EWT4KPFRSPgo8VEQ+CjxUQUJMRT4KPFRCT0RZPgo8VFI+CjxURD4KPERJVj4mbmJzcDs8QSBocmVmPSJodHRwOi8vZXhhbXBsZS5jb20iPmZvbGxvdyBvbiBUd2l0dGVyPC9BPiB8IDxBIGhyZWY9Imh0dHA6Ly9
4eHgiPmZyaWVuZCBvbiBGYWNlYm9vazwvQT4gfCA8QQogaHJlZj0iaHR0cDovL2V4YW1wbGUuY29tIj5Gb3J3YXJkIHRvIGEgRnJpZW5kPC9BPiZndDsmbmJzcDsgPC9ESVY+PC9URD48L1RSPgo8VFI+CjxURD4KPERJVj48RU0+Q29weXJpZ2g8L0VNPiA8L0RJVj48L1REPjwvVFI+PC9UQk9EWT48L1RBQkxFPjwvVEQ+PC9UUj48L1RCT0RZPjwvVEFCTEU+PC9URD48L1RSPjwvVEJPRFk+PC9UQUJMRT48QlI+PC9URD48L1RSPjwvVEJPRFk+PC9UQUJMRT48L0NFTlRFUj48SU1HIGFsdD0iIiBzcmM9Imh0dHA6Ly9leGFtcGxlLmNvbSI+IDwvRElWPjwvRElWPjwvQkxPQ0tRVU9URT48L3RkPjwvdHI+PC90YWJsZT4K
+
+--===============6853056845739363347==--
diff --git a/mails/reply-separated-by-hr.html b/mails/reply-separated-by-hr.html
new file mode 100644
index 0000000..0d6eabb
--- /dev/null
+++ b/mails/reply-separated-by-hr.html
@@ -0,0 +1,21 @@
+<html>
+  <body>
+    <div>
+      Hi
+      <div>
+	there
+      </div>
+      <div>
+	Bob
+	<hr>
+	<b>From: </b>bob@example.com<br>
+	<b>To: </b>xxx@comcast.net<br>
+	<b>Sent: </b>Friday, July 22, 2011 6:20:01 PM<br>
+	<b>Subject: </b>Hello<br><br>
+	<p>
+	  Hello
+	</p>
+      </div>
+    </div>
+  </body>
+</html>
diff --git a/mails/standard_replies/android.eml b/mails/standard_replies/android.eml
new file mode 100644
index 0000000..bf7cb33
--- /dev/null
+++ b/mails/standard_replies/android.eml
@@ -0,0 +1,24 @@
+Content-Type: multipart/alternative;
+ boundary="===============0934372227844987316=="
+MIME-Version: 1.0
+Date: Mon, 2 Apr 2012 18:22:10 +0400
+Message-Id: <CAEAsyCZ-sCHxZtoKyM3JmT5gSYpZd5GwY-cVNiV8H329zgJT4g@mail.gmail.com>
+Subject: Re: Test
+From: Sergey Obykhov <bob@example.com>
+To: "bob@xxx.mailgun.org" <bob@xxx.mailgun.org>
+
+--===============0934372227844987316==
+MIME-Version: 1.0
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: base64
+
+SGVsbG8KMDIuMDQuMjAxMiAxNDoyMCDQv9C+0LvRjNC30L7QstCw0YLQtdC70YwgImJvYkB4eHgubWFpbGd1bi5vcmciIDwKYm9iQHh4eC5tYWlsZ3VuLm9yZz4g0L3QsNC/0LjRgdCw0Ls6Cgo+IEhpCj4KCg==
+
+--===============0934372227844987316==
+MIME-Version: 1.0
+Content-Type: text/html; charset="utf-8"
+Content-Transfer-Encoding: base64
+
+PHA+SGVsbG88L3A+CjxkaXYgY2xhc3M9ImdtYWlsX3F1b3RlIj4wMi4wNC4yMDEyIDE0OjIwINC/0L7Qu9GM0LfQvtCy0LDRgtC10LvRjCAmcXVvdDs8YSBocmVmPSJtYWlsdG86Ym9iQHh4eC5tYWlsZ3VuLm9yZyI+Ym9iQHh4eC5tYWlsZ3VuLm9yZzwvYT4mcXVvdDsgJmx0OzxhIGhyZWY9Im1haWx0bzpib2JAeHh4Lm1haWxndW4ub3JnIj5ib2JAeHh4Lm1haWxndW4ub3JnPC9hPiZndDsg0L3QsNC/0LjRgdCw0Ls6PGJyIHR5cGU9ImF0dHJpYnV0aW9uIj4KPGJsb2NrcXVvdGUgY2xhc3M9ImdtYWlsX3F1b3RlIiBzdHlsZT0ibWFyZ2luOjAgMCAwIC44ZXg7Ym9yZGVyLWxlZnQ6MXB4ICNjY2Mgc29saWQ7cGFkZGluZy1sZWZ0OjFleCI+SGk8YnI+CjwvYmxvY2txdW90ZT48L2Rpdj4KCg==
+
+--===============0934372227844987316==--
diff --git a/mails/standard_replies/aol.eml b/mails/standard_replies/aol.eml
new file mode 100644
index 0000000..340d3c2
--- /dev/null
+++ b/mails/standard_replies/aol.eml
@@ -0,0 +1,65 @@
+Content-Type: multipart/alternative;
+ boundary="===============7429987408351918371=="
+MIME-Version: 1.0
+To: bob@example.com
+Subject: Re: Test
+From: Megan Odin <xxx@aol.com>
+Message-Id: <8CEDEEFBEF4733B-1E5C-73DF@webmail-d070.sysops.aol.com>
+Date: Mon, 2 Apr 2012 09:57:58 -0400 (EDT)
+
+--===============7429987408351918371==
+Content-Type: text/plain; charset="us-ascii"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 7bit
+
+Hello
+
+
+
+-----Original Message-----
+From: bob <bob@example.com>
+To: xxx <xxx@gmail.com>; xxx <xxx@hotmail.com>; xxx <xxx@yahoo.com>; xxx <xxx@aol.com>; xxx <xxx@comcast.net>; xxx <xxx@nyc.rr.com>
+Sent: Mon, Apr 2, 2012 5:49 pm
+Subject: Test
+
+
+Hi
+
+ 
+
+--===============7429987408351918371==
+Content-Type: text/html; charset="us-ascii"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 7bit
+
+<font color='black' size='2' face='arial'>Hello<br>
+
+<br>
+<br>
+
+<div style="font-family:arial,helvetica;font-size:10pt;color:black">-----Original Message-----<br>
+From: bob &lt;bob@example.com&gt;<br>
+To: xxx &lt;xxx@gmail.com&gt;; xxx &lt;xxx@hotmail.com&gt;; xxx &lt;xxx@yahoo.com&gt;; xxx &lt;xxx@aol.com&gt;; xxx &lt;xxx@comcast.net&gt;; xxx &lt;xxx@nyc.rr.com&gt;<br>
+Sent: Mon, Apr 2, 2012 5:49 pm<br>
+Subject: Test<br>
+
+<br>
+
+
+
+
+
+
+
+<div id="AOLMsgPart_0_4d68a632-fe65-4f6d-ace2-292ac1b91f1f" style="margin: 0px;font-family: Tahoma, Verdana, Arial, Sans-Serif;font-size: 12px;color: #000;background-color: #fff;">
+
+<pre style="font-size: 9pt;"><tt>Hi
+</tt></pre>
+</div>
+ <!-- end of AOLMsgPart_0_4d68a632-fe65-4f6d-ace2-292ac1b91f1f -->
+
+
+
+</div>
+</font>
+--===============7429987408351918371==--
diff --git a/mails/standard_replies/apple_mail.eml b/mails/standard_replies/apple_mail.eml
new file mode 100644
index 0000000..1adbc3b
--- /dev/null
+++ b/mails/standard_replies/apple_mail.eml
@@ -0,0 +1,15 @@
+Content-Type: text/plain; charset=iso-8859-1
+Mime-Version: 1.0 (Apple Message framework v1257)
+Subject: Re: Test
+From: xxx <xxx@gmail.com>
+Date: Tue, 3 Apr 2012 16:55:26 +0400
+Content-Transfer-Encoding: 7bit
+Message-Id: <9A1EA6A5-4FD3-4AD0-8DFD-2420E670DB53@gmail.com>
+To: bob <bob@example.com>
+X-Mailer: Apple Mail (2.1257)
+
+Hello
+
+On Apr 3, 2012, at 4:19 PM, bob wrote:
+
+> Hi
diff --git a/mails/standard_replies/apple_mail_2.eml b/mails/standard_replies/apple_mail_2.eml
new file mode 100644
index 0000000..a030311
--- /dev/null
+++ b/mails/standard_replies/apple_mail_2.eml
@@ -0,0 +1,19 @@
+Content-Type: text/plain;
+	charset=us-ascii
+Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\))
+Subject: Re: Hello there
+X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4
+From: Adam Renberg <adam@tictail.com>
+In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
+Date: Sat, 22 Aug 2015 19:22:20 +0200
+Content-Transfer-Encoding: 7bit
+X-Smtp-Server: smtp.gmail.com:adam@tictail.com
+Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com>
+References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
+To: Adam Renberg <tgwizard@gmail.com>
+
+Hello
+> On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote:
+>
+> Hi there!
+
diff --git a/mails/standard_replies/comcast.eml b/mails/standard_replies/comcast.eml
new file mode 100644
index 0000000..fe25d73
--- /dev/null
+++ b/mails/standard_replies/comcast.eml
@@ -0,0 +1,33 @@
+Content-Type: multipart/alternative;
+ boundary="===============3552566137977633461=="
+MIME-Version: 1.0
+Date: Mon, 2 Apr 2012 13:56:12 +0000 (UTC)
+From: xxx@comcast.net
+To: bob@xxx.mailgun.org
+Message-Id: <650787974.741595.1333374972389.JavaMail.root@sz0152a.westchester.pa.mail.comcast.net>
+Subject: Re: Test
+X-Mailer: Zimbra 6.0.13_GA_2944 (ZimbraWebClient - SAF3 (Linux)/6.0.13_GA_2944)
+
+--===============3552566137977633461==
+MIME-Version: 1.0
+Content-Type: text/plain; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+
+Hello 
+
+----- Original Message -----
+From: bob@xxx.mailgun.org 
+To: xxx@gmail.com, xxx@hotmail.com, xxx@yahoo.com, xxx@aol.com, xxx@comcast.net, lsloan6@nyc.rr.com 
+Sent: Monday, April 2, 2012 5:44:22 PM 
+Subject: Test 
+
+Hi 
+
+--===============3552566137977633461==
+MIME-Version: 1.0
+Content-Type: text/html; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+
+<html><head><style type='text/css'>p { margin: 0; }</style></head><body><div style='font-family: Arial; font-size: 12pt; color: #000000'>Hello<br><br><hr id="zwchr"><b>From: </b>bob@xxx.mailgun.org<br><b>To: </b>xxx@gmail.com, xxx@hotmail.com, xxx@yahoo.com, xxx@aol.com, xxx@comcast.net, lsloan6@nyc.rr.com<br><b>Sent: </b>Monday, April 2, 2012 5:44:22 PM<br><b>Subject: </b>Test<br><br>Hi<br></div></body></html>
+--===============3552566137977633461==--
+
diff --git a/mails/standard_replies/gmail.eml b/mails/standard_replies/gmail.eml
new file mode 100644
index 0000000..99ece5b
--- /dev/null
+++ b/mails/standard_replies/gmail.eml
@@ -0,0 +1,31 @@
+Content-Type: multipart/alternative;
+ boundary="===============3455449757443551301=="
+MIME-Version: 1.0
+Date: Mon, 2 Apr 2012 20:21:52 +0400
+Message-Id: <CAKsfaBW4hj0Gek6TwbR3erng4P1y0CZzJ0d=pXtCNnYnbe7PLg@mail.gmail.com>
+Subject: Re: Test
+From: Megan One <xxx@gmail.com>
+To: bob@example.com
+
+--===============3455449757443551301==
+MIME-Version: 1.0
+Content-Type: text/plain; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+
+Hello
+
+On Mon, Apr 2, 2012 at 6:26 PM, Megan One <xxx@gmail.com> wrote:
+
+> Hi
+
+--===============3455449757443551301==
+MIME-Version: 1.0
+Content-Type: text/html; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+
+Hello<br><br><div class="gmail_quote">On Mon, Apr 2, 2012 at 6:26 PM, Megan One <span dir="ltr">&lt;<a href="mailto:xxx@gmail.com">xxx@gmail.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+Hi
+
+</blockquote></div><br>
+
+--===============3455449757443551301==--
diff --git a/mails/standard_replies/hotmail.eml b/mails/standard_replies/hotmail.eml
new file mode 100644
index 0000000..636f218
--- /dev/null
+++ b/mails/standard_replies/hotmail.eml
@@ -0,0 +1,50 @@
+Content-Type: multipart/alternative;
+ boundary="===============5499446768842282638=="
+MIME-Version: 1.0
+Message-Id: <DUB102-W192C6E94759954C4885B92B14C0@phx.gbl>
+From: Alexey Q <xxx@hotmail.com>
+To: <bob@xxx.mailgun.org>
+Subject: RE: Test
+Date: Mon, 2 Apr 2012 21:47:37 +0800
+X-Originalarrivaltime: 02 Apr 2012 13:47:37.0935 (UTC)
+ FILETIME=[2A6C0DF0:01CD10D7]
+
+--===============5499446768842282638==
+MIME-Version: 1.0
+Content-Type: text/plain; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+
+
+Hello
+
+> Subject: Test
+> From: bob@xxx.mailgun.org
+> To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
+> Date: Mon, 2 Apr 2012 17:44:22 +0400
+> 
+> Hi
+ 		 	   		  
+--===============5499446768842282638==
+MIME-Version: 1.0
+Content-Type: text/html; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+
+<html>
+<head>
+<style><!--
+.hmmessage P
+{
+margin:0px;
+padding:0px
+}
+body.hmmessage
+{
+font-size: 10pt;
+font-family:Tahoma
+}
+--></style></head>
+<body class='hmmessage'><div dir='ltr'>
+Hello<br><br><div><div id="SkyDrivePlaceholder"></div>&gt; Subject: Test<br>&gt; From: bob@xxx.mailgun.org<br>&gt; To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com<br>&gt; Date: Mon, 2 Apr 2012 17:44:22 +0400<br>&gt; <br>&gt; Hi<br></div> 		 	   		  </div></body>
+
+</html>
+--===============5499446768842282638==--
diff --git a/mails/standard_replies/iphone.eml b/mails/standard_replies/iphone.eml
new file mode 100644
index 0000000..320f8ac
--- /dev/null
+++ b/mails/standard_replies/iphone.eml
@@ -0,0 +1,19 @@
+Subject: Re: Test
+From: xxx <xxx@gmail.com>
+Content-Type: text/plain;
+	charset=us-ascii
+X-Mailer: iPhone Mail (9B176)
+Message-Id: <06C90B12-13B9-4C5F-A9EF-4A809D94C078@gmail.com>
+Date: Tue, 3 Apr 2012 16:23:59 +0400
+To: bob <bob@example.com>
+Content-Transfer-Encoding: quoted-printable
+Mime-Version: 1.0 (1.0)
+
+Hello
+
+Sent from my iPhone
+
+On Apr 3, 2012, at 4:19 PM, bob <bob@example.com> wr=
+ote:
+
+> Hi
diff --git a/mails/standard_replies/iphone_reply_text b/mails/standard_replies/iphone_reply_text
new file mode 100644
index 0000000..460d6d7
--- /dev/null
+++ b/mails/standard_replies/iphone_reply_text
@@ -0,0 +1,3 @@
+Hello
+
+Sent from my iPhone
diff --git a/mails/standard_replies/outlook.eml b/mails/standard_replies/outlook.eml
new file mode 100644
index 0000000..674828f
--- /dev/null
+++ b/mails/standard_replies/outlook.eml
@@ -0,0 +1,85 @@
+Subject: Test
+From: me@example.com
+To: you@example.com
+MIME-Version: 1.0
+Content-Type: multipart/alternative; boundary=0016364c440b2e8b63049acd5370
+X-Mailgun-Tag: tag
+X-Mailgun-Mailing-List-Id: 1q
+
+--0016364c440b2e8b63049acd5370
+Content-Type: text/plain; charset=ISO-8859-1
+
+Hello
+
+From: xxx@xxx.mailgun.org [mailto:xxx@xxx.mailgun.org]
+Sent: March-09-12 4:22 PM
+To: Dan Le
+Subject: The manager has commented on your Loop
+
+Hi dan.le@example.com<mailto:dan.le@example.com>,
+
+The manager's comment:
+"Hello Allan! Did you ask for some MIME? "
+
+Loop details:
+
+xxx at Dan
+I'm not happy
+""
+
+Your Loop is here<http://dev.xxx.com/loop/view/4f50f20e160839c95a000bb3?_uid=4f3541a7ac63e655040008e3>.
+
+We will be in touch again with any further updates,
+
+xxx
+
+If you did not sign up to receive emails from us you can use the link below to unsubscribe. We apologize for any inconvenience.
+
+Unsubscribe<http://dev.xxx.com/user/unsubscribe/dan.le@example.com?verify=4a400554148256338956101abdf06406>
+
+--0016364c440b2e8b63049acd5370
+Content-Type: text/html; charset=ISO-8859-1
+
+<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii"><meta name=Generator content="Microsoft Word 14 (filtered medium)"><style><!--
+/* Font Definitions */
+@font-face
+	{font-family:Calibri;
+	panose-1:2 15 5 2 2 2 4 3 2 4;}
+@font-face
+	{font-family:Tahoma;
+	panose-1:2 11 6 4 3 5 4 4 2 4;}
+/* Style Definitions */
+p.MsoNormal, li.MsoNormal, div.MsoNormal
+	{margin:0cm;
+	margin-bottom:.0001pt;
+	font-size:12.0pt;
+	font-family:"Times New Roman","serif";}
+a:link, span.MsoHyperlink
+	{mso-style-priority:99;
+	color:blue;
+	text-decoration:underline;}
+a:visited, span.MsoHyperlinkFollowed
+	{mso-style-priority:99;
+	color:purple;
+	text-decoration:underline;}
+span.EmailStyle17
+	{mso-style-type:personal-reply;
+	font-family:"Calibri","sans-serif";
+	color:#1F497D;}
+.MsoChpDefault
+	{mso-style-type:export-only;
+	font-family:"Calibri","sans-serif";
+	mso-fareast-language:EN-US;}
+@page WordSection1
+	{size:612.0pt 792.0pt;
+	margin:72.0pt 72.0pt 72.0pt 72.0pt;}
+div.WordSection1
+	{page:WordSection1;}
+--></style><!--[if gte mso 9]><xml>
+<o:shapedefaults v:ext="edit" spidmax="1026" />
+</xml><![endif]--><!--[if gte mso 9]><xml>
+<o:shapelayout v:ext="edit">
+<o:idmap v:ext="edit" data="1" />
+</o:shapelayout></xml><![endif]--></head><body lang=EN-CA link=blue vlink=purple><div class=WordSection1><p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'>Allo! Follow up MIME!<o:p></o:p></span></p><p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'><o:p>&nbsp;</o:p></span></p><p class=MsoNormal><b><span lang=EN-US style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'>From:</span></b><span lang=EN-US style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'> xxx@xxx.mailgun.org [mailto:xxx@xxx.mailgun.org] <br><b>Sent:</b> March-09-12 4:22 PM<br><b>To:</b> Dan Le<br><b>Subject:</b> The manager has commented on your Loop<o:p></o:p></span></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>Hi <a href="mailto:dan.le@example.com">dan.le@example.com</a>,<br><br>The manager's comment:<br>&quot;Hello Allan! Did you ask for some MIME? &quot;<br><br>Loop details:<br><br>xxx at Dan<br>I'm not happy<br>&quot;&quot;<br><br>Your Loop is <a href="http://dev.xxx.com/loop/view/4f50f20e160839c95a000bb3?_uid=4f3541a7ac63e655040008e3">here</a>.<br><br>We will be in touch again with any further updates,<br><br>xxx<br><br>If you did not sign up to receive emails from us you can use the link below to unsubscribe. We apologize for any inconvenience.<br><br><a href="http://dev.xxx.com/user/unsubscribe/dan.le@example.com?verify=4a400554148256338956101abdf06406">Unsubscribe</a> <o:p></o:p></p></div></body></html>
+
+--0016364c440b2e8b63049acd5370--
\ No newline at end of file
diff --git a/mails/standard_replies/sparrow.eml b/mails/standard_replies/sparrow.eml
new file mode 100644
index 0000000..b2a510d
--- /dev/null
+++ b/mails/standard_replies/sparrow.eml
@@ -0,0 +1,61 @@
+Date: Tue, 3 Apr 2012 16:58:35 +0400
+From: xxx <xxx@gmail.com>
+To: bob <bob@example.com>
+Message-ID: <5BB86EF4B6E24E4C9DA4BBEF59DA9809@gmail.com>
+Subject: Re: Test
+X-Mailer: sparrow 1.5 (build 1043)
+MIME-Version: 1.0
+Content-Type: multipart/alternative; boundary="4f7af3fb_749abb43_300"
+
+--4f7af3fb_749abb43_300
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 7bit
+Content-Disposition: inline
+
+Hello 
+
+-- 
+xxx
+Sent with Sparrow (http://www.sparrowmailapp.com/?sig)
+
+
+On Tuesday, April 3, 2012 at 4:55 PM, xxx wrote:
+
+> Hello
+> 
+> On Apr 3, 2012, at 4:19 PM, bob wrote:
+> 
+> > Hi 
+
+
+--4f7af3fb_749abb43_300
+Content-Type: text/html; charset="utf-8"
+Content-Transfer-Encoding: quoted-printable
+Content-Disposition: inline
+
+
+                <div>
+                    <span style=3D=22font-size: 12px;=22>Hello</span>
+                </div>
+                <div><div><br></div><div>--&nbsp;</div><div>xx=
+x</div><div>Sent with <a href=3D=22http://www.sparrowmailapp.com/=3Fsig=22=
+>Sparrow</a></div><div><br></div></div>
+                =20
+                <p style=3D=22color: =23A0A0A8;=22>On Tuesday, April 3, 2=
+012 at 4:55 PM, xxx wrote:</p>
+                <blockquote type=3D=22cite=22 style=3D=22border-left-styl=
+e:solid;border-width:1px;margin-left:0px;padding-left:10px;=22>
+                    <span><div><div><div>Hello</div><div><br></div><div>O=
+n Apr 3, 2012, at 4:19 PM, bob wrote:</div><div><br></div><blo=
+ckquote type=3D=22cite=22><div>Hi</div></blockquote></div></div></span>
+                =20
+                =20
+                =20
+                =20
+                </blockquote>
+                =20
+                <div>
+                    <br>
+                </div>
+            
+--4f7af3fb_749abb43_300--
diff --git a/mails/standard_replies/sparrow_reply_text b/mails/standard_replies/sparrow_reply_text
new file mode 100644
index 0000000..0a8f078
--- /dev/null
+++ b/mails/standard_replies/sparrow_reply_text
@@ -0,0 +1,5 @@
+Hello 
+
+-- 
+xxx
+Sent with Sparrow (http://www.sparrowmailapp.com/?sig)
\ No newline at end of file
diff --git a/mails/standard_replies/thunderbird.eml b/mails/standard_replies/thunderbird.eml
new file mode 100644
index 0000000..e74e69d
--- /dev/null
+++ b/mails/standard_replies/thunderbird.eml
@@ -0,0 +1,15 @@
+MIME-Version: 1.0
+Message-Id: <4F79B73C.9030506@xxx.mailgun.org>
+Date: Mon, 02 Apr 2012 18:27:08 +0400
+From: bob <bob@xxx.mailgun.org>
+User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US;
+ rv:1.9.2.28) Gecko/20120313 Thunderbird/3.1.20
+To: Megan One <xxx@gmail.com>
+Subject: Re: Test
+Sender: bob@xxx.mailgun.org
+Content-Type: text/plain; charset="us-ascii"; format="flowed"
+Content-Transfer-Encoding: 7bit
+
+On 04/02/2012 06:26 PM, Megan One wrote:
+> Hi 
+Hello
\ No newline at end of file
diff --git a/mails/standard_replies/yahoo.eml b/mails/standard_replies/yahoo.eml
new file mode 100644
index 0000000..4969255
--- /dev/null
+++ b/mails/standard_replies/yahoo.eml
@@ -0,0 +1,22 @@
+Content-Type: text/plain; charset="us-ascii"
+MIME-Version: 1.0
+X-Mailer: YahooMailWebService/0.8.117.340979
+Message-Id: <1333374330.68772.YahooMailNeo@web114411.mail.gq1.yahoo.com>
+Date: Mon, 2 Apr 2012 06:45:30 -0700 (PDT)
+From: Alex Q <xxx@yahoo.com>
+Subject: Re: Test
+To: "bob@xxx.mailgun.org" <bob@xxx.mailgun.org>
+In-Reply-To: <1333374262.7063.15.camel@mg5>
+Content-Transfer-Encoding: 7bit
+
+Hello
+
+
+----- Original Message -----
+From: "bob@xxx.mailgun.org" <bob@xxx.mailgun.org>
+To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
+Cc: 
+Sent: Monday, April 2, 2012 5:44 PM
+Subject: Test
+
+Hi
diff --git a/test.py b/test.py
index c6a1c7d..703e349 100644
--- a/test.py
+++ b/test.py
@@ -1,9 +1,16 @@
 import json
 import os
+import re
 import unittest
+from unittest.mock import Mock, patch
 
+from extract_raw_content import constants, html, text, utils
 from mail_parser import serialize_mail
 
+STANDARD_REPLIES = "mails/standard_replies"
+RE_WHITESPACE = re.compile(r"\s")
+RE_DOUBLE_WHITESPACE = re.compile(r"\s{2,}")  # runs of two or more whitespace chars
+
 
 def get_email_as_bytes(name):
     with open(
@@ -42,6 +49,1222 @@ def test_html_only(self):
         self.assertTrue(manifest["text"]["content"])
         self.assertTrue(manifest["text"]["html_content"])
 
+    def test_get_delimiter(self):
+        self.assertEqual("\r\n", text.get_delimiter("abc\r\n123"))
+        self.assertEqual("\n", text.get_delimiter("abc\n123"))
+        self.assertEqual("\n", text.get_delimiter("abc"))
+
+    def test_html_to_text(self):
+        markup = """<body>
+<p>Hello world!</p>
+<br>
+<ul>
+<li>One!</li>
+<li>Two</li>
+</ul>
+<p>
+Haha
+</p>
+</body>"""
+        plain = utils.html_to_text(markup)
+        self.assertEqual("Hello world! \n\n  * One! \n  * Two \nHaha", plain)
+        self.assertEqual("привет!", utils.html_to_text("<b>привет!</b>"))
+        # Leading <br/> tags must not leave blank lines in the output.
+        markup = "<body><br/><br/>Hi</body>"
+        self.assertEqual("Hi", utils.html_to_text(markup))
+        # Contents of <style> elements are not part of the text.
+        markup = """Hi
+<style type="text/css">
+
+div, p, li {
+
+font: 13px 'Lucida Grande', Arial, sans-serif;
+
+}
+</style>
+
+<style type="text/css">
+
+h1 {
+
+font: 13px 'Lucida Grande', Arial, sans-serif;
+
+}
+</style>"""
+        self.assertEqual("Hi", utils.html_to_text(markup))
+        # HTML comments are dropped from the output.
+        markup = """<div>
+<!-- COMMENT 1 -->
+<span>TEXT 1</span>
+<p>TEXT 2 <!-- COMMENT 2 --></p>
+</div>"""
+        self.assertEqual("TEXT 1 \nTEXT 2", utils.html_to_text(markup))
+
+    def test_comment_no_parent(self):
+        s = "<!-- COMMENT 1 --> no comment"
+        d = html.html_document_fromstring(s)
+        self.assertEqual("no comment", utils.html_tree_to_text(d))
+
+    @patch.object(utils, "html_fromstring", Mock(return_value=None))
+    def test_bad_html_to_text(self):
+        # html_fromstring is mocked to return None; html_to_text must pass it on.
+        self.assertIsNone(utils.html_to_text("one<br>two<br>three"))
+
+    def test_quotation_splitter_inside_blockquote(self):
+        msg_body = """Reply
+<blockquote>
+
+<div>
+    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+</div>
+
+<div>
+    Test
+</div>
+
+</blockquote>"""
+
+        self.assertEqual(
+            "<html><head></head><body>Reply</body></html>",
+            RE_WHITESPACE.sub("", html.extract_from_html(msg_body)),
+        )
+
+    def test_quotation_splitter_outside_blockquote(self):
+        msg_body = """Reply
+
+<div>
+On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+</div>
+
+<blockquote>
+<div>
+    Test
+</div>
+</blockquote>
+"""
+        self.assertEqual(
+            "<html><head></head><body>Reply</body></html>",
+            RE_WHITESPACE.sub("", html.extract_from_html(msg_body)),
+        )
+
+    def test_regular_blockquote(self):
+        msg_body = """Reply
+<blockquote>Regular</blockquote>
+
+<div>
+On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+</div>
+
+<blockquote>
+<div>
+    <blockquote>Nested</blockquote>
+</div>
+</blockquote>
+"""
+        self.assertEqual(
+            "<html><head></head><body>Reply"
+            + "<blockquote>Regular</blockquote></body></html>",
+            RE_WHITESPACE.sub("", html.extract_from_html(msg_body)),
+        )
+
+    def test_no_blockquote(self):
+        msg_body = """
+<html>
+<body>
+Reply
+
+<div>
+On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+</div>
+
+<div>
+Test
+</div>
+</body>
+</html>
+"""
+
+        reply = """
+<html>
+<head></head>
+<body>
+Reply
+
+</body></html>"""
+        self.assertEqual(
+            RE_WHITESPACE.sub("", reply),
+            RE_WHITESPACE.sub("", html.extract_from_html(msg_body)),
+        )
+
+    def test_empty_body(self):
+        self.assertEqual("", html.extract_from_html(""))
+
+    def test_validate_output_html(self):
+        msg_body = """Reply
+<div>
+On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+
+    <blockquote>
+    <div>
+        Test
+    </div>
+    </blockquote>
+</div>
+
+<div/>
+"""
+        out = html.extract_from_html(msg_body)
+        self.assertTrue(
+            "<html>" in out and "</html>" in out,
+            "Invalid HTML - <html>/</html> tag not present",
+        )
+        self.assertTrue(
+            "<div/>" not in out, "Invalid HTML output - <div/> element is not valid"
+        )
+
+    def test_gmail_quote(self):
+        msg_body = """Reply
+<div class="gmail_quote">
+<div class="gmail_quote">
+    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+    <div>
+    Test
+    </div>
+</div>
+</div>"""
+        self.assertEqual(
+            "<html><head></head><body>Reply</body></html>",
+            RE_WHITESPACE.sub("", html.extract_from_html(msg_body)),
+        )
+
+    def test_gmail_quote_compact(self):
+        msg_body = (
+            "Reply"
+            '<div class="gmail_quote">'
+            '<div class="gmail_quote">'
+            + "On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:"
+            "<div>Test</div>"
+            "</div>"
+            "</div>"
+        )
+        self.assertEqual(
+            "<html><head></head><body>Reply</body></html>",
+            RE_WHITESPACE.sub("", html.extract_from_html(msg_body)),
+        )
+
+    def test_gmail_quote_blockquote(self):
+        msg_body = """Message
+<blockquote class="gmail_quote">
+<div class="gmail_default">
+    My name is William Shakespeare.
+    <br/>
+</div>
+</blockquote>"""
+        self.assertEqual(
+            RE_WHITESPACE.sub("", msg_body),
+            RE_WHITESPACE.sub("", html.extract_from_html(msg_body)),
+        )
+
+    def test_blockquote_disclaimer(self):
+        msg_body = """
+<html>
+<body>
+<div>
+    <div>
+    message
+    </div>
+    <blockquote>
+    Quote
+    </blockquote>
+</div>
+<div>
+    disclaimer
+</div>
+</body>
+</html>
+"""
+
+        stripped_html = """
+<html>
+<head></head>
+<body>
+<div>
+    <div>
+    message
+    </div>
+</div>
+<div>
+    disclaimer
+</div>
+</body>
+</html>
+"""
+        self.assertEqual(
+            RE_WHITESPACE.sub("", stripped_html),
+            RE_WHITESPACE.sub("", html.extract_from_html(msg_body)),
+        )
+
+    def test_date_block(self):
+        msg_body = """
+<div>
+message<br>
+<div>
+    <hr>
+    Date: Fri, 23 Mar 2012 12:35:31 -0600<br>
+    To: <a href="mailto:bob@example.com">bob@example.com</a><br>
+    From: <a href="mailto:rob@example.com">rob@example.com</a><br>
+    Subject: You Have New Mail From Mary!<br><br>
+
+    text
+</div>
+</div>
+"""
+        self.assertEqual(
+            "<html><head></head><body><div>message<br></div></body></html>",
+            RE_WHITESPACE.sub("", html.extract_from_html(msg_body)),
+        )
+
+    def test_from_block(self):
+        msg_body = """<div>
+message<br>
+<div>
+<hr>
+From: <a href="mailto:bob@example.com">bob@example.com</a><br>
+Date: Fri, 23 Mar 2012 12:35:31 -0600<br>
+To: <a href="mailto:rob@example.com">rob@example.com</a><br>
+Subject: You Have New Mail From Mary!<br><br>
+
+text
+</div></div>
+"""
+        self.assertEqual(
+            "<html><head></head><body><div>message<br></div></body></html>",
+            RE_WHITESPACE.sub("", html.extract_from_html(msg_body)),
+        )
+
+    def test_reply_shares_div_with_from_block(self):
+        msg_body = """
+<body>
+<div>
+
+    Blah<br><br>
+
+    <hr>Date: Tue, 22 May 2012 18:29:16 -0600<br>
+    To: xx@hotmail.ca<br>
+    From: quickemail@ashleymadison.com<br>
+    Subject: You Have New Mail From x!<br><br>
+
+</div>
+</body>"""
+        self.assertEqual(
+            "<html><head></head><body><div>Blah<br><br></div></body></html>",
+            RE_WHITESPACE.sub("", html.extract_from_html(msg_body)),
+        )
+
+    def test_reply_quotations_share_block(self):
+        reply = text.extract_from_plain(
+            get_email_as_bytes("reply-quotations-share-block.eml").decode("utf-8")
+        )
+        self.assertTrue(reply)  # the extraction must not come back empty
+        self.assertNotIn("From", reply)  # no "From" may remain in the reply
+
+    def test_OLK_SRC_BODY_SECTION_stripped(self):
+        self.assertEqual(
+            "<html><head></head><body><div>Reply</div></body></html>",
+            RE_WHITESPACE.sub(
+                "",
+                html.extract_from_html(get_email_as_bytes("OLK_SRC_BODY_SECTION.html")),
+            ),
+        )
+
+    def test_reply_separated_by_hr(self):
+        self.assertEqual(
+            "<html><head></head><body><div>Hi<div>there</div></div></body></html>",
+            RE_WHITESPACE.sub(
+                "",
+                html.extract_from_html(
+                    get_email_as_bytes("reply-separated-by-hr.html")
+                ),
+            ),
+        )
+
+    def test_from_block_and_quotations_in_separate_divs(self):
+        msg_body = """
+Reply
+<div>
+<hr/>
+<div>
+    <font>
+    <b>From: bob@example.com</b>
+    <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b>
+    </font>
+</div>
+<div>
+    Quoted message
+</div>
+</div>
+"""
+        self.assertEqual(
+            "<html><head></head><body>Reply<div><hr></div></body></html>",
+            RE_WHITESPACE.sub("", html.extract_from_html(msg_body)),
+        )
+
+    def extract_reply_and_check(self, filename):
+        """Assert that the reply extracted from HTML file *filename* equals
+        the expected reply text; all whitespace is ignored in the comparison."""
+        with open(filename, encoding="utf8") as f:
+            msg_body = f.read()
+
+        # Only reading needs the file handle; the checks run after it is closed.
+        reply = html.extract_from_html(msg_body)
+        plain_reply = utils.html_to_text(reply)
+        self.assertEqual(
+            RE_WHITESPACE.sub("", "Hi. I am fine.\n\nThanks,\nAlex"),
+            RE_WHITESPACE.sub("", plain_reply),
+        )
+
+    def test_CRLF(self):
+        """CR is not converted to '&#13;'"""
+        symbol = "&#13;"
+        extracted = html.extract_from_html("<html>\r\n</html>")
+        self.assertNotIn(symbol, extracted)
+        self.assertEqual("<html></html>", RE_WHITESPACE.sub("", extracted))
+
+        msg_body = """My
+reply
+<blockquote>
+
+<div>
+    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+</div>
+
+<div>
+    Test
+</div>
+
+</blockquote>"""
+        msg_body = msg_body.replace("\n", "\r\n")
+        extracted = html.extract_from_html(msg_body)
+        self.assertNotIn(symbol, extracted)
+        # Keep new lines otherwise "My reply" becomes one word - "Myreply"
+        self.assertEqual(
+            "<html><head></head><body>My\nreply\n</body></html>", extracted
+        )
+
+    def test_gmail_forwarded_msg(self):
+        msg_body = (
+            '<div dir="ltr"><br>'
+            + '<div class="gmail_quote">---------- Forwarded message ----------<br>'
+            + 'From: <b class="gmail_sendername">Bob</b> <span dir="ltr">'
+            + '&lt;<a href="mailto:bob@example.com">bob@example.com</a>&gt;'
+            + "</span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>"
+            + "Subject: Bob WFH today<br>To: Mary &lt;"
+            + '<a href="mailto:mary@example.com">'
+            + 'mary@example.com</a>&gt;<br><br><br><div dir="ltr">eom</div>'
+            + "</div><br></div>"
+        )
+        extracted = html.extract_from_html(msg_body)
+        self.assertEqual(
+            RE_WHITESPACE.sub("", msg_body), RE_WHITESPACE.sub("", extracted)
+        )
+
+    def test_readable_html_empty(self):
+        msg_body = """
+<blockquote>
+Reply
+<div>
+    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+</div>
+
+<div>
+    Test
+</div>
+
+</blockquote>"""
+
+        self.assertEqual(
+            RE_WHITESPACE.sub("", msg_body),
+            RE_WHITESPACE.sub("", html.extract_from_html(msg_body)),
+        )
+
+    @patch.object(html, "html_document_fromstring", Mock(return_value=None))
+    def test_bad_html(self):
+        bad_html = "<html></html>"
+        self.assertEqual(bad_html, html.extract_from_html(bad_html))
+
+    def test_gmail_reply(self):
+        self.extract_reply_and_check("mails/html_replies/gmail.html")
+
+    def test_mail_ru_reply(self):
+        self.extract_reply_and_check("mails/html_replies/mail_ru.html")
+
+    def test_hotmail_reply(self):
+        self.extract_reply_and_check("mails/html_replies/hotmail.html")
+
+    def test_ms_outlook_2003_reply(self):
+        self.extract_reply_and_check("mails/html_replies/ms_outlook_2003.html")
+
+    def test_ms_outlook_2007_reply(self):
+        self.extract_reply_and_check("mails/html_replies/ms_outlook_2007.html")
+
+    def test_ms_outlook_2010_reply(self):
+        self.extract_reply_and_check("mails/html_replies/ms_outlook_2010.html")
+
+    def test_thunderbird_reply(self):
+        self.extract_reply_and_check("mails/html_replies/thunderbird.html")
+
+    def test_windows_mail_reply(self):
+        self.extract_reply_and_check("mails/html_replies/windows_mail.html")
+
+    def test_yandex_ru_reply(self):
+        self.extract_reply_and_check("mails/html_replies/yandex_ru.html")
+
+    @patch.object(constants, "MAX_LINES_COUNT", 1)
+    def test_too_many_lines(self):
+        msg_body = """Test reply
+Hi
+-----Original Message-----
+
+Test"""
+        self.assertEqual("Test reply", text.extract_from_plain(msg_body))
+
+    def test_pattern_on_date_somebody_wrote(self):
+        msg_body = """Test reply
+
+On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
+
+>
+> Test
+>
+> Roman"""
+
+        self.assertEqual("Test reply", text.extract_from_plain(msg_body))
+
+    def test_pattern_on_date_polymail(self):
+        msg_body = """Test reply
+
+On Tue, Apr 11, 2017 at 10:07 PM John Smith
+
+<
+mailto:John Smith <johnsmith@gmail.com>
+> wrote:
+Test quoted data
+"""
+
+        self.assertEqual("Test reply", text.extract_from_plain(msg_body))
+
+    def test_pattern_sent_from_samsung_smb_wrote(self):
+        msg_body = """Test reply
+
+Sent from Samsung MobileName <address@example.com> wrote:
+
+>
+> Test
+>
+> Roman"""
+
+        self.assertEqual("Test reply", text.extract_from_plain(msg_body))
+
+    def test_pattern_on_date_wrote_somebody(self):
+        self.assertEqual(
+            "Lorem",
+            text.extract_from_plain(
+                """Lorem
+
+Op 13-02-2014 3:18 schreef Julius Caesar <pantheon@rome.com>:
+
+Veniam laborum mlkshk kale chips authentic.
+Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up
+freegan enim master cleanse.
+"""
+            ),
+        )
+
+    def test_pattern_on_date_somebody_wrote_date_with_slashes(self):
+        msg_body = """Test reply
+
+On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
+
+>
+> Test.
+>
+> Roman"""
+        self.assertEqual("Test reply", text.extract_from_plain(msg_body))
+
+    def test_date_time_email_splitter(self):
+        msg_body = """Test reply
+
+2014-10-17 11:28 GMT+03:00 Postmaster <
+postmaster@sandboxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.mailgun.org>:
+
+> First from site
+>
+    """
+        self.assertEqual("Test reply", text.extract_from_plain(msg_body))
+
+    def test_pattern_on_date_somebody_wrote_allows_space_in_front(self):
+        msg_body = """Thanks Thanmai
+On Mar 8, 2012 9:59 AM, "Example.com" <
+r+7f1b094ceb90e18cca93d53d3703feae@example.com> wrote:
+
+
+>**
+>  Blah-blah-blah"""
+        self.assertEqual("Thanks Thanmai", text.extract_from_plain(msg_body))
+
+    def test_pattern_on_date_somebody_sent(self):
+        msg_body = """Test reply
+
+On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> sent:
+
+>
+> Test
+>
+> Roman"""
+        self.assertEqual("Test reply", text.extract_from_plain(msg_body))
+
+    def test_appointment(self):
+        msg_body = """Response
+
+    10/19/2017 @ 9:30 am for physical therapy
+    Bla
+    1517 4th Avenue Ste 300
+    London CA 19129, 555-421-6780
+
+    John Doe, FCLS
+    Mailgun Inc
+    555-941-0697
+
+    From: from@example.com [mailto:from@example.com]
+    Sent: Wednesday, October 18, 2017 2:05 PM
+    To: John Doer - SIU <jd@example.com>
+    Subject: RE: Claim # 5551188-1
+
+    Text"""
+
+        expected = """Response
+
+    10/19/2017 @ 9:30 am for physical therapy
+    Bla
+    1517 4th Avenue Ste 300
+    London CA 19129, 555-421-6780
+
+    John Doe, FCLS
+    Mailgun Inc
+    555-941-0697"""
+        self.assertEqual(expected, text.extract_from_plain(msg_body))
+
+    def test_line_starts_with_on(self):
+        msg_body = """Blah-blah-blah
+On blah-blah-blah"""
+        self.assertEqual(msg_body, text.extract_from_plain(msg_body))
+
+    def test_reply_and_quotation_splitter_share_line(self):
+        # reply lines and 'On <date> <person> wrote:' splitter pattern
+        # are on the same line
+        msg_body = """reply On Wed, Apr 4, 2012 at 3:59 PM, bob@example.com wrote:
+> Hi"""
+        self.assertEqual("reply", text.extract_from_plain(msg_body))
+
+        # test pattern '--- On <date> <person> wrote:' with reply text on
+        # the same line
+        msg_body = """reply--- On Wed, Apr 4, 2012 at 3:59 PM, me@domain.com wrote:
+> Hi"""
+        self.assertEqual("reply", text.extract_from_plain(msg_body))
+
+        # test pattern '--- On <date> <person> wrote:' with reply text containing
+        # '-' symbol
+        msg_body = """reply
+bla-bla - bla--- On Wed, Apr 4, 2012 at 3:59 PM, me@domain.com wrote:
+> Hi"""
+        reply = """reply
+bla-bla - bla"""
+
+        self.assertEqual(reply, text.extract_from_plain(msg_body))
+
+    def test_android_wrote(self):
+        msg_body = """Test reply
+
+---- John Smith wrote ----
+
+> quoted
+> text
+"""
+        self.assertEqual("Test reply", text.extract_from_plain(msg_body))
+
+    def test_reply_wraps_quotations(self):
+        msg_body = """Test reply
+
+On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
+
+>
+> Test
+
+Regards, Roman"""
+
+        reply = """Test reply
+
+Regards, Roman"""
+
+        self.assertEqual(reply, text.extract_from_plain(msg_body))
+
+    def test_reply_wraps_nested_quotations(self):
+        msg_body = """Test reply
+On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
+
+>Test test
+>On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
+>
+>>
+>> Test.
+>>
+>> Roman
+
+Regards, Roman"""
+
+        reply = """Test reply
+Regards, Roman"""
+        self.assertEqual(reply, text.extract_from_plain(msg_body))
+
+    def test_quotation_separator_takes_2_lines(self):
+        msg_body = """Test reply
+
+On Fri, May 6, 2011 at 6:03 PM, Roman Tkachenko from Hacker News
+<roman@definebox.com> wrote:
+
+> Test.
+>
+> Roman
+
+Regards, Roman"""
+
+        reply = """Test reply
+
+Regards, Roman"""
+        self.assertEqual(reply, text.extract_from_plain(msg_body))
+
+    def test_quotation_separator_takes_3_lines(self):
+        msg_body = """Test reply
+
+On Nov 30, 2011, at 12:47 PM, Somebody <
+416ffd3258d4d2fa4c85cfa4c44e1721d66e3e8f4@somebody.domain.com>
+wrote:
+
+Test message
+"""
+        self.assertEqual("Test reply", text.extract_from_plain(msg_body))
+
+    def test_short_quotation(self):
+        msg_body = """Hi
+
+On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
+
+> Hello"""
+        self.assertEqual("Hi", text.extract_from_plain(msg_body))
+
+    def test_with_indent(self):
+        msg_body = """
+YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.
+
+------On 12/29/1987 17:32 PM, Julius Caesar wrote-----
+
+Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur.
+    """
+        self.assertEqual(
+            "YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.",
+            text.extract_from_plain(msg_body),
+        )
+
+    def test_short_quotation_with_newline(self):
+        msg_body = """Btw blah blah...
+
+On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" <christine.XXX@XXX.com> wrote:
+
+Hi Mark,
+Blah blah? 
+Thanks,Christine 
+
+On Jan 27, 2015, at 11:55 AM, Mark XXX <mark@XXX.com> wrote:
+
+Lorem ipsum?
+Mark
+
+Sent from Acompli"""
+        self.assertEqual("Btw blah blah...", text.extract_from_plain(msg_body))
+
+    def test_pattern_date_email_with_unicode(self):
+        msg_body = """Replying ok
+2011/4/7 Nathan \xd0\xb8ova <support@example.com>
+
+>  Cool beans, scro"""
+        self.assertEqual("Replying ok", text.extract_from_plain(msg_body))
+
+    def test_english_from_block(self):
+        self.assertEqual(
+            "Allo! Follow up MIME!",
+            text.extract_from_plain(
+                """Allo! Follow up MIME!
+
+From: somebody@example.com
+Sent: March-19-11 5:42 PM
+To: Somebody
+Subject: The manager has commented on your Loop
+
+Blah-blah-blah
+"""
+            ),
+        )
+
+    def test_german_from_block(self):
+        self.assertEqual(
+            "Allo! Follow up MIME!",
+            text.extract_from_plain(
+                """Allo! Follow up MIME!
+
+Von: somebody@example.com
+Gesendet: Dienstag, 25. November 2014 14:59
+An: Somebody
+Betreff: The manager has commented on your Loop
+
+Blah-blah-blah
+"""
+            ),
+        )
+
+    def test_french_multiline_from_block(self):
+        self.assertEqual(
+            "Lorem ipsum",
+            text.extract_from_plain(
+                """Lorem ipsum
+
+De : Brendan xxx [mailto:brendan.xxx@xxx.com]
+Envoyé : vendredi 23 janvier 2015 16:39
+À : Camille XXX
+Objet : Follow Up
+
+Blah-blah-blah
+"""
+            ),
+        )
+
+    def test_french_from_block(self):
+        self.assertEqual(
+            "Lorem ipsum",
+            text.extract_from_plain(
+                """Lorem ipsum
+
+    Le 23 janv. 2015 à 22:03, Brendan xxx
+    <brendan.xxx@xxx.com<mailto:brendan.xxx@xxx.com>> a écrit:
+
+    Bonjour!"""
+            ),
+        )
+
+    def test_polish_from_block(self):
+        self.assertEqual(
+            "Lorem ipsum",
+            text.extract_from_plain(
+                """Lorem ipsum
+
+W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx <zoe.xxx@xxx.com>
+napisał:
+
+Blah!
+"""
+            ),
+        )
+
+    def test_danish_from_block(self):
+        self.assertEqual(
+            "Allo! Follow up MIME!",
+            text.extract_from_plain(
+                """Allo! Follow up MIME!
+
+Fra: somebody@example.com
+Sendt: 19. march 2011 12:10
+Til: Somebody
+Emne: The manager has commented on your Loop
+
+Blah-blah-blah
+"""
+            ),
+        )
+
+    def test_swedish_from_block(self):
+        self.assertEqual(
+            "Allo! Follow up MIME!",
+            text.extract_from_plain(
+                """Allo! Follow up MIME!
+Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
+Skickat: den 26 augusti 2015 14:45
+Till: Isacson Leiff
+Ämne: RE: Week 36
+
+Blah-blah-blah
+"""
+            ),
+        )
+
+    def test_swedish_from_line(self):
+        self.assertEqual(
+            "Lorem",
+            text.extract_from_plain(
+                """Lorem
+Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
+
+Veniam laborum mlkshk kale chips authentic.
+Normcore mumblecore laboris, fanny pack
+readymade eu blog chia pop-up freegan enim master cleanse.
+"""
+            ),
+        )
+
+    def test_norwegian_from_line(self):
+        self.assertEqual(
+            "Lorem",
+            text.extract_from_plain(
+                """Lorem
+På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
+
+Veniam laborum mlkshk kale chips authentic.
+Normcore mumblecore laboris, fanny pack
+readymade eu blog chia pop-up freegan enim master cleanse.
+"""
+            ),
+        )
+
+    def test_dutch_from_block(self):
+        self.assertEqual(
+            "Gluten-free culpa lo-fi et nesciunt nostrud.",
+            text.extract_from_plain(
+                """Gluten-free culpa lo-fi et nesciunt nostrud.
+
+Op 17-feb.-2015, om 13:18 heeft Julius Caesar
+<pantheon@rome.com> het volgende geschreven:
+
+Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
+"""
+            ),
+        )
+
+    def test_vietnamese_from_block(self):
+        self.assertEqual(
+            "Hello",
+            text.extract_from_plain(
+                """Hello
+
+Vào 14:24 8 tháng 6, 2017, Hùng Nguyễn <hungnguyen@xxx.com> đã viết:
+
+> Xin chào
+"""
+            ),
+        )
+
+    def test_quotation_marker_false_positive(self):
+        msg_body = """Visit us now for assistance...
+>>> >>>  http://www.domain.com <<<
+Visit our site by clicking the link above"""
+        self.assertEqual(msg_body, text.extract_from_plain(msg_body))
+
+    def test_link_closed_with_quotation_marker_on_new_line(self):
+        msg_body = """8.45am-1pm
+
+From: somebody@example.com
+Date: Wed, 16 May 2012 00:15:02 -0600
+
+<http://email.example.com/c/dHJhY2tpbmdfY29kZT1mMDdjYzBmNzM1ZjYzMGIxNT
+>  <bob@example.com <mailto:bob@example.com> >
+
+Requester: """
+        self.assertEqual("8.45am-1pm", text.extract_from_plain(msg_body))
+
+    def test_link_breaks_quotation_markers_sequence(self):
+        # link starts and ends on the same line
+        msg_body = """Blah
+
+On Thursday, October 25, 2012 at 3:03 PM, life is short. on Bob wrote:
+
+>
+> Post a response by replying to this email
+>
+(http://example.com/c/YzOTYzMmE) >
+> life is short. (http://example.com/c/YzMmE)
+>
+"""
+        self.assertEqual("Blah", text.extract_from_plain(msg_body))
+
+        # link starts after some text on one line and ends on another
+        msg_body = """Blah
+
+On Monday, 24 September, 2012 at 3:46 PM, bob wrote:
+
+> [Ticket #50] test from bob
+>
+> View ticket (http://example.com/action
+_nonce=3dd518)
+>
+"""
+        self.assertEqual("Blah", text.extract_from_plain(msg_body))
+
+    def test_from_block_starts_with_date(self):
+        msg_body = """Blah
+
+Date: Wed, 16 May 2012 00:15:02 -0600
+To: klizhentas@example.com
+
+"""
+        self.assertEqual("Blah", text.extract_from_plain(msg_body))
+
+    def test_bold_from_block(self):
+        msg_body = """Hi
+
+*From:* bob@example.com [mailto:
+bob@example.com]
+*Sent:* Wednesday, June 27, 2012 3:05 PM
+*To:* travis@example.com
+*Subject:* Hello
+
+"""
+        self.assertEqual("Hi", text.extract_from_plain(msg_body))
+
+    def test_weird_date_format_in_date_block(self):
+        msg_body = """Blah
+Date: Fri=2C 28 Sep 2012 10:55:48 +0000
+From: tickets@example.com
+To: bob@example.com
+Subject: [Ticket #8] Test
+
+"""
+        self.assertEqual("Blah", text.extract_from_plain(msg_body))
+
+    def test_dont_parse_quotations_for_forwarded_messages(self):
+        msg_body = """FYI
+
+---------- Forwarded message ----------
+From: bob@example.com
+Date: Tue, Sep 4, 2012 at 1:35 PM
+Subject: Two
+line subject
+To: rob@example.com
+
+Text"""
+        self.assertEqual(msg_body, text.extract_from_plain(msg_body))
+
+    def test_forwarded_message_in_quotations(self):
+        msg_body = """Blah
+
+-----Original Message-----
+
+FYI
+
+---------- Forwarded message ----------
+From: bob@example.com
+Date: Tue, Sep 4, 2012 at 1:35 PM
+Subject: Two
+line subject
+To: rob@example.com
+
+"""
+        self.assertEqual("Blah", text.extract_from_plain(msg_body))
+
+    def test_mark_message_lines(self):
+        # e - empty line
+        # s - splitter line
+        # m - line starting with quotation marker '>'
+        # t - the rest
+
+        lines = [
+            "Hello",
+            "",
+            # next line should be marked as splitter
+            "_____________",
+            "From: foo@bar.com",
+            "Date: Wed, 16 May 2012 00:15:02 -0600",
+            "",
+            "> Hi",
+            "",
+            "Signature",
+        ]
+        self.assertEqual("tesssemet", text.mark_message_lines(lines))
+
+        lines = [
+            "Just testing the email reply",
+            "",
+            "Robert J Samson",
+            "Sent from my iPhone",
+            "",
+            # all 3 next lines should be marked as splitters
+            "On Nov 30, 2011, at 12:47 PM, Skapture <",
+            (
+                "416ffd3258d4d2fa4c85cfa4c44e1721d66e3e8f4"
+                "@skapture-staging.mailgun.org>"
+            ),
+            "wrote:",
+            "",
+            "Tarmo Lehtpuu has posted the following message on",
+        ]
+        self.assertEqual("tettessset", text.mark_message_lines(lines))
+
+    def test_process_marked_lines(self):
+        """Check process_marked_lines() against hand-written marker strings."""
+        # quotations and last message lines are mixed
+        # consider all to be a last message
+        markers = "tsemmtetm"
+        lines = [str(i) for i in range(len(markers))]
+
+        self.assertEqual(lines, text.process_marked_lines(lines, markers))
+
+        # no splitter => no markers
+        markers = "tmm"
+        lines = ["1", "2", "3"]
+        self.assertEqual(["1", "2", "3"], text.process_marked_lines(lines, markers))
+
+        # text after splitter without markers is quotation
+        markers = "tst"
+        lines = ["1", "2", "3"]
+        self.assertEqual(["1"], text.process_marked_lines(lines, markers))
+
+        # message + quotation + signature
+        markers = "tsmt"
+        lines = ["1", "2", "3", "4"]
+        self.assertEqual(["1", "4"], text.process_marked_lines(lines, markers))
+
+        # message + <quotation without markers> + nested quotation
+        markers = "tstsmt"
+        lines = ["1", "2", "3", "4", "5", "6"]
+        self.assertEqual(["1"], text.process_marked_lines(lines, markers))
+
+        # links wrapped in parentheses: link starts on the marker line
+        markers = "tsmttem"
+        lines = [
+            "text",
+            "splitter",
+            ">View (http://example.com",
+            "/abc",
+            ")",
+            "",
+            "> quote",
+        ]
+        self.assertEqual(lines[:1], text.process_marked_lines(lines, markers))
+
+        # link starts on the new line
+        markers = "tmmmtm"
+        lines = [
+            "text",
+            ">",
+            ">",
+            ">",
+            "(http://example.com) >  ",
+            "> life is short. (http://example.com)  ",
+        ]
+        self.assertEqual(lines[:1], text.process_marked_lines(lines, markers))
+
+        # check all "inline" replies
+        markers = "tsmtmtm"
+        lines = [
+            "text",
+            "splitter",
+            ">",
+            "(http://example.com)",
+            ">",
+            "inline  reply",
+            ">",
+        ]
+        self.assertEqual(lines, text.process_marked_lines(lines, markers))
+
+        # inline reply with link not wrapped in parentheses
+        markers = "tsmtm"
+        lines = [
+            "text",
+            "splitter",
+            ">",
+            "inline reply with link http://example.com",
+            ">",
+        ]
+        self.assertEqual(lines, text.process_marked_lines(lines, markers))
+
+        # inline reply with link wrapped in parentheses
+        markers = "tsmtm"
+        lines = ["text", "splitter", ">", "inline  reply (http://example.com)", ">"]
+        self.assertEqual(lines, text.process_marked_lines(lines, markers))
+
+    def test_preprocess(self):
+        msg = (
+            "Hello\n"
+            "See <http://google.com\n"
+            "> for more\n"
+            "information On Nov 30, 2011, at 12:47 PM, Somebody <\n"
+            "416ffd3258d4d2fa4c85cfa4c44e1721d66e3e8f4\n"
+            "@example.com>"
+            "wrote:\n"
+            "\n"
+            "> Hi"
+        )
+
+        # test the link is rewritten
+        # 'On <date> <person> wrote:' pattern starts from a new line
+        prepared_msg = (
+            "Hello\n"
+            "See @@http://google.com\n"
+            "@@ for more\n"
+            "information\n"
+            " On Nov 30, 2011, at 12:47 PM, Somebody <\n"
+            "416ffd3258d4d2fa4c85cfa4c44e1721d66e3e8f4\n"
+            "@example.com>"
+            "wrote:\n"
+            "\n"
+            "> Hi"
+        )
+        self.assertEqual(prepared_msg, utils.preprocess(msg, "\n"))
+
+        msg = """
+> <http://teemcl.mailgun.org/u/**aD1mZmZiNGU5ODQwMDNkZWZlMTExNm**
+
+> MxNjQ4Y2RmOTNlMCZyPXNlcmdleS5v**YnlraG92JTQwbWFpbGd1bmhxLmNvbS**
+
+> Z0PSUyQSZkPWUwY2U<http://example.org/u/aD1mZmZiNGU5ODQwMDNkZWZlMTExNmMxNjQ4Y>
+        """
+        self.assertEqual(msg, utils.preprocess(msg, "\n"))
+
+        # 'On <date> <person> wrote' shouldn't be spread across too many lines
+        msg = (
+            "Hello\n"
+            "How are you? On Nov 30, 2011, at 12:47 PM,\n "
+            "Example <\n"
+            "416ffd3258d4d2fa4c85cfa4c44e1721d66e3e8f4\n"
+            "@example.org>"
+            "wrote:\n"
+            "\n"
+            "> Hi"
+        )
+        self.assertEqual(msg, utils.preprocess(msg, "\n"))
+
+        msg = "Hello On Nov 30, smb wrote:\n" "Hi\n" "On Nov 29, smb wrote:\n" "hi"
+
+        prepared_msg = (
+            "Hello\n" " On Nov 30, smb wrote:\n" "Hi\n" "On Nov 29, smb wrote:\n" "hi"
+        )
+
+        self.assertEqual(prepared_msg, utils.preprocess(msg, "\n"))
+
+    def test_preprocess_postprocess_2_links(self):
+        msg_body = "<http://link1> <http://link2>"
+        self.assertEqual(msg_body, text.extract_from_plain(msg_body))
+
+    def test_feedback_below_left_unparsed(self):
+        msg_body = """Please enter your feedback below. Thank you.
+
+-------------------------------------
+Enter Feedback Below
+-------------------------------------
+
+The user experience was unparallelled. Please continue production.
+I'm sending payment to ensure
+that this line is intact."""
+
+        parsed = text.extract_from_plain(msg_body)
+        self.assertEqual(msg_body, parsed)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)

From d334265ed370109125180413df9893db63df5b6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Orze=C5=82?= <kuba.orzel@hotmail.com>
Date: Tue, 23 Apr 2024 15:15:59 +0200
Subject: [PATCH 6/6] =?UTF-8?q?Usuni=C4=99cie=20pustych=20linii?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 extract_raw_content/constants.py       | 1 -
 extract_raw_content/html_quotations.py | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/extract_raw_content/constants.py b/extract_raw_content/constants.py
index 3e07a94..e5da66b 100644
--- a/extract_raw_content/constants.py
+++ b/extract_raw_content/constants.py
@@ -1,6 +1,5 @@
 import re
 
-
 MAX_LINES_COUNT = 1000
 SPLITTER_MAX_LINES = 6
 _MAX_TAGS_COUNT = 419
diff --git a/extract_raw_content/html_quotations.py b/extract_raw_content/html_quotations.py
index b361661..f312eac 100644
--- a/extract_raw_content/html_quotations.py
+++ b/extract_raw_content/html_quotations.py
@@ -1,4 +1,5 @@
 import re
+
 from lxml.cssselect import CSSSelector
 
 CHECKPOINT_PREFIX = "#!%!"