From 12f5d25ce43a8123073280f06811308589a9eedc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Orze=C5=82?= Date: Fri, 12 Apr 2024 12:21:39 +0200 Subject: [PATCH 1/6] =?UTF-8?q?Zast=C4=85pienie=20biblioteki=20'talon',=20?= =?UTF-8?q?jedynie=20niezb=C4=99dnymi=20funkcjami=20(z=20w/w=20biblioteki)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 + extract_raw_content/__init__.py | 0 extract_raw_content/constants.py | 154 ++++++++++++++ extract_raw_content/html.py | 280 +++++++++++++++++++++++++ extract_raw_content/html_quotations.py | 224 ++++++++++++++++++++ extract_raw_content/text.py | 131 ++++++++++++ extract_raw_content/utils.py | 84 ++++++++ mail_parser.py | 13 +- requirements.txt | 37 +++- 9 files changed, 914 insertions(+), 11 deletions(-) create mode 100644 extract_raw_content/__init__.py create mode 100644 extract_raw_content/constants.py create mode 100644 extract_raw_content/html.py create mode 100644 extract_raw_content/html_quotations.py create mode 100644 extract_raw_content/text.py create mode 100644 extract_raw_content/utils.py diff --git a/.gitignore b/.gitignore index 4922fa4..7cd7ba0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ __pycache__/ .env .idea +.vscode/ +venv/ diff --git a/extract_raw_content/__init__.py b/extract_raw_content/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/extract_raw_content/constants.py b/extract_raw_content/constants.py new file mode 100644 index 0000000..f490ed7 --- /dev/null +++ b/extract_raw_content/constants.py @@ -0,0 +1,154 @@ +import regex as re + +MAX_LINES_COUNT = 1000 +SPLITTER_MAX_LINES = 6 +_MAX_TAGS_COUNT = 419 +_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] +_HARDBREAKS = ['br', 'hr', 'tr'] +_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") + +QUOT_PATTERN = re.compile('^>+ ?') +RE_PARENTHESIS_LINK = re.compile(r"\(https?://") +RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@') +RE_FWD = 
re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) +RE_DELIMITER = re.compile('\r?\n') +RE_LINK = re.compile('<(http://[^>]*)>') +RE_ON_DATE_SMB_WROTE = re.compile( + u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( + # Beginning of the line + u'|'.join(( + # English + 'On', + # French + 'Le', + # Polish + 'W dniu', + # Dutch + 'Op', + # German + 'Am', + # Norwegian + u'På', + # Swedish, Danish + 'Den', + # Vietnamese + u'Vào', + )), + # Date and sender separator + u'|'.join(( + # most languages separate date and sender address by comma + ',', + # Polish date and sender address separator + u'użytkownik' + )), + # Ending of the line + u'|'.join(( + # English + 'wrote', 'sent', + # French + u'a écrit', + # Polish + u'napisał', + # Dutch + 'schreef','verzond','geschreven', + # German + 'schrieb', + # Norwegian, Swedish + 'skrev', + # Vietnamese + u'đã viết', + )) + )) +RE_ORIGINAL_MESSAGE = re.compile(r'[\s]*[-]+[ ]*({})[ ]*[-]+'.format( + u'|'.join(( + # English + 'Original Message', 'Reply Message', + # German + u'Ursprüngliche Nachricht', 'Antwort Nachricht', + # Danish + 'Oprindelig meddelelse', + ))), re.I) +RE_ON_DATE_WROTE_SMB = re.compile( + u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format( + # Beginning of the line + u'|'.join(( + 'Op', + # German + 'Am' + )), + # Ending of the line + u'|'.join(( + # Dutch + 'schreef','verzond','geschreven', + # German + 'schrieb' + )) + ) + ) + +RE_FROM_COLON_OR_DATE_COLON = re.compile(r'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format( + u'|'.join(( + # "From" in different languages. + 'From', 'Van', 'De', 'Von', 'Fra', u'Från', + # "Date" in different languages. 
+ 'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', + ))), re.I) +RE_ANDROID_WROTE = re.compile(r'[\s]*[-]+.*({})[ ]*[-]+'.format( + u'|'.join(( + # English + 'wrote', + ))), re.I) +RE_POLYMAIL = re.compile(r'On.*\s{2}<\smailto:.*\s> wrote:', re.I) +RE_QUOTATION = re.compile( + r''' + ( + # quotation border: splitter line or a number of quotation marker lines + (?: + s + | + (?:me*){2,} + ) + + # quotation lines could be marked as splitter or text, etc. + .* + + # but we expect it to end with a quotation marker line + me* + ) + + # after quotations should be text only or nothing at all + [te]*$ + ''', re.VERBOSE) +RE_EMPTY_QUOTATION = re.compile( + r''' + ( + # quotation border: splitter line or a number of quotation marker lines + (?: + (?:se*)+ + | + (?:me*){2,} + ) + ) + e* + ''', re.VERBOSE) + +SPLITTER_PATTERNS = [ + RE_ORIGINAL_MESSAGE, + RE_ON_DATE_SMB_WROTE, + RE_ON_DATE_WROTE_SMB, + RE_FROM_COLON_OR_DATE_COLON, + # 02.04.2012 14:20 пользователь "bob@example.com" < + # bob@xxx.mailgun.org> написал: + re.compile(r"(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S), + # 2014-10-17 11:28 GMT+03:00 Bob < + # bob@example.com>: + re.compile(r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S), + # Thu, 26 Jun 2014 14:00:51 +0400 Bob : + re.compile(r'\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' + r'( \S+){3,6}@\S+:'), + # Sent from Samsung MobileName wrote: + re.compile('Sent from Samsung .*@.*> wrote'), + RE_ANDROID_WROTE, + RE_POLYMAIL + ] + diff --git a/extract_raw_content/html.py b/extract_raw_content/html.py new file mode 100644 index 0000000..10ccd93 --- /dev/null +++ b/extract_raw_content/html.py @@ -0,0 +1,280 @@ +import regex as re +import six +import html5lib +from lxml import html +from lxml.html import html5parser +from lxml.cssselect import CSSSelector +from copy import deepcopy + +from . import constants as const +from . import utils +from . 
import html_quotations + + +def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): + """Run regexes against message's marked lines to strip quotations. + + Return only last message lines. + >>> mark_message_lines(['Hello', 'From: foo@bar.com', '', '> Hi', 'tsem']) + ['Hello'] + + Also returns return_flags. + return_flags = [were_lines_deleted, first_deleted_line, + last_deleted_line] + """ + markers = ''.join(markers) + # if there are no splitter there should be no markers + if 's' not in markers and not re.search('(me*){3}', markers): + markers = markers.replace('m', 't') + if re.match('[te]*f', markers): + return_flags[:] = [False, -1, -1] + return lines + # inlined reply + # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm' + # both 't' entries should be found + for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers): + # long links could break sequence of quotation lines but they shouldn't + # be considered an inline reply + links = ( + const.RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1]) or + const.RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip())) + if not links: + return_flags[:] = [False, -1, -1] + return lines + + # cut out text lines coming after splitter if there are no markers there + quotation = re.search('(se*)+((t|f)+e*)+', markers) + if quotation: + return_flags[:] = [True, quotation.start(), len(lines)] + return lines[:quotation.start()] + + # handle the case with markers + quotation = (const.RE_QUOTATION.search(markers) or + const.RE_EMPTY_QUOTATION.search(markers)) + if quotation: + return_flags[:] = True, quotation.start(1), quotation.end(1) + return lines[:quotation.start(1)] + lines[quotation.end(1):] + + return_flags[:] = [False, -1, -1] + return lines + + +def mark_message_lines(lines): + """Mark message lines with markers to distinguish quotation lines. 
+ + Markers: + + * e - empty line + * m - line that starts with quotation marker '>' + * s - splitter line + * t - presumably lines from the last message in the conversation + + >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question']) + 'tsem' + """ + markers = ['e' for _ in lines] + i = 0 + while i < len(lines): + if not lines[i].strip(): + markers[i] = 'e' # empty line + elif const.QUOT_PATTERN.match(lines[i]): + markers[i] = 'm' # line with quotation marker + elif const.RE_FWD.match(lines[i]): + markers[i] = 'f' # ---- Forwarded message ---- + else: + # in case splitter is spread across several lines + splitter = utils.is_splitter('\n'.join(lines[i:i + const.SPLITTER_MAX_LINES])) + + if splitter: + # append as many splitter markers as lines in splitter + splitter_lines = splitter.group().splitlines() + for j in range(len(splitter_lines)): + markers[i + j] = 's' + + # skip splitter lines + i += len(splitter_lines) - 1 + else: + # probably the line from the last message in the conversation + markers[i] = 't' + i += 1 + return ''.join(markers) + + +def _html5lib_parser(): + """ + html5lib is a pure-python library that conforms to the WHATWG HTML spec + and is not vulnarable to certain attacks common for XML libraries + """ + return html5lib.HTMLParser( + # build lxml tree + html5lib.treebuilders.getTreeBuilder("lxml"), + # remove namespace value from inside lxml.html.html5paser element tag + # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div" + # instead of "div", throwing the algo off + namespaceHTMLElements=False + ) + + +def _rm_excessive_newlines(s): + """Remove excessive newlines that often happen due to tons of divs + """ + return const._RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip() + + +def _encode_utf8(s): + """Encode in 'utf-8' if unicode + """ + return s.encode('utf-8') if isinstance(s, six.text_type) else s + + +def html_too_big(s): + if isinstance(s, six.text_type): + s = s.encode('utf8') + return s.count(b'<') > 
const._MAX_TAGS_COUNT + + +def html_document_fromstring(s): + """Parse html tree from string. Return None if the string can't be parsed. + """ + if isinstance(s, six.text_type): + s = s.encode('utf8') + try: + if html_too_big(s): + return None + + return html5parser.document_fromstring(s, parser=_html5lib_parser()) + except Exception: + pass + + +def html_tree_to_text(tree): + for style in CSSSelector('style')(tree): + style.getparent().remove(style) + for c in tree.xpath('//comment()'): + parent = c.getparent() + # comment with no parent does not impact produced text + if parent is None: + continue + parent.remove(c) + text = "" + for el in tree.iter(): + el_text = (el.text or '') + (el.tail or '') + if len(el_text) > 1: + if el.tag in const._BLOCKTAGS: + text += "\n" + if el.tag == 'li': + text += " * " + text += el_text.strip() + " " + + # add href to the output + href = el.attrib.get('href') + if href: + text += "(%s) " % href + if el.tag in const._HARDBREAKS and text and not text.endswith("\n"): + text += "\n" + retval = _rm_excessive_newlines(text) + return _encode_utf8(retval) + + +def _readable_text_empty(html_tree): + return not bool(html_tree_to_text(html_tree).strip()) + + +def _extract_from_html(msg_body): + """ + Extract not quoted message from provided html message body + using tags and plain text algorithm. + + Cut out the 'blockquote', 'gmail_quote' tags. + Cut Microsoft quotations. + + Then use plain text algorithm to cut out splitter or + leftover quotation. + This works by adding checkpoint text to all html tags, + then converting html to text, + then extracting quotations from text, + then checking deleted checkpoints, + then deleting necessary tags. 
+ """ + if msg_body.strip() == b'': + return msg_body + + msg_body = msg_body.replace(b'\r\n', b'\n') + html_tree = html_document_fromstring(msg_body) + + if html_tree is None: + return msg_body + cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or + html_quotations.cut_zimbra_quote(html_tree) or + html_quotations.cut_blockquote(html_tree) or + html_quotations.cut_microsoft_quote(html_tree) or + html_quotations.cut_by_id(html_tree) or + html_quotations.cut_from_block(html_tree) + ) + html_tree_copy = deepcopy(html_tree) + + number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) + quotation_checkpoints = [False] * number_of_checkpoints + plain_text = html_tree_to_text(html_tree) + plain_text = utils.preprocess(plain_text, '\n', content_type='text/html') + lines = plain_text.splitlines() + # Don't process too long messages + if len(lines) > const.MAX_LINES_COUNT: + return msg_body + # Collect checkpoints on each line + line_checkpoints = [ + [int(i[4:-4]) # Only checkpoint number + for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)] + for line in lines] + # Remove checkpoints + lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line) + for line in lines] + + # Use plain text quotation extracting algorithm + markers = mark_message_lines(lines) + return_flags = [] + process_marked_lines(lines, markers, return_flags) + lines_were_deleted, first_deleted, last_deleted = return_flags + + if not lines_were_deleted and not cut_quotations: + return msg_body + if lines_were_deleted: + #collect checkpoints from deleted lines + for i in range(first_deleted, last_deleted): + for checkpoint in line_checkpoints[i]: + quotation_checkpoints[checkpoint] = True + # Remove tags with quotation checkpoints + html_quotations.delete_quotation_tags( + html_tree_copy, 0, quotation_checkpoints + ) + if _readable_text_empty(html_tree_copy): + return msg_body + return html.tostring(html_tree_copy) + +def extract_from_html(msg_body): + """ + Extract not 
quoted message from provided html message body + using tags and plain text algorithm. + + Cut out the 'blockquote', 'gmail_quote' tags. + Cut Microsoft quotations. + + Then use plain text algorithm to cut out splitter or + leftover quotation. + This works by adding checkpoint text to all html tags, + then converting html to text, + then extracting quotations from text, + then checking deleted checkpoints, + then deleting necessary tags. + + Returns a unicode string. + """ + if isinstance(msg_body, six.text_type): + msg_body = msg_body.encode('utf8') + elif not isinstance(msg_body, bytes): + msg_body = msg_body.encode('ascii') + + result = _extract_from_html(msg_body) + if isinstance(result, bytes): + result = result.decode('utf8') + return result diff --git a/extract_raw_content/html_quotations.py b/extract_raw_content/html_quotations.py new file mode 100644 index 0000000..08ce578 --- /dev/null +++ b/extract_raw_content/html_quotations.py @@ -0,0 +1,224 @@ +import regex as re +from lxml.cssselect import CSSSelector + + +CHECKPOINT_PREFIX = '#!%!' +CHECKPOINT_SUFFIX = '!%!#' +CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX) +# HTML quote indicators (tag ids) +QUOTE_IDS = ['OLK_SRC_BODY_SECTION'] +RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) + + +def cssselect(expr, tree): + return CSSSelector(expr)(tree) + + +def add_checkpoint(html_note, counter): + """Recursively adds checkpoints to html tree. 
+ """ + if html_note.text: + html_note.text = (html_note.text + CHECKPOINT_PREFIX + + str(counter) + CHECKPOINT_SUFFIX) + else: + html_note.text = (CHECKPOINT_PREFIX + str(counter) + + CHECKPOINT_SUFFIX) + counter += 1 + + for child in html_note.iterchildren(): + counter = add_checkpoint(child, counter) + + if html_note.tail: + html_note.tail = (html_note.tail + CHECKPOINT_PREFIX + + str(counter) + CHECKPOINT_SUFFIX) + else: + html_note.tail = (CHECKPOINT_PREFIX + str(counter) + + CHECKPOINT_SUFFIX) + counter += 1 + return counter + + +def delete_quotation_tags(html_note, counter, quotation_checkpoints): + """Deletes tags with quotation checkpoints from html tree. + """ + tag_in_quotation = True + + if quotation_checkpoints[counter]: + html_note.text = '' + else: + tag_in_quotation = False + counter += 1 + + quotation_children = [] # Children tags which are in quotation. + for child in html_note.iterchildren(): + counter, child_tag_in_quotation = delete_quotation_tags( + child, counter, + quotation_checkpoints + ) + if child_tag_in_quotation: + quotation_children.append(child) + + if quotation_checkpoints[counter]: + html_note.tail = '' + else: + tag_in_quotation = False + counter += 1 + + if tag_in_quotation: + return counter, tag_in_quotation + else: + # Remove quotation children. + for child in quotation_children: + html_note.remove(child) + return counter, tag_in_quotation + + +def cut_gmail_quote(html_message): + ''' Cuts the outermost block element with class gmail_quote. ''' + gmail_quote = cssselect('div.gmail_quote', html_message) + if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): + gmail_quote[0].getparent().remove(gmail_quote[0]) + return True + + +def cut_microsoft_quote(html_message): + ''' Cuts splitter block and all following blocks. 
''' + splitter = html_message.xpath( + #outlook 2007, 2010 (international) + "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" + "padding:3.0pt 0cm 0cm 0cm']|" + #outlook 2007, 2010 (american) + "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" + "padding:3.0pt 0in 0in 0in']|" + #outlook 2013 (international) + "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;" + "padding:3.0pt 0cm 0cm 0cm']|" + #outlook 2013 (american) + "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;" + "padding:3.0pt 0in 0in 0in']|" + #windows mail + "//div[@style='padding-top: 5px; " + "border-top-color: rgb(229, 229, 229); " + "border-top-width: 1px; border-top-style: solid;']" + ) + + if splitter: + splitter = splitter[0] + #outlook 2010 + if splitter == splitter.getparent().getchildren()[0]: + splitter = splitter.getparent() + else: + #outlook 2003 + splitter = html_message.xpath( + "//div" + "/div[@class='MsoNormal' and @align='center' " + "and @style='text-align:center']" + "/font" + "/span" + "/hr[@size='3' and @width='100%' and @align='center' " + "and @tabindex='-1']" + ) + if len(splitter): + splitter = splitter[0] + splitter = splitter.getparent().getparent() + splitter = splitter.getparent().getparent() + + if len(splitter): + parent = splitter.getparent() + after_splitter = splitter.getnext() + while after_splitter is not None: + parent.remove(after_splitter) + after_splitter = splitter.getnext() + parent.remove(splitter) + return True + + return False + + +def cut_by_id(html_message): + found = False + for quote_id in QUOTE_IDS: + quote = cssselect('#{}'.format(quote_id), html_message) + if quote: + found = True + quote[0].getparent().remove(quote[0]) + return found + + +def cut_blockquote(html_message): + ''' Cuts the last non-nested blockquote with wrapping elements.''' + quote = html_message.xpath( + '(.//blockquote)' + '[not(@class="gmail_quote") and not(ancestor::blockquote)]' + '[last()]') + + if quote: + quote = quote[0] + 
quote.getparent().remove(quote) + return True + + +def cut_from_block(html_message): + """Cuts div tag which wraps block starting with "From:".""" + # handle the case when From: block is enclosed in some tag + block = html_message.xpath( + ("//*[starts-with(mg:text_content(), 'From:')]|" + "//*[starts-with(mg:text_content(), 'Date:')]")) + + if block: + block = block[-1] + parent_div = None + while block.getparent() is not None: + if block.tag == 'div': + parent_div = block + break + block = block.getparent() + if parent_div is not None: + maybe_body = parent_div.getparent() + # In cases where removing this enclosing div will remove all + # content, we should assume the quote is not enclosed in a tag. + parent_div_is_all_content = ( + maybe_body is not None and maybe_body.tag == 'body' and + len(maybe_body.getchildren()) == 1) + + if not parent_div_is_all_content: + parent = block.getparent() + next_sibling = block.getnext() + + # remove all tags after found From block + # (From block and quoted message are in separate divs) + while next_sibling is not None: + parent.remove(block) + block = next_sibling + next_sibling = block.getnext() + + # remove the last sibling (or the + # From block if no siblings) + if block is not None: + parent.remove(block) + + return True + else: + return False + # handle the case when From: block goes right after e.g.
+ # and not enclosed in some tag + block = html_message.xpath( + ("//*[starts-with(mg:tail(), 'From:')]|" + "//*[starts-with(mg:tail(), 'Date:')]")) + if block: + block = block[0] + + if RE_FWD.match(block.getparent().text or ''): + return False + + while(block.getnext() is not None): + block.getparent().remove(block.getnext()) + block.getparent().remove(block) + return True + + +def cut_zimbra_quote(html_message): + zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]') + if zDivider: + zDivider[0].getparent().remove(zDivider[0]) + return True diff --git a/extract_raw_content/text.py b/extract_raw_content/text.py new file mode 100644 index 0000000..6023368 --- /dev/null +++ b/extract_raw_content/text.py @@ -0,0 +1,131 @@ +import regex as re + +from . import constants as const +from . import utils + + +def get_delimiter(msg_body): + delimiter = const.RE_DELIMITER.search(msg_body) + if delimiter: + delimiter = delimiter.group() + else: + delimiter = '\n' + return delimiter + + +def mark_message_lines(lines): + """Mark message lines with markers to distinguish quotation lines. 
+ + Markers: + + * e - empty line + * m - line that starts with quotation marker '>' + * s - splitter line + * t - presumably lines from the last message in the conversation + + >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question']) + 'tsem' + """ + markers = ['e' for _ in lines] + i = 0 + while i < len(lines): + if not lines[i].strip(): + markers[i] = 'e' # empty line + elif const.QUOT_PATTERN.match(lines[i]): + markers[i] = 'm' # line with quotation marker + elif const.RE_FWD.match(lines[i]): + markers[i] = 'f' # ---- Forwarded message ---- + else: + # in case splitter is spread across several lines + splitter = utils.is_splitter('\n'.join(lines[i:i + const.SPLITTER_MAX_LINES])) + + if splitter: + # append as many splitter markers as lines in splitter + splitter_lines = splitter.group().splitlines() + for j in range(len(splitter_lines)): + markers[i + j] = 's' + + # skip splitter lines + i += len(splitter_lines) - 1 + else: + # probably the line from the last message in the conversation + markers[i] = 't' + i += 1 + + return ''.join(markers) + + +def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): + """Run regexes against message's marked lines to strip quotations. + + Return only last message lines. + >>> mark_message_lines(['Hello', 'From: foo@bar.com', '', '> Hi', 'tsem']) + ['Hello'] + + Also returns return_flags. + return_flags = [were_lines_deleted, first_deleted_line, + last_deleted_line] + """ + markers = ''.join(markers) + # if there are no splitter there should be no markers + if 's' not in markers and not re.search('(me*){3}', markers): + markers = markers.replace('m', 't') + + if re.match('[te]*f', markers): + return_flags[:] = [False, -1, -1] + return lines + + # inlined reply + # use lookbehind assertions to find overlapping entries e.g. 
for 'mtmtm' + # both 't' entries should be found + for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers): + # long links could break sequence of quotation lines but they shouldn't + # be considered an inline reply + links = ( + const.RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1]) or + const.RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip())) + if not links: + return_flags[:] = [False, -1, -1] + return lines + + # cut out text lines coming after splitter if there are no markers there + quotation = re.search('(se*)+((t|f)+e*)+', markers) + if quotation: + return_flags[:] = [True, quotation.start(), len(lines)] + return lines[:quotation.start()] + + # handle the case with markers + quotation = (const.RE_QUOTATION.search(markers) or + const.RE_EMPTY_QUOTATION.search(markers)) + + if quotation: + return_flags[:] = True, quotation.start(1), quotation.end(1) + return lines[:quotation.start(1)] + lines[quotation.end(1):] + + return_flags[:] = [False, -1, -1] + return lines + + +def postprocess(msg_body): + """Make up for changes done at preprocessing message. + + Replace link brackets back to '<' and '>'. 
+ """ + return re.sub(const.RE_NORMALIZED_LINK, r'<\1>', msg_body).strip() + + +def extract_from_plain(msg_body): + """Extracts a non quoted message from provided plain text.""" + stripped_text = msg_body + + delimiter = get_delimiter(msg_body) + msg_body = utils.preprocess(msg_body, delimiter) + # don't process too long messages + lines = msg_body.splitlines()[:const.MAX_LINES_COUNT] + markers = mark_message_lines(lines) + lines = process_marked_lines(lines, markers) + + # concatenate lines, change links back, strip and return + msg_body = delimiter.join(lines) + msg_body = postprocess(msg_body) + return msg_body diff --git a/extract_raw_content/utils.py b/extract_raw_content/utils.py new file mode 100644 index 0000000..cabc18a --- /dev/null +++ b/extract_raw_content/utils.py @@ -0,0 +1,84 @@ +import regex as re +from lxml import etree + +from . import constants as const + + +def text_content(context): + """XPath Extension function to return a node text content.""" + return context.context_node.xpath("string()").strip() + + +def tail(context): + """XPath Extension function to return a node tail text.""" + return context.context_node.tail or '' + + +def register_xpath_extensions(): + ns = etree.FunctionNamespace("http://mailgun.net") + ns.prefix = 'mg' + ns['text_content'] = text_content + ns['tail'] = tail + + +def _replace_link_brackets(msg_body): + """ + Normalize links i.e. replace '<', '>' wrapping the link with some symbols + so that '>' closing the link couldn't be mistakenly taken for quotation + marker. 
+ + Converts msg_body into a unicode + """ + if isinstance(msg_body, bytes): + msg_body = msg_body.decode('utf8') + + def link_wrapper(link): + newline_index = msg_body[:link.start()].rfind("\n") + if msg_body[newline_index + 1] == ">": + return link.group() + else: + return "@@%s@@" % link.group(1) + msg_body = re.sub(const.RE_LINK, link_wrapper, msg_body) + return msg_body + + +def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'): + """ + Splits line in two if splitter pattern preceded by some text on the same + line (done only for 'On wrote:' pattern. + """ + def splitter_wrapper(splitter): + """Wraps splitter with new line""" + if splitter.start() and msg_body[splitter.start() - 1] != '\n': + return '%s%s' % (delimiter, splitter.group()) + else: + return splitter.group() + + if content_type == 'text/plain': + msg_body = re.sub(const.RE_ON_DATE_SMB_WROTE, splitter_wrapper, msg_body) + return msg_body + + +def preprocess(msg_body, delimiter, content_type='text/plain'): + """Prepares msg_body for being stripped. + + Replaces link brackets so that they couldn't be taken for quotation marker. + Splits line in two if splitter pattern preceded by some text on the same + line (done only for 'On wrote:' pattern). + + Converts msg_body into a unicode. + """ + msg_body = _replace_link_brackets(msg_body) + msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type) + return msg_body + + +def is_splitter(line): + ''' + Returns Matcher object if provided string is a splitter and + None otherwise. 
+ ''' + for pattern in const.SPLITTER_PATTERNS: + matcher = re.match(pattern, line) + if matcher: + return matcher diff --git a/mail_parser.py b/mail_parser.py index d1fbcd3..b63a97a 100644 --- a/mail_parser.py +++ b/mail_parser.py @@ -6,12 +6,13 @@ import re import uuid from io import BytesIO - import mailparser -import talon from html2text import html2text -talon.init() +from extract_raw_content.text import extract_from_plain +from extract_raw_content.html import extract_from_html +from extract_raw_content.utils import register_xpath_extensions + decoder_map = { "base64": base64.b64decode, @@ -26,6 +27,8 @@ EML_MIME = "message/rfc822" BINARY_MIME = "application/octet-stream" +register_xpath_extensions() + def get_text(mail): raw_content, html_content, plain_content, html_quote, plain_quote = ( @@ -38,13 +41,13 @@ def get_text(mail): if mail.text_html: raw_content = "".join(mail.text_html).replace("\r\n", "\n") - html_content = talon.quotations.extract_from_html(raw_content) + html_content = extract_from_html(raw_content) html_quote = raw_content.replace(html_content, "") plain_content = html2text(html_content) if mail.text_plain or not plain_content: raw_content = "".join(mail.text_plain) - plain_content = talon.quotations.extract_from_plain(raw_content) + plain_content = extract_from_plain(raw_content) plain_quote = raw_content.replace(plain_content, "") # 'content' item holds plain_content and 'quote' item holds plain_quote diff --git a/requirements.txt b/requirements.txt index 3b7f9a8..cf56c79 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,34 @@ -requests==2.31.0 -mail-parser==3.15.0 +asttokens==2.4.1 +certifi==2024.2.2 +charset-normalizer==3.3.2 +cssselect==1.2.0 +decorator==5.1.1 +exceptiongroup==1.2.0 +executing==2.0.1 html2text==2024.2.26 -# pypi talon to be replaced - last update 2017 -talon==1.4.4 -# forked talon - scikit version does not match training data -# git+https://github.com/PiotrIw/talon.git@master +html5lib==1.1 +idna==3.7 
ipdb==0.13.13 +ipython==8.23.0 +jedi==0.19.1 +lxml==5.2.1 +mail-parser==3.15.0 +matplotlib-inline==0.1.6 +parso==0.8.4 +pexpect==4.9.0 +prompt-toolkit==3.0.43 +ptyprocess==0.7.0 +pure-eval==0.2.2 +Pygments==2.17.2 +regex==2023.12.25 +requests==2.31.0 sentry-sdk==1.43.0 +simplejson==3.19.2 +six==1.16.0 +stack-data==0.6.3 +tomli==2.0.1 +traitlets==5.14.2 +typing_extensions==4.11.0 +urllib3==2.2.1 +wcwidth==0.2.13 +webencodings==0.5.1 From 0ea7e7d01ce02e5d359d5ddab60ebc3b36a959d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Orze=C5=82?= Date: Sat, 20 Apr 2024 19:26:39 +0200 Subject: [PATCH 2/6] Added lint-all changes --- extract_raw_content/constants.py | 244 +++++++++++++++---------- extract_raw_content/html.py | 137 +++++++------- extract_raw_content/html_quotations.py | 125 +++++++------ extract_raw_content/text.py | 55 +++--- extract_raw_content/utils.py | 34 ++-- mail_parser.py | 4 +- 6 files changed, 332 insertions(+), 267 deletions(-) diff --git a/extract_raw_content/constants.py b/extract_raw_content/constants.py index f490ed7..3e07a94 100644 --- a/extract_raw_content/constants.py +++ b/extract_raw_content/constants.py @@ -1,106 +1,152 @@ -import regex as re +import re + MAX_LINES_COUNT = 1000 SPLITTER_MAX_LINES = 6 _MAX_TAGS_COUNT = 419 -_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] -_HARDBREAKS = ['br', 'hr', 'tr'] +_BLOCKTAGS = ["div", "p", "ul", "li", "h1", "h2", "h3"] +_HARDBREAKS = ["br", "hr", "tr"] _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") -QUOT_PATTERN = re.compile('^>+ ?') -RE_PARENTHESIS_LINK = re.compile("\(https?://") -RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@') +QUOT_PATTERN = re.compile("^>+ ?") +RE_PARENTHESIS_LINK = re.compile(r"\(https?://") +RE_NORMALIZED_LINK = re.compile("@@(http://[^>@]*)@@") RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) -RE_DELIMITER = re.compile('\r?\n') -RE_LINK = re.compile('<(http://[^>]*)>') +RE_DELIMITER = re.compile("\r?\n") +RE_LINK = 
re.compile("<(http://[^>]*)>") RE_ON_DATE_SMB_WROTE = re.compile( - u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( + "(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)".format( # Beginning of the line - u'|'.join(( - # English - 'On', - # French - 'Le', - # Polish - 'W dniu', - # Dutch - 'Op', - # German - 'Am', - # Norwegian - u'På', - # Swedish, Danish - 'Den', - # Vietnamese - u'Vào', - )), + "|".join( + ( + # English + "On", + # French + "Le", + # Polish + "W dniu", + # Dutch + "Op", + # German + "Am", + # Norwegian + "På", + # Swedish, Danish + "Den", + # Vietnamese + "Vào", + ) + ), # Date and sender separator - u'|'.join(( - # most languages separate date and sender address by comma - ',', - # polish date and sender address separator - u'użytkownik' - )), + "|".join( + ( + # most languages separate date and sender address by comma + ",", + # polish date and sender address separator + "użytkownik", + ) + ), # Ending of the line - u'|'.join(( - # English - 'wrote', 'sent', - # French - u'a écrit', - # Polish - u'napisał', - # Dutch - 'schreef','verzond','geschreven', - # German - 'schrieb', - # Norwegian, Swedish - 'skrev', - # Vietnamese - u'đã viết', - )) - )) -RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format( - u'|'.join(( - # English - 'Original Message', 'Reply Message', - # German - u'Ursprüngliche Nachricht', 'Antwort Nachricht', - # Danish - 'Oprindelig meddelelse', - ))), re.I) + "|".join( + ( + # English + "wrote", + "sent", + # French + "a écrit", + # Polish + "napisał", + # Dutch + "schreef", + "verzond", + "geschreven", + # German + "schrieb", + # Norwegian, Swedish + "skrev", + # Vietnamese + "đã viết", + ) + ), + ) +) +RE_ORIGINAL_MESSAGE = re.compile( + r"[\s]*[-]+[ ]*({})[ ]*[-]+".format( + "|".join( + ( + # English + "Original Message", + "Reply Message", + # German + "Ursprüngliche Nachricht", + "Antwort Nachricht", + # Danish + "Oprindelig meddelelse", + ) + ) + ), + re.I, +) RE_ON_DATE_WROTE_SMB = 
re.compile( - u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format( + "(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)".format( # Beginning of the line - u'|'.join(( - 'Op', - #German - 'Am' - )), + "|".join( + ( + "Op", + # German + "Am", + ) + ), # Ending of the line - u'|'.join(( - # Dutch - 'schreef','verzond','geschreven', - # German - 'schrieb' - )) - ) + "|".join( + ( + # Dutch + "schreef", + "verzond", + "geschreven", + # German + "schrieb", + ) + ), ) +) -RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format( - u'|'.join(( - # "From" in different languages. - 'From', 'Van', 'De', 'Von', 'Fra', u'Från', - # "Date" in different languages. - 'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', - ))), re.I) -RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format( - u'|'.join(( - # English - 'wrote', - ))), re.I) -RE_POLYMAIL = re.compile('On.*\s{2}<\smailto:.*\s> wrote:', re.I) +RE_FROM_COLON_OR_DATE_COLON = re.compile( + "(_+\r?\n)?[\\s]*(:?[*]?{})[\\s]?:[*]?.*".format( + "|".join( + ( + # "From" in different languages. + "From", + "Van", + "De", + "Von", + "Fra", + "Från", + # "Date" in different languages. 
+ "Date", + "Datum", + "Envoyé", + "Skickat", + "Sendt", + ) + ) + ), + re.I, +) +RE_ANDROID_WROTE = re.compile( + r"[\s]*[-]+.*({})[ ]*[-]+".format( + "|".join( + ( + # English + "wrote", + ) + ) + ), + re.I, +) +RE_POLYMAIL = re.compile(r"On.*\s{2}<\smailto:.*\s> wrote:", re.I) RE_QUOTATION = re.compile( - r''' + r""" ( # quotation border: splitter line or a number of quotation marker lines (?: @@ -118,9 +164,11 @@ # after quotations should be text only or nothing at all [te]*$ - ''', re.VERBOSE) + """, + re.VERBOSE, +) RE_EMPTY_QUOTATION = re.compile( - r''' + r""" ( # quotation border: splitter line or a number of quotation marker lines (?: @@ -130,7 +178,9 @@ ) ) e* - ''', re.VERBOSE) + """, + re.VERBOSE, +) SPLITTER_PATTERNS = [ RE_ORIGINAL_MESSAGE, @@ -139,16 +189,16 @@ RE_FROM_COLON_OR_DATE_COLON, # 02.04.2012 14:20 пользователь "bob@example.com" < # bob@xxx.mailgun.org> написал: - re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S), + re.compile(r"(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S), # 2014-10-17 11:28 GMT+03:00 Bob < # bob@example.com>: - re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S), + re.compile(r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S), # Thu, 26 Jun 2014 14:00:51 +0400 Bob : - re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' - '( \S+){3,6}@\S+:'), + re.compile( + r"\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?" 
r"( \S+){3,6}@\S+:" + ), # Sent from Samsung MobileName wrote: - re.compile('Sent from Samsung .*@.*> wrote'), + re.compile("Sent from Samsung .*@.*> wrote"), RE_ANDROID_WROTE, - RE_POLYMAIL - ] - + RE_POLYMAIL, +] diff --git a/extract_raw_content/html.py b/extract_raw_content/html.py index 10ccd93..bc1f4b6 100644 --- a/extract_raw_content/html.py +++ b/extract_raw_content/html.py @@ -1,14 +1,13 @@ -import regex as re -import six +from copy import deepcopy + import html5lib +import re from lxml import html -from lxml.html import html5parser from lxml.cssselect import CSSSelector -from copy import deepcopy +from lxml.html import html5parser from . import constants as const -from . import utils -from . import html_quotations +from . import html_quotations, utils def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): @@ -22,38 +21,39 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): return_flags = [were_lines_deleted, first_deleted_line, last_deleted_line] """ - markers = ''.join(markers) + markers = "".join(markers) # if there are no splitter there should be no markers - if 's' not in markers and not re.search('(me*){3}', markers): - markers = markers.replace('m', 't') - if re.match('[te]*f', markers): + if "s" not in markers and not re.search("(me*){3}", markers): + markers = markers.replace("m", "t") + if re.match("[te]*f", markers): return_flags[:] = [False, -1, -1] return lines # inlined reply # use lookbehind assertions to find overlapping entries e.g. 
for 'mtmtm' # both 't' entries should be found - for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers): + for inline_reply in re.finditer("(?<=m)e*((?:t+e*)+)m", markers): # long links could break sequence of quotation lines but they shouldn't # be considered an inline reply - links = ( - const.RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1]) or - const.RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip())) + links = const.RE_PARENTHESIS_LINK.search( + lines[inline_reply.start() - 1] + ) or const.RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip()) if not links: return_flags[:] = [False, -1, -1] return lines # cut out text lines coming after splitter if there are no markers there - quotation = re.search('(se*)+((t|f)+e*)+', markers) + quotation = re.search("(se*)+((t|f)+e*)+", markers) if quotation: return_flags[:] = [True, quotation.start(), len(lines)] - return lines[:quotation.start()] + return lines[: quotation.start()] # handle the case with markers - quotation = (const.RE_QUOTATION.search(markers) or - const.RE_EMPTY_QUOTATION.search(markers)) + quotation = const.RE_QUOTATION.search(markers) or const.RE_EMPTY_QUOTATION.search( + markers + ) if quotation: return_flags[:] = True, quotation.start(1), quotation.end(1) - return lines[:quotation.start(1)] + lines[quotation.end(1):] + return lines[: quotation.start(1)] + lines[quotation.end(1) :] return_flags[:] = [False, -1, -1] return lines @@ -72,32 +72,34 @@ def mark_message_lines(lines): >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question']) 'tsem' """ - markers = ['e' for _ in lines] + markers = ["e" for _ in lines] i = 0 while i < len(lines): if not lines[i].strip(): - markers[i] = 'e' # empty line + markers[i] = "e" # empty line elif const.QUOT_PATTERN.match(lines[i]): - markers[i] = 'm' # line with quotation marker + markers[i] = "m" # line with quotation marker elif const.RE_FWD.match(lines[i]): - markers[i] = 'f' # ---- Forwarded message ---- + 
markers[i] = "f" # ---- Forwarded message ---- else: # in case splitter is spread across several lines - splitter = utils.is_splitter('\n'.join(lines[i:i + const.SPLITTER_MAX_LINES])) + splitter = utils.is_splitter( + "\n".join(lines[i : i + const.SPLITTER_MAX_LINES]) + ) if splitter: # append as many splitter markers as lines in splitter splitter_lines = splitter.group().splitlines() for j in range(len(splitter_lines)): - markers[i + j] = 's' + markers[i + j] = "s" # skip splitter lines i += len(splitter_lines) - 1 else: # probably the line from the last message in the conversation - markers[i] = 't' + markers[i] = "t" i += 1 - return ''.join(markers) + return "".join(markers) def _html5lib_parser(): @@ -111,33 +113,30 @@ def _html5lib_parser(): # remove namespace value from inside lxml.html.html5paser element tag # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div" # instead of "div", throwing the algo off - namespaceHTMLElements=False + namespaceHTMLElements=False, ) def _rm_excessive_newlines(s): - """Remove excessive newlines that often happen due to tons of divs - """ + """Remove excessive newlines that often happen due to tons of divs""" return const._RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip() def _encode_utf8(s): - """Encode in 'utf-8' if unicode - """ - return s.encode('utf-8') if isinstance(s, six.text_type) else s + """Encode in 'utf-8' if unicode""" + return s.encode("utf-8") if isinstance(s, str) else s def html_too_big(s): - if isinstance(s, six.text_type): - s = s.encode('utf8') - return s.count(b'<') > const._MAX_TAGS_COUNT + if isinstance(s, str): + s = s.encode("utf8") + return s.count(b"<") > const._MAX_TAGS_COUNT def html_document_fromstring(s): - """Parse html tree from string. Return None if the string can't be parsed. - """ - if isinstance(s, six.text_type): - s = s.encode('utf8') + """Parse html tree from string. 
Return None if the string can't be parsed.""" + if isinstance(s, str): + s = s.encode("utf8") try: if html_too_big(s): return None @@ -148,9 +147,9 @@ def html_document_fromstring(s): def html_tree_to_text(tree): - for style in CSSSelector('style')(tree): + for style in CSSSelector("style")(tree): style.getparent().remove(style) - for c in tree.xpath('//comment()'): + for c in tree.xpath("//comment()"): parent = c.getparent() # comment with no parent does not impact produced text if parent is None: @@ -158,16 +157,16 @@ def html_tree_to_text(tree): parent.remove(c) text = "" for el in tree.iter(): - el_text = (el.text or '') + (el.tail or '') + el_text = (el.text or "") + (el.tail or "") if len(el_text) > 1: if el.tag in const._BLOCKTAGS: text += "\n" - if el.tag == 'li': + if el.tag == "li": text += " * " text += el_text.strip() + " " # add href to the output - href = el.attrib.get('href') + href = el.attrib.get("href") if href: text += "(%s) " % href if el.tag in const._HARDBREAKS and text and not text.endswith("\n"): @@ -196,39 +195,42 @@ def _extract_from_html(msg_body): then checking deleted checkpoints, then deleting necessary tags. 
""" - if msg_body.strip() == b'': + if msg_body.strip() == b"": return msg_body - msg_body = msg_body.replace(b'\r\n', b'\n') + msg_body = msg_body.replace(b"\r\n", b"\n") html_tree = html_document_fromstring(msg_body) if html_tree is None: return msg_body - cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or - html_quotations.cut_zimbra_quote(html_tree) or - html_quotations.cut_blockquote(html_tree) or - html_quotations.cut_microsoft_quote(html_tree) or - html_quotations.cut_by_id(html_tree) or - html_quotations.cut_from_block(html_tree) - ) + cut_quotations = ( + html_quotations.cut_gmail_quote(html_tree) + or html_quotations.cut_zimbra_quote(html_tree) + or html_quotations.cut_blockquote(html_tree) + or html_quotations.cut_microsoft_quote(html_tree) + or html_quotations.cut_by_id(html_tree) + or html_quotations.cut_from_block(html_tree) + ) html_tree_copy = deepcopy(html_tree) number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) quotation_checkpoints = [False] * number_of_checkpoints plain_text = html_tree_to_text(html_tree) - plain_text = utils.preprocess(plain_text, '\n', content_type='text/html') + plain_text = utils.preprocess(plain_text, "\n", content_type="text/html") lines = plain_text.splitlines() # Don't process too long messages if len(lines) > const.MAX_LINES_COUNT: return msg_body # Collect checkpoints on each line line_checkpoints = [ - [int(i[4:-4]) # Only checkpoint number - for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)] - for line in lines] + [ + int(i[4:-4]) # Only checkpoint number + for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line) + ] + for line in lines + ] # Remove checkpoints - lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line) - for line in lines] + lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, "", line) for line in lines] # Use plain text quotation extracting algorithm markers = mark_message_lines(lines) @@ -239,18 +241,17 @@ def _extract_from_html(msg_body): if not 
lines_were_deleted and not cut_quotations: return msg_body if lines_were_deleted: - #collect checkpoints from deleted lines + # collect checkpoints from deleted lines for i in range(first_deleted, last_deleted): for checkpoint in line_checkpoints[i]: quotation_checkpoints[checkpoint] = True # Remove tags with quotation checkpoints - html_quotations.delete_quotation_tags( - html_tree_copy, 0, quotation_checkpoints - ) + html_quotations.delete_quotation_tags(html_tree_copy, 0, quotation_checkpoints) if _readable_text_empty(html_tree_copy): return msg_body return html.tostring(html_tree_copy) + def extract_from_html(msg_body): """ Extract not quoted message from provided html message body @@ -269,12 +270,12 @@ def extract_from_html(msg_body): Returns a unicode string. """ - if isinstance(msg_body, six.text_type): - msg_body = msg_body.encode('utf8') + if isinstance(msg_body, str): + msg_body = msg_body.encode("utf8") elif not isinstance(msg_body, bytes): - msg_body = msg_body.encode('ascii') + msg_body = msg_body.encode("ascii") result = _extract_from_html(msg_body) if isinstance(result, bytes): - result = result.decode('utf8') + result = result.decode("utf8") return result diff --git a/extract_raw_content/html_quotations.py b/extract_raw_content/html_quotations.py index 08ce578..b361661 100644 --- a/extract_raw_content/html_quotations.py +++ b/extract_raw_content/html_quotations.py @@ -1,12 +1,11 @@ -import regex as re +import re from lxml.cssselect import CSSSelector - -CHECKPOINT_PREFIX = '#!%!' -CHECKPOINT_SUFFIX = '!%!#' -CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX) +CHECKPOINT_PREFIX = "#!%!" 
+CHECKPOINT_SUFFIX = "!%!#" +CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + r"\d+" + CHECKPOINT_SUFFIX) # HTML quote indicators (tag ids) -QUOTE_IDS = ['OLK_SRC_BODY_SECTION'] +QUOTE_IDS = ["OLK_SRC_BODY_SECTION"] RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) @@ -15,36 +14,34 @@ def cssselect(expr, tree): def add_checkpoint(html_note, counter): - """Recursively adds checkpoints to html tree. - """ + """Recursively adds checkpoints to html tree.""" if html_note.text: - html_note.text = (html_note.text + CHECKPOINT_PREFIX + - str(counter) + CHECKPOINT_SUFFIX) + html_note.text = ( + html_note.text + CHECKPOINT_PREFIX + str(counter) + CHECKPOINT_SUFFIX + ) else: - html_note.text = (CHECKPOINT_PREFIX + str(counter) + - CHECKPOINT_SUFFIX) + html_note.text = CHECKPOINT_PREFIX + str(counter) + CHECKPOINT_SUFFIX counter += 1 for child in html_note.iterchildren(): counter = add_checkpoint(child, counter) if html_note.tail: - html_note.tail = (html_note.tail + CHECKPOINT_PREFIX + - str(counter) + CHECKPOINT_SUFFIX) + html_note.tail = ( + html_note.tail + CHECKPOINT_PREFIX + str(counter) + CHECKPOINT_SUFFIX + ) else: - html_note.tail = (CHECKPOINT_PREFIX + str(counter) + - CHECKPOINT_SUFFIX) + html_note.tail = CHECKPOINT_PREFIX + str(counter) + CHECKPOINT_SUFFIX counter += 1 return counter def delete_quotation_tags(html_note, counter, quotation_checkpoints): - """Deletes tags with quotation checkpoints from html tree. - """ + """Deletes tags with quotation checkpoints from html tree.""" tag_in_quotation = True if quotation_checkpoints[counter]: - html_note.text = '' + html_note.text = "" else: tag_in_quotation = False counter += 1 @@ -52,14 +49,13 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): quotation_children = [] # Children tags which are in quotation. 
for child in html_note.iterchildren(): counter, child_tag_in_quotation = delete_quotation_tags( - child, counter, - quotation_checkpoints + child, counter, quotation_checkpoints ) if child_tag_in_quotation: quotation_children.append(child) if quotation_checkpoints[counter]: - html_note.tail = '' + html_note.tail = "" else: tag_in_quotation = False counter += 1 @@ -74,29 +70,31 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): def cut_gmail_quote(html_message): - ''' Cuts the outermost block element with class gmail_quote. ''' - gmail_quote = cssselect('div.gmail_quote', html_message) - if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): + """Cuts the outermost block element with class gmail_quote.""" + gmail_quote = cssselect("div.gmail_quote", html_message) + if gmail_quote and ( + gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text) + ): gmail_quote[0].getparent().remove(gmail_quote[0]) return True def cut_microsoft_quote(html_message): - ''' Cuts splitter block and all following blocks. 
''' + """Cuts splitter block and all following blocks.""" splitter = html_message.xpath( - #outlook 2007, 2010 (international) + # outlook 2007, 2010 (international) "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" "padding:3.0pt 0cm 0cm 0cm']|" - #outlook 2007, 2010 (american) + # outlook 2007, 2010 (american) "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" "padding:3.0pt 0in 0in 0in']|" - #outlook 2013 (international) + # outlook 2013 (international) "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;" "padding:3.0pt 0cm 0cm 0cm']|" - #outlook 2013 (american) + # outlook 2013 (american) "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;" "padding:3.0pt 0in 0in 0in']|" - #windows mail + # windows mail "//div[@style='padding-top: 5px; " "border-top-color: rgb(229, 229, 229); " "border-top-width: 1px; border-top-style: solid;']" @@ -104,11 +102,11 @@ def cut_microsoft_quote(html_message): if splitter: splitter = splitter[0] - #outlook 2010 + # outlook 2010 if splitter == splitter.getparent().getchildren()[0]: splitter = splitter.getparent() else: - #outlook 2003 + # outlook 2003 splitter = html_message.xpath( "//div" "/div[@class='MsoNormal' and @align='center' " @@ -138,7 +136,7 @@ def cut_microsoft_quote(html_message): def cut_by_id(html_message): found = False for quote_id in QUOTE_IDS: - quote = cssselect('#{}'.format(quote_id), html_message) + quote = cssselect("#{}".format(quote_id), html_message) if quote: found = True quote[0].getparent().remove(quote[0]) @@ -146,11 +144,12 @@ def cut_by_id(html_message): def cut_blockquote(html_message): - ''' Cuts the last non-nested blockquote with wrapping elements.''' + """Cuts the last non-nested blockquote with wrapping elements.""" quote = html_message.xpath( - '(.//blockquote)' + "(.//blockquote)" '[not(@class="gmail_quote") and not(ancestor::blockquote)]' - '[last()]') + "[last()]" + ) if quote: quote = quote[0] @@ -158,18 +157,12 @@ def cut_blockquote(html_message): return 
True -def cut_from_block(html_message): - """Cuts div tag which wraps block starting with "From:".""" - # handle the case when From: block is enclosed in some tag - block = html_message.xpath( - ("//*[starts-with(mg:text_content(), 'From:')]|" - "//*[starts-with(mg:text_content(), 'Date:')]")) - +def block_cut_content(block): if block: block = block[-1] parent_div = None while block.getparent() is not None: - if block.tag == 'div': + if block.tag == "div": parent_div = block break block = block.getparent() @@ -178,9 +171,10 @@ def cut_from_block(html_message): # In cases where removing this enclosing div will remove all # content, we should assume the quote is not enclosed in a tag. parent_div_is_all_content = ( - maybe_body is not None and maybe_body.tag == 'body' and - len(maybe_body.getchildren()) == 1) - + maybe_body is not None + and maybe_body.tag == "body" + and len(maybe_body.getchildren()) == 1 + ) if not parent_div_is_all_content: parent = block.getparent() next_sibling = block.getnext() @@ -196,27 +190,44 @@ def cut_from_block(html_message): # From block if no siblings) if block is not None: parent.remove(block) - return True else: return False - # handle the case when From: block goes right after e.g.
- # and not enclosed in some tag - block = html_message.xpath( - ("//*[starts-with(mg:tail(), 'From:')]|" - "//*[starts-with(mg:tail(), 'Date:')]")) + + +def block_cut_tail(block): if block: block = block[0] - if RE_FWD.match(block.getparent().text or ''): + if RE_FWD.match(block.getparent().text or ""): return False - - while(block.getnext() is not None): + + while block.getnext() is not None: block.getparent().remove(block.getnext()) block.getparent().remove(block) return True +def cut_from_block(html_message): + """Cuts div tag which wraps block starting with "From:".""" + # handle the case when From: block is enclosed in some tag + block = html_message.xpath( + "//*[starts-with(mg:text_content(), 'From:')]|" + "//*[starts-with(mg:text_content(), 'Date:')]" + ) + block_content = block_cut_content(block) + if isinstance(block_content, bool): + return block_content + # handle the case when From: block goes right after e.g.
+ # and not enclosed in some tag + block = html_message.xpath( + "//*[starts-with(mg:tail(), 'From:')]|" "//*[starts-with(mg:tail(), 'Date:')]" + ) + block_tail = block_cut_tail(block) + if isinstance(block_tail, bool): + return block_tail + + def cut_zimbra_quote(html_message): zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]') if zDivider: diff --git a/extract_raw_content/text.py b/extract_raw_content/text.py index 6023368..e179a49 100644 --- a/extract_raw_content/text.py +++ b/extract_raw_content/text.py @@ -1,4 +1,4 @@ -import regex as re +import re from . import constants as const from . import utils @@ -9,7 +9,7 @@ def get_delimiter(msg_body): if delimiter: delimiter = delimiter.group() else: - delimiter = '\n' + delimiter = "\n" return delimiter @@ -26,33 +26,35 @@ def mark_message_lines(lines): >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question']) 'tsem' """ - markers = ['e' for _ in lines] + markers = ["e" for _ in lines] i = 0 while i < len(lines): if not lines[i].strip(): - markers[i] = 'e' # empty line + markers[i] = "e" # empty line elif const.QUOT_PATTERN.match(lines[i]): - markers[i] = 'm' # line with quotation marker + markers[i] = "m" # line with quotation marker elif const.RE_FWD.match(lines[i]): - markers[i] = 'f' # ---- Forwarded message ---- + markers[i] = "f" # ---- Forwarded message ---- else: # in case splitter is spread across several lines - splitter = utils.is_splitter('\n'.join(lines[i:i + const.SPLITTER_MAX_LINES])) + splitter = utils.is_splitter( + "\n".join(lines[i : i + const.SPLITTER_MAX_LINES]) + ) if splitter: # append as many splitter markers as lines in splitter splitter_lines = splitter.group().splitlines() for j in range(len(splitter_lines)): - markers[i + j] = 's' + markers[i + j] = "s" # skip splitter lines i += len(splitter_lines) - 1 else: # probably the line from the last message in the conversation - markers[i] = 't' + markers[i] = "t" i += 1 - return ''.join(markers) + return 
"".join(markers) def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): @@ -66,41 +68,42 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): return_flags = [were_lines_deleted, first_deleted_line, last_deleted_line] """ - markers = ''.join(markers) + markers = "".join(markers) # if there are no splitter there should be no markers - if 's' not in markers and not re.search('(me*){3}', markers): - markers = markers.replace('m', 't') + if "s" not in markers and not re.search("(me*){3}", markers): + markers = markers.replace("m", "t") - if re.match('[te]*f', markers): + if re.match("[te]*f", markers): return_flags[:] = [False, -1, -1] return lines # inlined reply # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm' # both 't' entries should be found - for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers): + for inline_reply in re.finditer("(?<=m)e*((?:t+e*)+)m", markers): # long links could break sequence of quotation lines but they shouldn't # be considered an inline reply - links = ( - const.RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1]) or - const.RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip())) + links = const.RE_PARENTHESIS_LINK.search( + lines[inline_reply.start() - 1] + ) or const.RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip()) if not links: return_flags[:] = [False, -1, -1] return lines # cut out text lines coming after splitter if there are no markers there - quotation = re.search('(se*)+((t|f)+e*)+', markers) + quotation = re.search("(se*)+((t|f)+e*)+", markers) if quotation: return_flags[:] = [True, quotation.start(), len(lines)] - return lines[:quotation.start()] + return lines[: quotation.start()] # handle the case with markers - quotation = (const.RE_QUOTATION.search(markers) or - const.RE_EMPTY_QUOTATION.search(markers)) + quotation = const.RE_QUOTATION.search(markers) or const.RE_EMPTY_QUOTATION.search( + markers + ) if quotation: 
return_flags[:] = True, quotation.start(1), quotation.end(1) - return lines[:quotation.start(1)] + lines[quotation.end(1):] + return lines[: quotation.start(1)] + lines[quotation.end(1) :] return_flags[:] = [False, -1, -1] return lines @@ -111,17 +114,15 @@ def postprocess(msg_body): Replace link brackets back to '<' and '>'. """ - return re.sub(const.RE_NORMALIZED_LINK, r'<\1>', msg_body).strip() + return re.sub(const.RE_NORMALIZED_LINK, r"<\1>", msg_body).strip() def extract_from_plain(msg_body): """Extracts a non quoted message from provided plain text.""" - stripped_text = msg_body - delimiter = get_delimiter(msg_body) msg_body = utils.preprocess(msg_body, delimiter) # don't process too long messages - lines = msg_body.splitlines()[:const.MAX_LINES_COUNT] + lines = msg_body.splitlines()[: const.MAX_LINES_COUNT] markers = mark_message_lines(lines) lines = process_marked_lines(lines, markers) diff --git a/extract_raw_content/utils.py b/extract_raw_content/utils.py index cabc18a..5cce3e4 100644 --- a/extract_raw_content/utils.py +++ b/extract_raw_content/utils.py @@ -1,4 +1,4 @@ -import regex as re +import re from lxml import etree from . import constants as const @@ -11,16 +11,16 @@ def text_content(context): def tail(context): """XPath Extension function to return a node tail text.""" - return context.context_node.tail or '' + return context.context_node.tail or "" def register_xpath_extensions(): ns = etree.FunctionNamespace("http://mailgun.net") - ns.prefix = 'mg' - ns['text_content'] = text_content - ns['tail'] = tail - - + ns.prefix = "mg" + ns["text_content"] = text_content + ns["tail"] = tail + + def _replace_link_brackets(msg_body): """ Normalize links i.e. 
replace '<', '>' wrapping the link with some symbols @@ -30,36 +30,38 @@ def _replace_link_brackets(msg_body): Converts msg_body into a unicode """ if isinstance(msg_body, bytes): - msg_body = msg_body.decode('utf8') + msg_body = msg_body.decode("utf8") def link_wrapper(link): - newline_index = msg_body[:link.start()].rfind("\n") + newline_index = msg_body[: link.start()].rfind("\n") if msg_body[newline_index + 1] == ">": return link.group() else: return "@@%s@@" % link.group(1) + msg_body = re.sub(const.RE_LINK, link_wrapper, msg_body) return msg_body -def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'): +def _wrap_splitter_with_newline(msg_body, delimiter, content_type="text/plain"): """ Splits line in two if splitter pattern preceded by some text on the same line (done only for 'On wrote:' pattern. """ + def splitter_wrapper(splitter): """Wraps splitter with new line""" - if splitter.start() and msg_body[splitter.start() - 1] != '\n': - return '%s%s' % (delimiter, splitter.group()) + if splitter.start() and msg_body[splitter.start() - 1] != "\n": + return "{}{}".format(delimiter, splitter.group()) else: return splitter.group() - if content_type == 'text/plain': + if content_type == "text/plain": msg_body = re.sub(const.RE_ON_DATE_SMB_WROTE, splitter_wrapper, msg_body) return msg_body -def preprocess(msg_body, delimiter, content_type='text/plain'): +def preprocess(msg_body, delimiter, content_type="text/plain"): """Prepares msg_body for being stripped. Replaces link brackets so that they couldn't be taken for quotation marker. @@ -74,10 +76,10 @@ def preprocess(msg_body, delimiter, content_type='text/plain'): def is_splitter(line): - ''' + """ Returns Matcher object if provided string is a splitter and None otherwise. 
- ''' + """ for pattern in const.SPLITTER_PATTERNS: matcher = re.match(pattern, line) if matcher: diff --git a/mail_parser.py b/mail_parser.py index b63a97a..432a345 100644 --- a/mail_parser.py +++ b/mail_parser.py @@ -6,14 +6,14 @@ import re import uuid from io import BytesIO + import mailparser from html2text import html2text -from extract_raw_content.text import extract_from_plain from extract_raw_content.html import extract_from_html +from extract_raw_content.text import extract_from_plain from extract_raw_content.utils import register_xpath_extensions - decoder_map = { "base64": base64.b64decode, "": lambda payload: payload.encode("utf-8"), From 754ab2884be3ff8c3a422d14ee2f211b2eff373a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Orze=C5=82?= Date: Tue, 23 Apr 2024 11:59:35 +0200 Subject: [PATCH 3/6] Aktualizacja requirements.txt i zmiana wersji (version.py) --- requirements.txt | 34 ++++------------------------------ version.py | 2 +- 2 files changed, 5 insertions(+), 31 deletions(-) diff --git a/requirements.txt b/requirements.txt index cf56c79..89e97a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,34 +1,8 @@ -asttokens==2.4.1 -certifi==2024.2.2 -charset-normalizer==3.3.2 -cssselect==1.2.0 -decorator==5.1.1 -exceptiongroup==1.2.0 -executing==2.0.1 html2text==2024.2.26 -html5lib==1.1 -idna==3.7 -ipdb==0.13.13 -ipython==8.23.0 -jedi==0.19.1 -lxml==5.2.1 mail-parser==3.15.0 -matplotlib-inline==0.1.6 -parso==0.8.4 -pexpect==4.9.0 -prompt-toolkit==3.0.43 -ptyprocess==0.7.0 -pure-eval==0.2.2 -Pygments==2.17.2 -regex==2023.12.25 requests==2.31.0 sentry-sdk==1.43.0 -simplejson==3.19.2 -six==1.16.0 -stack-data==0.6.3 -tomli==2.0.1 -traitlets==5.14.2 -typing_extensions==4.11.0 -urllib3==2.2.1 -wcwidth==0.2.13 -webencodings==0.5.1 +ipdb==0.13.13 +html5lib==1.1 +lxml==5.2.1 +cssselect==1.2.0 \ No newline at end of file diff --git a/version.py b/version.py index 38e7dcc..1cd440f 100644 --- a/version.py +++ b/version.py @@ -1,4 +1,4 @@ -__version__ = 
"1.0.02" +__version__ = "1.1.0" if __name__ == "__main__": print(f"v{__version__}") From 247b138dcec7399150e90bf5162644b711aff382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Orze=C5=82?= Date: Tue, 23 Apr 2024 15:10:43 +0200 Subject: [PATCH 4/6] Przeniesienie funkcji _html5lib_parser do pliku utils --- extract_raw_content/html.py | 21 +------- extract_raw_content/utils.py | 100 +++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 19 deletions(-) diff --git a/extract_raw_content/html.py b/extract_raw_content/html.py index bc1f4b6..a1379d1 100644 --- a/extract_raw_content/html.py +++ b/extract_raw_content/html.py @@ -1,7 +1,6 @@ +import re from copy import deepcopy -import html5lib -import re from lxml import html from lxml.cssselect import CSSSelector from lxml.html import html5parser @@ -102,21 +101,6 @@ def mark_message_lines(lines): return "".join(markers) -def _html5lib_parser(): - """ - html5lib is a pure-python library that conforms to the WHATWG HTML spec - and is not vulnarable to certain attacks common for XML libraries - """ - return html5lib.HTMLParser( - # build lxml tree - html5lib.treebuilders.getTreeBuilder("lxml"), - # remove namespace value from inside lxml.html.html5paser element tag - # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div" - # instead of "div", throwing the algo off - namespaceHTMLElements=False, - ) - - def _rm_excessive_newlines(s): """Remove excessive newlines that often happen due to tons of divs""" return const._RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip() @@ -140,8 +124,7 @@ def html_document_fromstring(s): try: if html_too_big(s): return None - - return html5parser.document_fromstring(s, parser=_html5lib_parser()) + return html5parser.document_fromstring(s, parser=utils._html5lib_parser()) except Exception: pass diff --git a/extract_raw_content/utils.py b/extract_raw_content/utils.py index 5cce3e4..eb96578 100644 --- a/extract_raw_content/utils.py +++ b/extract_raw_content/utils.py 
@@ -1,8 +1,18 @@ import re + +import html5lib from lxml import etree +from lxml.cssselect import CSSSelector +from lxml.html import html5parser from . import constants as const +_UTF8_DECLARATION = ( + '' +) +_BLOCKTAGS = ["div", "p", "ul", "li", "h1", "h2", "h3"] +_HARDBREAKS = ["br", "hr", "tr"] + def text_content(context): """XPath Extension function to return a node text content.""" @@ -84,3 +94,93 @@ def is_splitter(line): matcher = re.match(pattern, line) if matcher: return matcher + + +def _html5lib_parser(): + """ + html5lib is a pure-python library that conforms to the WHATWG HTML spec + and is not vulnarable to certain attacks common for XML libraries + """ + return html5lib.HTMLParser( + # build lxml tree + html5lib.treebuilders.getTreeBuilder("lxml"), + # remove namespace value from inside lxml.html.html5paser element tag + # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div" + # instead of "div", throwing the algo off + namespaceHTMLElements=False, + ) + + +def _contains_charset_spec(s: str) -> str: + """Return True if the first 4KB contain charset spec""" + return s.lower().find("html; charset=", 0, 4096) != -1 + + +def _rm_excessive_newlines(s: str) -> str: + """Remove excessive newlines that often happen due to tons of divs""" + return const._RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip() + + +def _prepend_utf8_declaration(s: str) -> str: + """Prepend 'utf-8' encoding declaration if the first 4KB don't have any""" + return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s + + +def html_fromstring(s: str) -> etree._Element: + """Parse html tree from string. 
Return None if the string can't be parsed.""" + return html5parser.fromstring(s, parser=_html5lib_parser()) + + +def html_tree_to_text(tree: etree._Element) -> str: + for style in CSSSelector("style")(tree): + style.getparent().remove(style) + + for c in tree.xpath("//comment()"): + parent = c.getparent() + + # comment with no parent does not impact produced text + if parent is None: + continue + + parent.remove(c) + + text = "" + for el in tree.iter(): + el_text = (el.text or "") + (el.tail or "") + if len(el_text) > 1: + if el.tag in _BLOCKTAGS + _HARDBREAKS: + text += "\n" + if el.tag == "li": + text += " * " + text += el_text.strip() + " " + + # add href to the output + href = el.attrib.get("href") + if href: + text += "(%s) " % href + + if el.tag in _HARDBREAKS and text and not text.endswith("\n") and not el_text: + text += "\n" + + text = _rm_excessive_newlines(text) + return text + + +def html_to_text(s: str) -> str | None: + """ + Dead-simple HTML-to-text converter: + >>> html_to_text("one
two
three") + <<< "one\ntwo\nthree" + + NOTES: + 1. the string is expected to contain UTF-8 encoded HTML! + 3. if html can't be parsed returns None + """ + s = _prepend_utf8_declaration(s) + s = s.replace("\n", "") + tree = html_fromstring(s) + + if tree is None: + return None + + return html_tree_to_text(tree) From 83f47c36b98f0d3581d806791df7e4bbaaac0b02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Orze=C5=82?= Date: Tue, 23 Apr 2024 15:11:23 +0200 Subject: [PATCH 5/6] =?UTF-8?q?Dodanie=20test=C3=B3w=20do=20funkcji=20prze?= =?UTF-8?q?niesionych=20z=20biblioteki=20talon?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extract_raw_content/text.py | 2 +- mails/OLK_SRC_BODY_SECTION.html | 16 + mails/html_replies/gmail.html | 6 + mails/html_replies/hotmail.html | 18 + mails/html_replies/mail_ru.html | 57 + mails/html_replies/ms_outlook_2003.html | 134 +++ mails/html_replies/ms_outlook_2007.html | 42 + mails/html_replies/ms_outlook_2010.html | 87 ++ mails/html_replies/thunderbird.html | 32 + mails/html_replies/windows_mail.html | 33 + mails/html_replies/yandex_ru.html | 1 + mails/reply-quotations-share-block.eml | 22 + mails/reply-separated-by-hr.html | 21 + mails/standard_replies/android.eml | 24 + mails/standard_replies/aol.eml | 65 ++ mails/standard_replies/apple_mail.eml | 15 + mails/standard_replies/apple_mail_2.eml | 19 + mails/standard_replies/comcast.eml | 33 + mails/standard_replies/gmail.eml | 31 + mails/standard_replies/hotmail.eml | 50 + mails/standard_replies/iphone.eml | 19 + mails/standard_replies/iphone_reply_text | 3 + mails/standard_replies/outlook.eml | 85 ++ mails/standard_replies/sparrow.eml | 61 + mails/standard_replies/sparrow_reply_text | 5 + mails/standard_replies/thunderbird.eml | 15 + mails/standard_replies/yahoo.eml | 22 + test.py | 1223 +++++++++++++++++++++ 28 files changed, 2140 insertions(+), 1 deletion(-) create mode 100644 mails/OLK_SRC_BODY_SECTION.html create mode 100644 
mails/html_replies/gmail.html create mode 100644 mails/html_replies/hotmail.html create mode 100644 mails/html_replies/mail_ru.html create mode 100644 mails/html_replies/ms_outlook_2003.html create mode 100644 mails/html_replies/ms_outlook_2007.html create mode 100644 mails/html_replies/ms_outlook_2010.html create mode 100644 mails/html_replies/thunderbird.html create mode 100644 mails/html_replies/windows_mail.html create mode 100644 mails/html_replies/yandex_ru.html create mode 100644 mails/reply-quotations-share-block.eml create mode 100644 mails/reply-separated-by-hr.html create mode 100644 mails/standard_replies/android.eml create mode 100644 mails/standard_replies/aol.eml create mode 100644 mails/standard_replies/apple_mail.eml create mode 100644 mails/standard_replies/apple_mail_2.eml create mode 100644 mails/standard_replies/comcast.eml create mode 100644 mails/standard_replies/gmail.eml create mode 100644 mails/standard_replies/hotmail.eml create mode 100644 mails/standard_replies/iphone.eml create mode 100644 mails/standard_replies/iphone_reply_text create mode 100644 mails/standard_replies/outlook.eml create mode 100644 mails/standard_replies/sparrow.eml create mode 100644 mails/standard_replies/sparrow_reply_text create mode 100644 mails/standard_replies/thunderbird.eml create mode 100644 mails/standard_replies/yahoo.eml diff --git a/extract_raw_content/text.py b/extract_raw_content/text.py index e179a49..4e4e4a1 100644 --- a/extract_raw_content/text.py +++ b/extract_raw_content/text.py @@ -80,7 +80,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): # inlined reply # use lookbehind assertions to find overlapping entries e.g. 
for 'mtmtm' # both 't' entries should be found - for inline_reply in re.finditer("(?<=m)e*((?:t+e*)+)m", markers): + for inline_reply in re.finditer("(?<=m)e*(t[te]*)m", markers): # long links could break sequence of quotation lines but they shouldn't # be considered an inline reply links = const.RE_PARENTHESIS_LINK.search( diff --git a/mails/OLK_SRC_BODY_SECTION.html b/mails/OLK_SRC_BODY_SECTION.html new file mode 100644 index 0000000..82f3689 --- /dev/null +++ b/mails/OLK_SRC_BODY_SECTION.html @@ -0,0 +1,16 @@ + + +
Reply
+ +
+ From: Bob <bob@example.com>
+ Date: Tue, 01 Nov 2011 18:54:39 -0700
+ To: Rob <rob@example.com>
+ Subject: Test
+
+
+ Hi +
+
+ + diff --git a/mails/html_replies/gmail.html b/mails/html_replies/gmail.html new file mode 100644 index 0000000..7bc7cf5 --- /dev/null +++ b/mails/html_replies/gmail.html @@ -0,0 +1,6 @@ +
Hi. I am fine.

Thanks,
Alex
+


On Thu, Jun 26, 2014 at 2:14 PM, Alexander L <a@example.com> wrote:
+
+Hello! How are you?

+
Thanks,
Sasha.
+

diff --git a/mails/html_replies/hotmail.html b/mails/html_replies/hotmail.html new file mode 100644 index 0000000..0257b2e --- /dev/null +++ b/mails/html_replies/hotmail.html @@ -0,0 +1,18 @@ + + + + +
Hi. I am fine.

Thanks,
Alex


Date: Thu, 26 Jun 2014 13:53:45 +0400
Subject: Test message
From: abc@example.com
To: alex.l@example.com

Hello! How are you?

Thanks,
+Sasha.
+ diff --git a/mails/html_replies/mail_ru.html b/mails/html_replies/mail_ru.html new file mode 100644 index 0000000..52d7039 --- /dev/null +++ b/mails/html_replies/mail_ru.html @@ -0,0 +1,57 @@ + +

Hi. I am fine.

Thanks,
Alex




Thu, 26 Jun 2014 14:00:51 +0400 от Alexander L <abc@example.com>:
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
Hello! How are you?

+
Thanks,
Sasha.
+ +
+ + + +
+ + +
+ + +
+
+
diff --git a/mails/html_replies/ms_outlook_2003.html b/mails/html_replies/ms_outlook_2003.html new file mode 100644 index 0000000..027c525 --- /dev/null +++ b/mails/html_replies/ms_outlook_2003.html @@ -0,0 +1,134 @@ + + + + + + + + + + + + +
+ +

Hi. I am fine.

+ +

 

+ +

Thanks,

+ +

Alex

+ +

 

+ +
+ +
+ +
+ +
+ +

From: +Alexander L [mailto:abc@example.com]
+Sent: Friday, June 27, 2014 12:06 +PM
+To: Alexander
+Subject: Test message

+ +
+ +

 

+ +
+ +
+ +
+ +

Hello! How are you?

+ +
+ +
+ +

 

+ +
+ +
+ +

Thanks,

+ +
+ +
+ +

Sasha.

+ +
+ +
+ +
+ +
+ + + + diff --git a/mails/html_replies/ms_outlook_2007.html b/mails/html_replies/ms_outlook_2007.html new file mode 100644 index 0000000..84f7ede --- /dev/null +++ b/mails/html_replies/ms_outlook_2007.html @@ -0,0 +1,42 @@ +

Hi. I am fine.

Thanks,

Alex

 

From: Alexander L [mailto:abc@example.com]
Sent: Thursday, July 03, 2014 3:50 PM
To: alex.l@example.com
Subject: Test message

 

Hello! How are you?

 

Thanks,

Sasha.

diff --git a/mails/html_replies/ms_outlook_2010.html b/mails/html_replies/ms_outlook_2010.html new file mode 100644 index 0000000..9d26d0e --- /dev/null +++ b/mails/html_replies/ms_outlook_2010.html @@ -0,0 +1,87 @@ + + + + + + + +
+

Hi. I am fine.

+

Thanks,

+

Alex

+

From: Foo [mailto:foo@bar.com] +On Behalf Of baz@bar.com
+Sent: Monday, January 01, 2000 12:00 AM
+To: john@bar.com
+Cc: jane@bar.io
+Subject: Conversation

+

 

+

Hello! How are you?

+

 

+
+ + diff --git a/mails/html_replies/thunderbird.html b/mails/html_replies/thunderbird.html new file mode 100644 index 0000000..9d871cb --- /dev/null +++ b/mails/html_replies/thunderbird.html @@ -0,0 +1,32 @@ + + + + + + Hi. I am fine.
+
+ Thanks,
+ Alex
+
On 26.06.2014 14:41, Alexander L + wrote:
+
+
+
+
+
Hello! How are you?
+

+
+
Thanks,
+
Sasha.
+
+
+
+
+ + diff --git a/mails/html_replies/windows_mail.html b/mails/html_replies/windows_mail.html new file mode 100644 index 0000000..9100ea1 --- /dev/null +++ b/mails/html_replies/windows_mail.html @@ -0,0 +1,33 @@ + + + + + + +
Hi. I am fine.

Thanks,
Alex


От: Alexander L
Отправлено: ‎четверг‎, ‎26‎ ‎июня‎ ‎2014‎ г. ‎15‎:‎05
Кому: Alex

Hello! How are you?

+
Thanks,
Sasha.
+
+ + diff --git a/mails/html_replies/yandex_ru.html b/mails/html_replies/yandex_ru.html new file mode 100644 index 0000000..3847fb9 --- /dev/null +++ b/mails/html_replies/yandex_ru.html @@ -0,0 +1 @@ +

Hi. I am fine.

Thanks,
Alex

26.06.2014, 14:41, "Alexander L" <abc@example.com>:

Hello! How are you?

Thanks,
Sasha.
diff --git a/mails/reply-quotations-share-block.eml b/mails/reply-quotations-share-block.eml new file mode 100644 index 0000000..1b9a21a --- /dev/null +++ b/mails/reply-quotations-share-block.eml @@ -0,0 +1,22 @@ +Content-Type: multipart/alternative; + boundary="===============6853056845739363347==" +MIME-Version: 1.0 +Date: Wed, 4 Apr 2012 22:22:42 -0700 (PDT) +From: Joe Doe +Subject: Re: You've got a new booking inquiry! + +--===============6853056845739363347== +MIME-Version: 1.0 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: base64 + +SGkgS2F0aGFyaW5lLsKgIFNvdW5kcyBncmVhdC7CoCBBcmUgdGhlcmUgYW5kIGRpZXRyeSByZXN0cmljdGlvbnMgb3IgdGhpbmdzIHlvdXIgaHVzYmFuZCBkb2VzL2RvZXNuJ3QgbGlrZSB0byBlYXQ/wqAgV291bGQgeW91IGxpa2UgdG8gZG8gYSBmZXcgaG9ycyBkIG9ldXZyZXMgYW5kIHRoZW4gYcKgMyBvciA0wqBjb3Vyc2UgZGlubmVyP8KgIExldCBtZSBrbm93IHdoYXQgeW91IHRoaW5rIHdpbGwgd29yayBiZXN0IGFuZCBJIHdpbGwgc3RhcnQgd29ya2luZyBvbiBhIG1lbnUgYW5kIHByb3Bvc2FsLsKgIFRoYW5rcyBzbyBtdWNoIGFuZCBsb29rIGZvcndhcmQgdG8gaGVhcmluZyBmcm9tIHlvdSBzb29uLgrCoApKb2UgWFhYCgotLS0gT24gV2VkLCA0LzQvMTIsIHh4eEBleGFtcGxlLmNvbSA8eHh4QGV4YW1wbGUuY29tPiB3cm90ZToKCgpGcm9tOiB4eHhAZXhhbXBsZS5jb20gPHh4eEBleGFtcGxlLmNvbT4KU3ViamVjdDogWW91J3ZlIGdvdCBhIG5ldyBib29raW5nIGlucXVpcnkhClRvOiB4eHhAeWFob28uY29tCkRhdGU6IFdlZG5lc2RheSwgQXByaWwgNCwgMjAxMiwgMTA6MjMgUE0KCk5ldyBCb29raW5nIElucXVpcnkKCg== + +--===============6853056845739363347== +MIME-Version: 1.0 +Content-Type: text/html; charset="utf-8" +Content-Transfer-Encoding: base64 + 
+PHRhYmxlPjx0cj48dGQ+PERJVj5IaSBLYXRoYXJpbmUuJm5ic3A7IFNvdW5kcyBncmVhdC4mbmJzcDsgQXJlIHRoZXJlIGFuZCBkaWV0cnkgcmVzdHJpY3Rpb25zIG9yIHRoaW5ncyB5b3VyIGh1c2JhbmQgZG9lcy9kb2Vzbid0IGxpa2UgdG8gZWF0PyZuYnNwOyBXb3VsZCB5b3UgbGlrZSB0byBkbyBhIGZldyBob3JzIGQgb2V1dnJlcyBhbmQgdGhlbiBhJm5ic3A7MyBvciA0Jm5ic3A7Y291cnNlIGRpbm5lcj8mbmJzcDsgTGV0IG1lIGtub3cgd2hhdCB5b3UgdGhpbmsgd2lsbCB3b3JrIGJlc3QgYW5kIEkgd2lsbCBzdGFydCB3b3JraW5nIG9uIGEgbWVudSBhbmQgcHJvcG9zYWwuJm5ic3A7IFRoYW5rcyBzbyBtdWNoIGFuZCBsb29rIGZvcndhcmQgdG8gaGVhcmluZyBmcm9tIHlvdSBzb29uLjwvRElWPgo8RElWPiZuYnNwOzwvRElWPgo8RElWPkpob24gRG9lPEJSPjxCUj4tLS0gT24gPEI+V2VkLCA0LzQvMTIsIHh4eEBleGFtcGxlLmNvbSA8ST4mbHQ7eHh4QGV4YW1wbGUuY29tJmd0OzwvST48L0I+IHdyb3RlOjxCUj48L0RJVj4KPEJMT0NLUVVPVEU+PEJSPkZyb206IHh4eEBleGFtcGxlLmNvbSAmbHQ7eHh4QGV4YW1wbGUuY29tJmd0OzxCUj5TdWJqZWN0OiBZb3UndmUgZ290IGEgbmV3IGJvb2tpbmcgaW5xdWlyeSE8QlI+VG86IHh4eEB5YWhvby5jb208QlI+RGF0ZTogV2VkbmVzZGF5LCBBcHJpbCA0LCAyMDEyLCAxMDoyMyBQTTxCUj48QlI+CjxESVY+CjxESVY+CjxDRU5URVI+CjxUQUJMRT4KPFRCT0RZPgo8VFI+CjxURD4KPFRBQkxFPgo8VEJPRFk+CjxUUj4KPFREPgo8VEFCTEU+CjxUQk9EWT4KPFRSPgo8VEQ+CjxESVY+TmV3IEJvb2tpbmcgSW5xdWlyeSA8L0RJVj48L1REPgo8VEQ+CjxESVY+WW91ciBwbGFjZSBpcyB0aGUgaG9tZSBvZiBiZXNwb2tlIGRpbmluZyA8L0RJVj48L1REPjwvVFI+PC9UQk9EWT48L1RBQkxFPjwvVEQ+PC9UUj48L1RCT0RZPjwvVEFCTEU+CjxUQUJMRT4KPFRCT0RZPgo8VFI+CjxURD4KPFRBQkxFPgo8VEJPRFk+CjxUUj4KPFREPiA8L1REPjwvVFI+PC9UQk9EWT48L1RBQkxFPjwvVEQ+PC9UUj4KPFRSPgo8VEQ+CjxUQUJMRT4KPFRCT0RZPgo8VFI+CjxURD4KPFRBQkxFPgo8VEJPRFk+CjxUUj4KPFREPgo8RElWPjxCUj5Hb29kIE5ld3MhPEJSPjxCUj4KPFA+RXZlbnQgRGV0YWlsczwvRElWPkRhdGU6IEFwcmlsIDI4LCAyMDEyPEJSPkxvY2F0aW9uOiB4eHg8QlI+SGVhZGNvdW50OiA2IHRvIDg8QlI+VGFyZ2V0IEJ1ZGdldDogJDUwIHBlciBwZXJzb248QlI+PEJSPkJlc3QgRGVzY3JpcHRpb24gb2YgVGFyZ2V0IEJ1ZGdldDogSSdkIGxvdmUgdG8gaGVhciB3aGF0IHRoZSBjaGVmIHRoaW5rcyBpcyBiZXN0IGZvciBteSBldmVudCwgcHJvdmlkZWQgd2Ugc3RheSBjbG9zZSB0byB0aGlzIGJ1ZGdldCA8QlI+PEJSPkV2ZW50IERlc2NyaXB0aW9uOiBJIGFtIHdhbnRpbmcgdG8gc3VycHJpc2UgbXkgaHVzYmFuZCB3aXRoIGEgY2FzdWFsIGRpbm5lciBwYXJ0eSBpbiBvdXIgaG9tZSBpbiB4eHg
uIFdlIGhhdmUgYW4gYW1hemluZyBraXRjaGVuICh0aGF0IEkgZG9uJ3QgZG8ganVzdGljZSB0byBidXQgSSBiZXQgeW91IGNvdWxkISksIGFuZCBhIHJlYWxseSBuaWNlIGdhcmRlbiBmb3IgZGluaW5nLiBJIGFtIGZseWluZyBzb21lIG9mIGhpcyBiZXN0IGZyaWVuZHMgaW4gdG8gY2VsZWJyYXRlIGhpbS4gV2UgaGF2ZSBzbWFsbCBraWRzICh3aG8gd2lsbCBiZSBzbGVlcGluZyEpLCBzbyBJJ20gaG9waW5nIGZvciBhIGNhc3VhbCBidXQgcm9tYW50aWMgZGlubmVyIHBhcnR5LiA8QlI+PEJSPlZpZXcgbW9yZSBpbnF1aXJ5IGRldGFpbHMgb24geW91ciBFdmVudCBEYXNoYm9hcmQuIElmIHlvdSBsaWtlIHdoYXQgeW91IHNlZSwgcGxlYXNlIGNyZWF0ZSBhIHByb3Bvc2FsIGZvciB0aGUgZXZlbnQuIDxCUj48QlI+SWYgeW91IGRvIG5vdCBoYXZlIHRoZSB0aW1lIHRvIG1ha2UgYSBmdWxsIHByb3Bvc2FsIHJpZ2h0IG5vdywgd2UgZW5jb3VyYWdlIHlvdSB0byBhdCBsZWFzdCByZXNwb25kIHRvIHRoZSBob3N0IHdpdGggYSBxdWljayBtZXNzYWdlIHRvIGNvbmZpcm0gdGhhdCB5b3UndmUgZ290dGVuIHRoaXMgaW5xdWlyeSBhbmQgaGF2ZSBiZWd1biB0aGlua2luZyBhYm91dCB0aGUgZXZlbnQuIDxCUj48QlI+PFNUUk9ORz5Zb3UgY2FuIHJlcGx5IGRpcmVjdGx5IHRvIHRoaXMgZW1haWwgYW5kIHlvdXIgbWVzc2FnZSB3aWxsIGdvIHRvIHRoZSBob3N0IG9uIHRoZSBldmVudCBkYXNoYm9hcmQuPC9TVFJPTkc+IDxCUj48QlI+UmVtZW1iZXIsIHlvdSBoYXZlIGV4Y2x1c2l2ZSBhY2Nlc3MgdG8gdGhpcyBpbnF1aXJ5IGZvciB0aGUgbmV4dCAyNCBob3Vycy4gUGxlYXNlIG1ha2UgYSBwcm9wb3NhbCBvciBzZW5kIGEgbWVzc2FnZSB0byB0aGUgaG9zdCBpbiB0aGF0IHRpbWUuIElmIHRoZSBob3N0IGhhcyBub3QgaGVhcmQgYW55dGhpbmcgZnJvbSB5b3UgaW4gMjQgaG91cnMsIHdlIHdpbGwKIGZvcndhcmQgdGhlIGhvc3RzIGlucXVpcnkgdG8gYSBzbWFsbCBudW1iZXIgb2YgYWRkaXRpb25hbCBjaGVmcywgYW5kIHRoZXkgd2lsbCBoYXZlIHRoZSBvcHBvcnR1bml0eSB0byBtYWtlIGEgcHJvcG9zYWwuIFdlIGRvIHRoaXMgYXMgYSBjb3VydGVzeSB0byB0aGUgaG9zdHMuIDxCUj48QlI+SWYgeW91IGNhbm5vdCBhY2NlcHQgdGhpcyBib29raW5nIG9yIGRvIG5vdCB3YW50IHRvIGZvciBhbnkgcmVhc29uLCBwbGVhc2UgdGFrZSB0aGUgdGltZSB0byBkZWNsaW5lIG9uIHRoZSBFdmVudCBEYXNoYm9hcmQuIDxCUj48QlI+VGltZSB0byBnZXQgY29va2luJyA8QlI+PEJSPjwvRElWPjwvVEQ+PC9UUj48L1RCT0RZPjwvVEFCTEU+PC9URD48L1RSPjwvVEJPRFk+PC9UQUJMRT48L1REPjwvVFI+CjxUUj4KPFREPgo8VEFCTEU+CjxUQk9EWT4KPFRSPgo8VEQ+CjxUQUJMRT4KPFRCT0RZPgo8VFI+CjxURD4KPERJVj4mbmJzcDs8QSBocmVmPSJodHRwOi8vZXhhbXBsZS5jb20iPmZvbGxvdyBvbiBUd2l0dGVyPC9BPiB8IDxBIGhyZWY9Imh0dHA6Ly9
4eHgiPmZyaWVuZCBvbiBGYWNlYm9vazwvQT4gfCA8QQogaHJlZj0iaHR0cDovL2V4YW1wbGUuY29tIj5Gb3J3YXJkIHRvIGEgRnJpZW5kPC9BPiZndDsmbmJzcDsgPC9ESVY+PC9URD48L1RSPgo8VFI+CjxURD4KPERJVj48RU0+Q29weXJpZ2g8L0VNPiA8L0RJVj48L1REPjwvVFI+PC9UQk9EWT48L1RBQkxFPjwvVEQ+PC9UUj48L1RCT0RZPjwvVEFCTEU+PC9URD48L1RSPjwvVEJPRFk+PC9UQUJMRT48QlI+PC9URD48L1RSPjwvVEJPRFk+PC9UQUJMRT48L0NFTlRFUj48SU1HIGFsdD0iIiBzcmM9Imh0dHA6Ly9leGFtcGxlLmNvbSI+IDwvRElWPjwvRElWPjwvQkxPQ0tRVU9URT48L3RkPjwvdHI+PC90YWJsZT4K + +--===============6853056845739363347==-- diff --git a/mails/reply-separated-by-hr.html b/mails/reply-separated-by-hr.html new file mode 100644 index 0000000..0d6eabb --- /dev/null +++ b/mails/reply-separated-by-hr.html @@ -0,0 +1,21 @@ + + +
+ Hi +
+ there +
+
+ Bob +
+ From: bob@example.com
+ To: xxx@comcast.net
+ Sent: Friday, July 22, 2011 6:20:01 PM
+ Subject: Hello

+

+ Hello +

+
+
+ + diff --git a/mails/standard_replies/android.eml b/mails/standard_replies/android.eml new file mode 100644 index 0000000..bf7cb33 --- /dev/null +++ b/mails/standard_replies/android.eml @@ -0,0 +1,24 @@ +Content-Type: multipart/alternative; + boundary="===============0934372227844987316==" +MIME-Version: 1.0 +Date: Mon, 2 Apr 2012 18:22:10 +0400 +Message-Id: +Subject: Re: Test +From: Sergey Obykhov +To: "bob@xxx.mailgun.org" + +--===============0934372227844987316== +MIME-Version: 1.0 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: base64 + +SGVsbG8KMDIuMDQuMjAxMiAxNDoyMCDQv9C+0LvRjNC30L7QstCw0YLQtdC70YwgImJvYkB4eHgubWFpbGd1bi5vcmciIDwKYm9iQHh4eC5tYWlsZ3VuLm9yZz4g0L3QsNC/0LjRgdCw0Ls6Cgo+IEhpCj4KCg== + +--===============0934372227844987316== +MIME-Version: 1.0 +Content-Type: text/html; charset="utf-8" +Content-Transfer-Encoding: base64 + +PHA+SGVsbG88L3A+CjxkaXYgY2xhc3M9ImdtYWlsX3F1b3RlIj4wMi4wNC4yMDEyIDE0OjIwINC/0L7Qu9GM0LfQvtCy0LDRgtC10LvRjCAmcXVvdDs8YSBocmVmPSJtYWlsdG86Ym9iQHh4eC5tYWlsZ3VuLm9yZyI+Ym9iQHh4eC5tYWlsZ3VuLm9yZzwvYT4mcXVvdDsgJmx0OzxhIGhyZWY9Im1haWx0bzpib2JAeHh4Lm1haWxndW4ub3JnIj5ib2JAeHh4Lm1haWxndW4ub3JnPC9hPiZndDsg0L3QsNC/0LjRgdCw0Ls6PGJyIHR5cGU9ImF0dHJpYnV0aW9uIj4KPGJsb2NrcXVvdGUgY2xhc3M9ImdtYWlsX3F1b3RlIiBzdHlsZT0ibWFyZ2luOjAgMCAwIC44ZXg7Ym9yZGVyLWxlZnQ6MXB4ICNjY2Mgc29saWQ7cGFkZGluZy1sZWZ0OjFleCI+SGk8YnI+CjwvYmxvY2txdW90ZT48L2Rpdj4KCg== + +--===============0934372227844987316==-- diff --git a/mails/standard_replies/aol.eml b/mails/standard_replies/aol.eml new file mode 100644 index 0000000..340d3c2 --- /dev/null +++ b/mails/standard_replies/aol.eml @@ -0,0 +1,65 @@ +Content-Type: multipart/alternative; + boundary="===============7429987408351918371==" +MIME-Version: 1.0 +To: bob@example.com +Subject: Re: Test +From: Megan Odin +Message-Id: <8CEDEEFBEF4733B-1E5C-73DF@webmail-d070.sysops.aol.com> +Date: Mon, 2 Apr 2012 09:57:58 -0400 (EDT) + +--===============7429987408351918371== +Content-Type: text/plain; charset="us-ascii" 
+MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit + +Hello + + + +-----Original Message----- +From: bob +To: xxx ; xxx ; xxx ; xxx ; xxx ; xxx +Sent: Mon, Apr 2, 2012 5:49 pm +Subject: Test + + +Hi + + + +--===============7429987408351918371== +Content-Type: text/html; charset="us-ascii" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit + +Hello
+ +
+
+ +
-----Original Message-----
+From: bob <bob@example.com>
+To: xxx <xxx@gmail.com>; xxx <xxx@hotmail.com>; xxx <xxx@yahoo.com>; xxx <xxx@aol.com>; xxx <xxx@comcast.net>; xxx <xxx@nyc.rr.com>
+Sent: Mon, Apr 2, 2012 5:49 pm
+Subject: Test
+ +
+ + + + + + + +
+ +
Hi
+
+
+ + + + +
+
+--===============7429987408351918371==-- diff --git a/mails/standard_replies/apple_mail.eml b/mails/standard_replies/apple_mail.eml new file mode 100644 index 0000000..1adbc3b --- /dev/null +++ b/mails/standard_replies/apple_mail.eml @@ -0,0 +1,15 @@ +Content-Type: text/plain; charset=iso-8859-1 +Mime-Version: 1.0 (Apple Message framework v1257) +Subject: Re: Test +From: xxx +Date: Tue, 3 Apr 2012 16:55:26 +0400 +Content-Transfer-Encoding: 7bit +Message-Id: <9A1EA6A5-4FD3-4AD0-8DFD-2420E670DB53@gmail.com> +To: bob +X-Mailer: Apple Mail (2.1257) + +Hello + +On Apr 3, 2012, at 4:19 PM, bob wrote: + +> Hi diff --git a/mails/standard_replies/apple_mail_2.eml b/mails/standard_replies/apple_mail_2.eml new file mode 100644 index 0000000..a030311 --- /dev/null +++ b/mails/standard_replies/apple_mail_2.eml @@ -0,0 +1,19 @@ +Content-Type: text/plain; + charset=us-ascii +Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\)) +Subject: Re: Hello there +X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4 +From: Adam Renberg +In-Reply-To: +Date: Sat, 22 Aug 2015 19:22:20 +0200 +Content-Transfer-Encoding: 7bit +X-Smtp-Server: smtp.gmail.com:adam@tictail.com +Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com> +References: +To: Adam Renberg + +Hello +> On 22 Aug 2015, at 19:21, Adam Renberg wrote: +> +> Hi there! 
+ diff --git a/mails/standard_replies/comcast.eml b/mails/standard_replies/comcast.eml new file mode 100644 index 0000000..fe25d73 --- /dev/null +++ b/mails/standard_replies/comcast.eml @@ -0,0 +1,33 @@ +Content-Type: multipart/alternative; + boundary="===============3552566137977633461==" +MIME-Version: 1.0 +Date: Mon, 2 Apr 2012 13:56:12 +0000 (UTC) +From: xxx@comcast.net +To: bob@xxx.mailgun.org +Message-Id: <650787974.741595.1333374972389.JavaMail.root@sz0152a.westchester.pa.mail.comcast.net> +Subject: Re: Test +X-Mailer: Zimbra 6.0.13_GA_2944 (ZimbraWebClient - SAF3 (Linux)/6.0.13_GA_2944) + +--===============3552566137977633461== +MIME-Version: 1.0 +Content-Type: text/plain; charset="us-ascii" +Content-Transfer-Encoding: 7bit + +Hello + +----- Original Message ----- +From: bob@xxx.mailgun.org +To: xxx@gmail.com, xxx@hotmail.com, xxx@yahoo.com, xxx@aol.com, xxx@comcast.net, lsloan6@nyc.rr.com +Sent: Monday, April 2, 2012 5:44:22 PM +Subject: Test + +Hi + +--===============3552566137977633461== +MIME-Version: 1.0 +Content-Type: text/html; charset="us-ascii" +Content-Transfer-Encoding: 7bit + +
Hello


From: bob@xxx.mailgun.org
To: xxx@gmail.com, xxx@hotmail.com, xxx@yahoo.com, xxx@aol.com, xxx@comcast.net, lsloan6@nyc.rr.com
Sent: Monday, April 2, 2012 5:44:22 PM
Subject: Test

Hi
+--===============3552566137977633461==-- + diff --git a/mails/standard_replies/gmail.eml b/mails/standard_replies/gmail.eml new file mode 100644 index 0000000..99ece5b --- /dev/null +++ b/mails/standard_replies/gmail.eml @@ -0,0 +1,31 @@ +Content-Type: multipart/alternative; + boundary="===============3455449757443551301==" +MIME-Version: 1.0 +Date: Mon, 2 Apr 2012 20:21:52 +0400 +Message-Id: +Subject: Re: Test +From: Megan One +To: bob@example.com + +--===============3455449757443551301== +MIME-Version: 1.0 +Content-Type: text/plain; charset="us-ascii" +Content-Transfer-Encoding: 7bit + +Hello + +On Mon, Apr 2, 2012 at 6:26 PM, Megan One wrote: + +> Hi + +--===============3455449757443551301== +MIME-Version: 1.0 +Content-Type: text/html; charset="us-ascii" +Content-Transfer-Encoding: 7bit + +Hello

On Mon, Apr 2, 2012 at 6:26 PM, Megan One <xxx@gmail.com> wrote:
+Hi + +

+ +--===============3455449757443551301==-- diff --git a/mails/standard_replies/hotmail.eml b/mails/standard_replies/hotmail.eml new file mode 100644 index 0000000..636f218 --- /dev/null +++ b/mails/standard_replies/hotmail.eml @@ -0,0 +1,50 @@ +Content-Type: multipart/alternative; + boundary="===============5499446768842282638==" +MIME-Version: 1.0 +Message-Id: +From: Alexey Q +To: +Subject: RE: Test +Date: Mon, 2 Apr 2012 21:47:37 +0800 +X-Originalarrivaltime: 02 Apr 2012 13:47:37.0935 (UTC) + FILETIME=[2A6C0DF0:01CD10D7] + +--===============5499446768842282638== +MIME-Version: 1.0 +Content-Type: text/plain; charset="us-ascii" +Content-Transfer-Encoding: 7bit + + +Hello + +> Subject: Test +> From: bob@xxx.mailgun.org +> To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com +> Date: Mon, 2 Apr 2012 17:44:22 +0400 +> +> Hi + +--===============5499446768842282638== +MIME-Version: 1.0 +Content-Type: text/html; charset="us-ascii" +Content-Transfer-Encoding: 7bit + + + + +
+Hello

> Subject: Test
> From: bob@xxx.mailgun.org
> To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
> Date: Mon, 2 Apr 2012 17:44:22 +0400
>
> Hi
+ + +--===============5499446768842282638==-- diff --git a/mails/standard_replies/iphone.eml b/mails/standard_replies/iphone.eml new file mode 100644 index 0000000..320f8ac --- /dev/null +++ b/mails/standard_replies/iphone.eml @@ -0,0 +1,19 @@ +Subject: Re: Test +From: xxx +Content-Type: text/plain; + charset=us-ascii +X-Mailer: iPhone Mail (9B176) +Message-Id: <06C90B12-13B9-4C5F-A9EF-4A809D94C078@gmail.com> +Date: Tue, 3 Apr 2012 16:23:59 +0400 +To: bob +Content-Transfer-Encoding: quoted-printable +Mime-Version: 1.0 (1.0) + +Hello + +Sent from my iPhone + +On Apr 3, 2012, at 4:19 PM, bob wr= +ote: + +> Hi diff --git a/mails/standard_replies/iphone_reply_text b/mails/standard_replies/iphone_reply_text new file mode 100644 index 0000000..460d6d7 --- /dev/null +++ b/mails/standard_replies/iphone_reply_text @@ -0,0 +1,3 @@ +Hello + +Sent from my iPhone diff --git a/mails/standard_replies/outlook.eml b/mails/standard_replies/outlook.eml new file mode 100644 index 0000000..674828f --- /dev/null +++ b/mails/standard_replies/outlook.eml @@ -0,0 +1,85 @@ +Subject: Test +From: me@example.com +To: you@example.com +MIME-Version: 1.0 +Content-Type: multipart/alternative; boundary=0016364c440b2e8b63049acd5370 +X-Mailgun-Tag: tag +X-Mailgun-Mailing-List-Id: 1q + +--0016364c440b2e8b63049acd5370 +Content-Type: text/plain; charset=ISO-8859-1 + +Hello + +From: xxx@xxx.mailgun.org [mailto:xxx@xxx.mailgun.org] +Sent: March-09-12 4:22 PM +To: Dan Le +Subject: The manager has commented on your Loop + +Hi dan.le@example.com, + +The manager's comment: +"Hello Allan! Did you ask for some MIME? " + +Loop details: + +xxx at Dan +I'm not happy +"" + +Your Loop is here. + +We will be in touch again with any further updates, + +xxx + +If you did not sign up to receive emails from us you can use the link below to unsubscribe. We apologize for any inconvenience. + +Unsubscribe + +--0016364c440b2e8b63049acd5370 +Content-Type: text/html; charset=ISO-8859-1 + +

Allo! Follow up MIME!

 

From: xxx@xxx.mailgun.org [mailto:xxx@xxx.mailgun.org]
Sent: March-09-12 4:22 PM
To: Dan Le
Subject: The manager has commented on your Loop

 

Hi dan.le@example.com,

The manager's comment:
"Hello Allan! Did you ask for some MIME? "

Loop details:

xxx at Dan
I'm not happy
""

Your Loop is here.

We will be in touch again with any further updates,

xxx

If you did not sign up to receive emails from us you can use the link below to unsubscribe. We apologize for any inconvenience.

Unsubscribe

+ +--0016364c440b2e8b63049acd5370-- \ No newline at end of file diff --git a/mails/standard_replies/sparrow.eml b/mails/standard_replies/sparrow.eml new file mode 100644 index 0000000..b2a510d --- /dev/null +++ b/mails/standard_replies/sparrow.eml @@ -0,0 +1,61 @@ +Date: Tue, 3 Apr 2012 16:58:35 +0400 +From: xxx +To: bob +Message-ID: <5BB86EF4B6E24E4C9DA4BBEF59DA9809@gmail.com> +Subject: Re: Test +X-Mailer: sparrow 1.5 (build 1043) +MIME-Version: 1.0 +Content-Type: multipart/alternative; boundary="4f7af3fb_749abb43_300" + +--4f7af3fb_749abb43_300 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 7bit +Content-Disposition: inline + +Hello + +-- +xxx +Sent with Sparrow (http://www.sparrowmailapp.com/?sig) + + +On Tuesday, April 3, 2012 at 4:55 PM, xxx wrote: + +> Hello +> +> On Apr 3, 2012, at 4:19 PM, bob wrote: +> +> > Hi + + +--4f7af3fb_749abb43_300 +Content-Type: text/html; charset="utf-8" +Content-Transfer-Encoding: quoted-printable +Content-Disposition: inline + + +
+ Hello +
+

-- 
xx= +x
Sent with Sparrow

+ =20 +

On Tuesday, April 3, 2= +012 at 4:55 PM, xxx wrote:

+
+
Hello

O= +n Apr 3, 2012, at 4:19 PM, bob wrote:

Hi
+ =20 + =20 + =20 + =20 + + =20 +
+
+
+ +--4f7af3fb_749abb43_300-- diff --git a/mails/standard_replies/sparrow_reply_text b/mails/standard_replies/sparrow_reply_text new file mode 100644 index 0000000..0a8f078 --- /dev/null +++ b/mails/standard_replies/sparrow_reply_text @@ -0,0 +1,5 @@ +Hello + +-- +xxx +Sent with Sparrow (http://www.sparrowmailapp.com/?sig) \ No newline at end of file diff --git a/mails/standard_replies/thunderbird.eml b/mails/standard_replies/thunderbird.eml new file mode 100644 index 0000000..e74e69d --- /dev/null +++ b/mails/standard_replies/thunderbird.eml @@ -0,0 +1,15 @@ +MIME-Version: 1.0 +Message-Id: <4F79B73C.9030506@xxx.mailgun.org> +Date: Mon, 02 Apr 2012 18:27:08 +0400 +From: bob +User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; + rv:1.9.2.28) Gecko/20120313 Thunderbird/3.1.20 +To: Megan One +Subject: Re: Test +Sender: bob@xxx.mailgun.org +Content-Type: text/plain; charset="us-ascii"; format="flowed" +Content-Transfer-Encoding: 7bit + +On 04/02/2012 06:26 PM, Megan One wrote: +> Hi +Hello \ No newline at end of file diff --git a/mails/standard_replies/yahoo.eml b/mails/standard_replies/yahoo.eml new file mode 100644 index 0000000..4969255 --- /dev/null +++ b/mails/standard_replies/yahoo.eml @@ -0,0 +1,22 @@ +Content-Type: text/plain; charset="us-ascii" +MIME-Version: 1.0 +X-Mailer: YahooMailWebService/0.8.117.340979 +Message-Id: <1333374330.68772.YahooMailNeo@web114411.mail.gq1.yahoo.com> +Date: Mon, 2 Apr 2012 06:45:30 -0700 (PDT) +From: Alex Q +Subject: Re: Test +To: "bob@xxx.mailgun.org" +In-Reply-To: <1333374262.7063.15.camel@mg5> +Content-Transfer-Encoding: 7bit + +Hello + + +----- Original Message ----- +From: "bob@xxx.mailgun.org" +To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com +Cc: +Sent: Monday, April 2, 2012 5:44 PM +Subject: Test + +Hi diff --git a/test.py b/test.py index c6a1c7d..703e349 100644 --- a/test.py +++ b/test.py @@ -1,9 +1,16 @@ import json import os +import re import unittest +from unittest.mock 
import Mock, patch +from extract_raw_content import constants, html, text, utils from mail_parser import serialize_mail +STANDARD_REPLIES = "mails/standard_replies" +RE_WHITESPACE = re.compile(r"\s") +RE_DOUBLE_WHITESPACE = re.compile(r"\s") + def get_email_as_bytes(name): with open( @@ -42,6 +49,1222 @@ def test_html_only(self): self.assertTrue(manifest["text"]["content"]) self.assertTrue(manifest["text"]["html_content"]) + def test_get_delimiter(self): + self.assertEqual("\r\n", text.get_delimiter("abc\r\n123")) + self.assertEqual("\n", text.get_delimiter("abc\n123")) + self.assertEqual("\n", text.get_delimiter("abc")) + + def test_html_to_text(self): + html = """ +

Hello world!

+
+
    +
  • One!
  • +
  • Two
  • +
+

+Haha +

+""" + text = utils.html_to_text(html) + self.assertEqual("Hello world! \n\n * One! \n * Two \nHaha", text) + self.assertEqual("привет!", utils.html_to_text("привет!")) + + html = "

Hi" + self.assertEqual("Hi", utils.html_to_text(html)) + + html = """Hi + + +""" + self.assertEqual("Hi", utils.html_to_text(html)) + + html = """
+ +TEXT 1 +

TEXT 2

+
""" + self.assertEqual("TEXT 1 \nTEXT 2", utils.html_to_text(html)) + + def test_comment_no_parent(self): + s = " no comment" + d = html.html_document_fromstring(s) + self.assertEqual("no comment", utils.html_tree_to_text(d)) + + @patch.object(utils, "html_fromstring", Mock(return_value=None)) + def test_bad_html_to_text(self): + bad_html = "one
two
three" + self.assertEqual(None, utils.html_to_text(bad_html)) + + def test_quotation_splitter_inside_blockquote(self): + msg_body = """Reply +
+ +
+ On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: +
+ +
+ Test +
+ +
""" + + self.assertEqual( + "Reply", + RE_WHITESPACE.sub("", html.extract_from_html(msg_body)), + ) + + def test_quotation_splitter_outside_blockquote(self): + msg_body = """Reply + +
+On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: +
+ +
+
+ Test +
+
+""" + self.assertEqual( + "Reply", + RE_WHITESPACE.sub("", html.extract_from_html(msg_body)), + ) + + def test_regular_blockquote(self): + msg_body = """Reply +
Regular
+ +
+On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: +
+ +
+
+
Nested
+
+
+""" + self.assertEqual( + "Reply" + + "
Regular
", + RE_WHITESPACE.sub("", html.extract_from_html(msg_body)), + ) + + def test_no_blockquote(self): + msg_body = """ + + +Reply + +
+On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: +
+ +
+Test +
+ + +""" + + reply = """ + + + +Reply + +""" + self.assertEqual( + RE_WHITESPACE.sub("", reply), + RE_WHITESPACE.sub("", html.extract_from_html(msg_body)), + ) + + def test_empty_body(self): + self.assertEqual("", html.extract_from_html("")) + + def test_validate_output_html(self): + msg_body = """Reply +
+On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: + +
+
+ Test +
+
+
+ +
+""" + out = html.extract_from_html(msg_body) + self.assertTrue( + "" in out and "" in out, + "Invalid HTML - / tag not present", + ) + self.assertTrue( + "
" not in out, "Invalid HTML output -
element is not valid" + ) + + def test_gmail_quote(self): + msg_body = """Reply +
+
+ On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: +
+ Test +
+
+
""" + self.assertEqual( + "Reply", + RE_WHITESPACE.sub("", html.extract_from_html(msg_body)), + ) + + def test_gmail_quote_compact(self): + msg_body = ( + "Reply" + '
' + '
' + + "On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:" + "
Test
" + "
" + "
" + ) + self.assertEqual( + "Reply", + RE_WHITESPACE.sub("", html.extract_from_html(msg_body)), + ) + + def test_gmail_quote_blockquote(self): + msg_body = """Message +
+
+ My name is William Shakespeare. +
+
+
""" + self.assertEqual( + RE_WHITESPACE.sub("", msg_body), + RE_WHITESPACE.sub("", html.extract_from_html(msg_body)), + ) + + def test_blockquote_disclaimer(self): + msg_body = """ + + +
+
+ message +
+
+ Quote +
+
+
+ disclaimer +
+ + +""" + + stripped_html = """ + + + +
+
+ message +
+
+
+ disclaimer +
+ + +""" + self.assertEqual( + RE_WHITESPACE.sub("", stripped_html), + RE_WHITESPACE.sub("", html.extract_from_html(msg_body)), + ) + + def test_date_block(self): + msg_body = """ +
+message
+
+
+ Date: Fri, 23 Mar 2012 12:35:31 -0600
+ To: bob@example.com
+ From: rob@example.com
+ Subject: You Have New Mail From Mary!

+ + text +
+
+""" + self.assertEqual( + "
message
", + RE_WHITESPACE.sub("", html.extract_from_html(msg_body)), + ) + + def test_from_block(self): + msg_body = """
+message
+
+
+From: bob@example.com
+Date: Fri, 23 Mar 2012 12:35:31 -0600
+To: rob@example.com
+Subject: You Have New Mail From Mary!

+ +text +
+""" + self.assertEqual( + "
message
", + RE_WHITESPACE.sub("", html.extract_from_html(msg_body)), + ) + + def test_reply_shares_div_with_from_block(self): + msg_body = """ + +
+ + Blah

+ +
Date: Tue, 22 May 2012 18:29:16 -0600
+ To: xx@hotmail.ca
+ From: quickemail@ashleymadison.com
+ Subject: You Have New Mail From x!

+ +
+""" + self.assertEqual( + "
Blah

", + RE_WHITESPACE.sub("", html.extract_from_html(msg_body)), + ) + + def test_reply_quotations_share_block(self): + stripped_html = text.extract_from_plain( + get_email_as_bytes("reply-quotations-share-block.eml").decode("utf-8") + ) + self.assertTrue(stripped_html) + self.assertTrue("From" not in stripped_html) + + def test_OLK_SRC_BODY_SECTION_stripped(self): + self.assertEqual( + "
Reply
", + RE_WHITESPACE.sub( + "", + html.extract_from_html(get_email_as_bytes("OLK_SRC_BODY_SECTION.html")), + ), + ) + + def test_reply_separated_by_hr(self): + self.assertEqual( + "
Hi
there
", + RE_WHITESPACE.sub( + "", + html.extract_from_html( + get_email_as_bytes("reply-separated-by-hr.html") + ), + ), + ) + + def test_from_block_and_quotations_in_separate_divs(self): + msg_body = """ +Reply +
+
+
+ + From: bob@example.com + Date: Thu, 24 Mar 2016 08:07:12 -0700 + +
+
+ Quoted message +
+
+""" + self.assertEqual( + "Reply

", + RE_WHITESPACE.sub("", html.extract_from_html(msg_body)), + ) + + def extract_reply_and_check(self, filename): + kwargs = {} + kwargs["encoding"] = "utf8" + + with open(filename, **kwargs) as f: + msg_body = f.read() + reply = html.extract_from_html(msg_body) + plain_reply = utils.html_to_text(reply) + + self.assertEqual( + RE_WHITESPACE.sub("", "Hi. I am fine.\n\nThanks,\nAlex"), + RE_WHITESPACE.sub("", plain_reply), + ) + + def test_CRLF(self): + """CR is not converted to ' '""" + symbol = " " + extracted = html.extract_from_html("\r\n") + self.assertFalse(symbol in extracted) + self.assertEqual("", RE_WHITESPACE.sub("", extracted)) + + msg_body = """My +reply +
+ +
+ On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: +
+ +
+ Test +
+ +
""" + msg_body = msg_body.replace("\n", "\r\n") + extracted = html.extract_from_html(msg_body) + self.assertFalse(symbol in extracted) + # Keep new lines otherwise "My reply" becomes one word - "Myreply" + self.assertEqual( + "My\nreply\n", extracted + ) + + def test_gmail_forwarded_msg(self): + msg_body = ( + '

' + + '
---------- Forwarded message ----------
' + + 'From: Bob ' + + '<bob@example.com>' + + "
Date: Fri, Feb 11, 2010 at 5:59 PM
" + + "Subject: Bob WFH today
To: Mary <" + + '' + + 'mary@example.com>


eom
' + + "

" + ) + extracted = html.extract_from_html(msg_body) + self.assertEqual( + RE_WHITESPACE.sub("", msg_body), RE_WHITESPACE.sub("", extracted) + ) + + def test_readable_html_empty(self): + msg_body = """ +
+Reply +
+ On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: +
+ +
+ Test +
+ +
""" + + self.assertEqual( + RE_WHITESPACE.sub("", msg_body), + RE_WHITESPACE.sub("", html.extract_from_html(msg_body)), + ) + + @patch.object(html, "html_document_fromstring", Mock(return_value=None)) + def test_bad_html(self): + bad_html = "" + self.assertEqual(bad_html, html.extract_from_html(bad_html)) + + def test_gmail_reply(self): + self.extract_reply_and_check("mails/html_replies/gmail.html") + + def test_mail_ru_reply(self): + self.extract_reply_and_check("mails/html_replies/mail_ru.html") + + def test_hotmail_reply(self): + self.extract_reply_and_check("mails/html_replies/hotmail.html") + + def test_ms_outlook_2003_reply(self): + self.extract_reply_and_check("mails/html_replies/ms_outlook_2003.html") + + def test_ms_outlook_2007_reply(self): + self.extract_reply_and_check("mails/html_replies/ms_outlook_2007.html") + + def test_ms_outlook_2010_reply(self): + self.extract_reply_and_check("mails/html_replies/ms_outlook_2010.html") + + def test_thunderbird_reply(self): + self.extract_reply_and_check("mails/html_replies/thunderbird.html") + + def test_windows_mail_reply(self): + self.extract_reply_and_check("mails/html_replies/windows_mail.html") + + def test_yandex_ru_reply(self): + self.extract_reply_and_check("mails/html_replies/yandex_ru.html") + + @patch.object(constants, "MAX_LINES_COUNT", 1) + def test_too_many_lines(self): + msg_body = """Test reply +Hi +-----Original Message----- + +Test""" + self.assertEqual("Test reply", text.extract_from_plain(msg_body)) + + def test_pattern_on_date_somebody_wrote(self): + msg_body = """Test reply + +On 11-Apr-2011, at 6:54 PM, Roman Tkachenko wrote: + +> +> Test +> +> Roman""" + + self.assertEqual("Test reply", text.extract_from_plain(msg_body)) + + def test_pattern_on_date_polymail(self): + msg_body = """Test reply + +On Tue, Apr 11, 2017 at 10:07 PM John Smith + +< +mailto:John Smith +> wrote: +Test quoted data +""" + + self.assertEqual("Test reply", text.extract_from_plain(msg_body)) + + def 
test_pattern_sent_from_samsung_smb_wrote(self): + msg_body = """Test reply + +Sent from Samsung MobileName wrote: + +> +> Test +> +> Roman""" + + self.assertEqual("Test reply", text.extract_from_plain(msg_body)) + + def test_pattern_on_date_wrote_somebody(self): + self.assertEqual( + "Lorem", + text.extract_from_plain( + """Lorem + +Op 13-02-2014 3:18 schreef Julius Caesar : + +Veniam laborum mlkshk kale chips authentic. +Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up +freegan enim master cleanse. +""" + ), + ) + + def test_pattern_on_date_somebody_wrote_date_with_slashes(self): + msg_body = """Test reply + +On 04/19/2011 07:10 AM, Roman Tkachenko wrote: + +> +> Test. +> +> Roman""" + self.assertEqual("Test reply", text.extract_from_plain(msg_body)) + + def test_date_time_email_splitter(self): + msg_body = """Test reply + +2014-10-17 11:28 GMT+03:00 Postmaster < +postmaster@sandboxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.mailgun.org>: + +> First from site +> + """ + self.assertEqual("Test reply", text.extract_from_plain(msg_body)) + + def test_pattern_on_date_somebody_wrote_allows_space_in_front(self): + msg_body = """Thanks Thanmai +On Mar 8, 2012 9:59 AM, "Example.com" < +r+7f1b094ceb90e18cca93d53d3703feae@example.com> wrote: + + +>** +> Blah-blah-blah""" + self.assertEqual("Thanks Thanmai", text.extract_from_plain(msg_body)) + + def test_pattern_on_date_somebody_sent(self): + msg_body = """Test reply + +On 11-Apr-2011, at 6:54 PM, Roman Tkachenko sent: + +> +> Test +> +> Roman""" + self.assertEqual("Test reply", text.extract_from_plain(msg_body)) + + def test_appointment(self): + msg_body = """Response + + 10/19/2017 @ 9:30 am for physical therapy + Bla + 1517 4th Avenue Ste 300 + London CA 19129, 555-421-6780 + + John Doe, FCLS + Mailgun Inc + 555-941-0697 + + From: from@example.com [mailto:from@example.com] + Sent: Wednesday, October 18, 2017 2:05 PM + To: John Doer - SIU + Subject: RE: Claim # 5551188-1 + + Text""" + + expected = """Response + + 
10/19/2017 @ 9:30 am for physical therapy + Bla + 1517 4th Avenue Ste 300 + London CA 19129, 555-421-6780 + + John Doe, FCLS + Mailgun Inc + 555-941-0697""" + self.assertEqual(expected, text.extract_from_plain(msg_body)) + + def test_line_starts_with_on(self): + msg_body = """Blah-blah-blah +On blah-blah-blah""" + self.assertEqual(msg_body, text.extract_from_plain(msg_body)) + + def test_reply_and_quotation_splitter_share_line(self): + # reply lines and 'On wrote:' splitter pattern + # are on the same line + msg_body = """reply On Wed, Apr 4, 2012 at 3:59 PM, bob@example.com wrote: +> Hi""" + self.assertEqual("reply", text.extract_from_plain(msg_body)) + + # test pattern '--- On wrote:' with reply text on + # the same line + msg_body = """reply--- On Wed, Apr 4, 2012 at 3:59 PM, me@domain.com wrote: +> Hi""" + self.assertEqual("reply", text.extract_from_plain(msg_body)) + + # test pattern '--- On wrote:' with reply text containing + # '-' symbol + msg_body = """reply +bla-bla - bla--- On Wed, Apr 4, 2012 at 3:59 PM, me@domain.com wrote: +> Hi""" + reply = """reply +bla-bla - bla""" + + self.assertEqual(reply, text.extract_from_plain(msg_body)) + + def test_android_wrote(self): + msg_body = """Test reply + +---- John Smith wrote ---- + +> quoted +> text +""" + self.assertEqual("Test reply", text.extract_from_plain(msg_body)) + + def test_reply_wraps_quotations(self): + msg_body = """Test reply + +On 04/19/2011 07:10 AM, Roman Tkachenko wrote: + +> +> Test + +Regards, Roman""" + + reply = """Test reply + +Regards, Roman""" + + self.assertEqual(reply, text.extract_from_plain(msg_body)) + + def test_reply_wraps_nested_quotations(self): + msg_body = """Test reply +On 04/19/2011 07:10 AM, Roman Tkachenko wrote: + +>Test test +>On 04/19/2011 07:10 AM, Roman Tkachenko wrote: +> +>> +>> Test. 
+>> +>> Roman + +Regards, Roman""" + + reply = """Test reply +Regards, Roman""" + self.assertEqual(reply, text.extract_from_plain(msg_body)) + + def test_quotation_separator_takes_2_lines(self): + msg_body = """Test reply + +On Fri, May 6, 2011 at 6:03 PM, Roman Tkachenko from Hacker News + wrote: + +> Test. +> +> Roman + +Regards, Roman""" + + reply = """Test reply + +Regards, Roman""" + self.assertEqual(reply, text.extract_from_plain(msg_body)) + + def test_quotation_separator_takes_3_lines(self): + msg_body = """Test reply + +On Nov 30, 2011, at 12:47 PM, Somebody < +416ffd3258d4d2fa4c85cfa4c44e1721d66e3e8f4@somebody.domain.com> +wrote: + +Test message +""" + self.assertEqual("Test reply", text.extract_from_plain(msg_body)) + + def test_short_quotation(self): + msg_body = """Hi + +On 04/19/2011 07:10 AM, Roman Tkachenko wrote: + +> Hello""" + self.assertEqual("Hi", text.extract_from_plain(msg_body)) + + def test_with_indent(self): + msg_body = """ +YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin. + +------On 12/29/1987 17:32 PM, Julius Caesar wrote----- + +Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur. + """ + self.assertEqual( + "YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", + text.extract_from_plain(msg_body), + ) + + def test_short_quotation_with_newline(self): + msg_body = """Btw blah blah... + +On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" wrote: + +Hi Mark, +Blah blah?  +Thanks,Christine  + +On Jan 27, 2015, at 11:55 AM, Mark XXX wrote: + +Lorem ipsum? +Mark + +Sent from Acompli""" + self.assertEqual("Btw blah blah...", text.extract_from_plain(msg_body)) + + def test_pattern_date_email_with_unicode(self): + msg_body = """Replying ok +2011/4/7 Nathan \xd0\xb8ova + +> Cool beans, scro""" + self.assertEqual("Replying ok", text.extract_from_plain(msg_body)) + + def test_english_from_block(self): + self.assertEqual( + "Allo! 
Follow up MIME!", + text.extract_from_plain( + """Allo! Follow up MIME! + +From: somebody@example.com +Sent: March-19-11 5:42 PM +To: Somebody +Subject: The manager has commented on your Loop + +Blah-blah-blah +""" + ), + ) + + def test_german_from_block(self): + self.assertEqual( + "Allo! Follow up MIME!", + text.extract_from_plain( + """Allo! Follow up MIME! + +Von: somebody@example.com +Gesendet: Dienstag, 25. November 2014 14:59 +An: Somebody +Betreff: The manager has commented on your Loop + +Blah-blah-blah +""" + ), + ) + + def test_french_multiline_from_block(self): + self.assertEqual( + "Lorem ipsum", + text.extract_from_plain( + """Lorem ipsum + +De : Brendan xxx [mailto:brendan.xxx@xxx.com] +Envoyé : vendredi 23 janvier 2015 16:39 +À : Camille XXX +Objet : Follow Up + +Blah-blah-blah +""" + ), + ) + + def test_french_from_block(self): + self.assertEqual( + "Lorem ipsum", + text.extract_from_plain( + """Lorem ipsum + + Le 23 janv. 2015 à 22:03, Brendan xxx + > a écrit: + + Bonjour!""" + ), + ) + + def test_polish_from_block(self): + self.assertEqual( + "Lorem ipsum", + text.extract_from_plain( + """Lorem ipsum + +W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx +napisał: + +Blah! +""" + ), + ) + + def test_danish_from_block(self): + self.assertEqual( + "Allo! Follow up MIME!", + text.extract_from_plain( + """Allo! Follow up MIME! + +Fra: somebody@example.com +Sendt: 19. march 2011 12:10 +Til: Somebody +Emne: The manager has commented on your Loop + +Blah-blah-blah +""" + ), + ) + + def test_swedish_from_block(self): + self.assertEqual( + "Allo! Follow up MIME!", + text.extract_from_plain( + """Allo! Follow up MIME! 
+Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com] +Skickat: den 26 augusti 2015 14:45 +Till: Isacson Leiff +Ämne: RE: Week 36 + +Blah-blah-blah +""" + ), + ) + + def test_swedish_from_line(self): + self.assertEqual( + "Lorem", + text.extract_from_plain( + """Lorem +Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev: + +Veniam laborum mlkshk kale chips authentic. +Normcore mumblecore laboris, fanny pack +readymade eu blog chia pop-up freegan enim master cleanse. +""" + ), + ) + + def test_norwegian_from_line(self): + self.assertEqual( + "Lorem", + text.extract_from_plain( + """Lorem +På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev: + +Veniam laborum mlkshk kale chips authentic. +Normcore mumblecore laboris, fanny pack +readymade eu blog chia pop-up freegan enim master cleanse. +""" + ), + ) + + def test_dutch_from_block(self): + self.assertEqual( + "Gluten-free culpa lo-fi et nesciunt nostrud.", + text.extract_from_plain( + """Gluten-free culpa lo-fi et nesciunt nostrud. + +Op 17-feb.-2015, om 13:18 heeft Julius Caesar + het volgende geschreven: + +Small batch beard laboris tempor, non listicle hella Tumblr heirloom. +""" + ), + ) + + def test_vietnamese_from_block(self): + self.assertEqual( + "Hello", + text.extract_from_plain( + """Hello + +Vào 14:24 8 tháng 6, 2017, Hùng Nguyễn đã viết: + +> Xin chào +""" + ), + ) + + def test_quotation_marker_false_positive(self): + msg_body = """Visit us now for assistance... 
+>>> >>> http://www.domain.com <<< +Visit our site by clicking the link above""" + self.assertEqual(msg_body, text.extract_from_plain(msg_body)) + + def test_link_closed_with_quotation_marker_on_new_line(self): + msg_body = """8.45am-1pm + +From: somebody@example.com +Date: Wed, 16 May 2012 00:15:02 -0600 + + > + +Requester: """ + self.assertEqual("8.45am-1pm", text.extract_from_plain(msg_body)) + + def test_link_breaks_quotation_markers_sequence(self): + # link starts and ends on the same line + msg_body = """Blah + +On Thursday, October 25, 2012 at 3:03 PM, life is short. on Bob wrote: + +> +> Post a response by replying to this email +> +(http://example.com/c/YzOTYzMmE) > +> life is short. (http://example.com/c/YzMmE) +> +""" + self.assertEqual("Blah", text.extract_from_plain(msg_body)) + + # link starts after some text on one line and ends on another + msg_body = """Blah + +On Monday, 24 September, 2012 at 3:46 PM, bob wrote: + +> [Ticket #50] test from bob +> +> View ticket (http://example.com/action +_nonce=3dd518) +> +""" + self.assertEqual("Blah", text.extract_from_plain(msg_body)) + + def test_from_block_starts_with_date(self): + msg_body = """Blah + +Date: Wed, 16 May 2012 00:15:02 -0600 +To: klizhentas@example.com + +""" + self.assertEqual("Blah", text.extract_from_plain(msg_body)) + + def test_bold_from_block(self): + msg_body = """Hi + +*From:* bob@example.com [mailto: +bob@example.com] +*Sent:* Wednesday, June 27, 2012 3:05 PM +*To:* travis@example.com +*Subject:* Hello + +""" + self.assertEqual("Hi", text.extract_from_plain(msg_body)) + + def test_weird_date_format_in_date_block(self): + msg_body = """Blah +Date: Fri=2C 28 Sep 2012 10:55:48 +0000 +From: tickets@example.com +To: bob@example.com +Subject: [Ticket #8] Test + +""" + self.assertEqual("Blah", text.extract_from_plain(msg_body)) + + def test_dont_parse_quotations_for_forwarded_messages(self): + msg_body = """FYI + +---------- Forwarded message ---------- +From: bob@example.com +Date: Tue, 
Sep 4, 2012 at 1:35 PM +Subject: Two +line subject +To: rob@example.com + +Text""" + self.assertEqual(msg_body, text.extract_from_plain(msg_body)) + + def test_forwarded_message_in_quotations(self): + msg_body = """Blah + +-----Original Message----- + +FYI + +---------- Forwarded message ---------- +From: bob@example.com +Date: Tue, Sep 4, 2012 at 1:35 PM +Subject: Two +line subject +To: rob@example.com + +""" + self.assertEqual("Blah", text.extract_from_plain(msg_body)) + + def test_mark_message_lines(self): + # e - empty line + # s - splitter line + # m - line starting with quotation marker '>' + # t - the rest + + lines = [ + "Hello", + "", + # next line should be marked as splitter + "_____________", + "From: foo@bar.com", + "Date: Wed, 16 May 2012 00:15:02 -0600", + "", + "> Hi", + "", + "Signature", + ] + self.assertEqual("tesssemet", text.mark_message_lines(lines)) + + lines = [ + "Just testing the email reply", + "", + "Robert J Samson", + "Sent from my iPhone", + "", + # all 3 next lines should be marked as splitters + "On Nov 30, 2011, at 12:47 PM, Skapture <", + ( + "416ffd3258d4d2fa4c85cfa4c44e1721d66e3e8f4" + "@skapture-staging.mailgun.org>" + ), + "wrote:", + "", + "Tarmo Lehtpuu has posted the following message on", + ] + self.assertEqual("tettessset", text.mark_message_lines(lines)) + + def test_process_marked_lines(self): + # quotations and last message lines are mixed + # consider all to be a last message + markers = "tsemmtetm" + lines = [str(i) for i in range(len(markers))] + lines = [str(i) for i in range(len(markers))] + + self.assertEqual(lines, text.process_marked_lines(lines, markers)) + + # no splitter => no markers + markers = "tmm" + lines = ["1", "2", "3"] + self.assertEqual(["1", "2", "3"], text.process_marked_lines(lines, markers)) + + # text after splitter without markers is quotation + markers = "tst" + lines = ["1", "2", "3"] + self.assertEqual(["1"], text.process_marked_lines(lines, markers)) + + # message + quotation + signature 
+ markers = "tsmt" + lines = ["1", "2", "3", "4"] + self.assertEqual(["1", "4"], text.process_marked_lines(lines, markers)) + + # message + + nested quotation + markers = "tstsmt" + lines = ["1", "2", "3", "4", "5", "6"] + self.assertEqual(["1"], text.process_marked_lines(lines, markers)) + + # test links wrapped with paranthesis + # link starts on the marker line + markers = "tsmttem" + lines = [ + "text", + "splitter", + ">View (http://example.com", + "/abc", + ")", + "", + "> quote", + ] + self.assertEqual(lines[:1], text.process_marked_lines(lines, markers)) + + # link starts on the new line + markers = "tmmmtm" + lines = [ + "text", + ">" ">", + ">", + "(http://example.com) > ", + "> life is short. (http://example.com) ", + ] + self.assertEqual(lines[:1], text.process_marked_lines(lines, markers)) + + # check all "inline" replies + markers = "tsmtmtm" + lines = [ + "text", + "splitter", + ">", + "(http://example.com)", + ">", + "inline reply", + ">", + ] + self.assertEqual(lines, text.process_marked_lines(lines, markers)) + + # inline reply with link not wrapped in paranthesis + markers = "tsmtm" + lines = [ + "text", + "splitter", + ">", + "inline reply with link http://example.com", + ">", + ] + self.assertEqual(lines, text.process_marked_lines(lines, markers)) + + # inline reply with link wrapped in paranthesis + markers = "tsmtm" + lines = ["text", "splitter", ">", "inline reply (http://example.com)", ">"] + self.assertEqual(lines, text.process_marked_lines(lines, markers)) + + def test_preprocess(self): + msg = ( + "Hello\n" + "See for more\n" + "information On Nov 30, 2011, at 12:47 PM, Somebody <\n" + "416ffd3258d4d2fa4c85cfa4c44e1721d66e3e8f4\n" + "@example.com>" + "wrote:\n" + "\n" + "> Hi" + ) + + # test the link is rewritten + # 'On wrote:' pattern starts from a new line + prepared_msg = ( + "Hello\n" + "See @@http://google.com\n" + "@@ for more\n" + "information\n" + " On Nov 30, 2011, at 12:47 PM, Somebody <\n" + 
"416ffd3258d4d2fa4c85cfa4c44e1721d66e3e8f4\n" + "@example.com>" + "wrote:\n" + "\n" + "> Hi" + ) + self.assertEqual(prepared_msg, utils.preprocess(msg, "\n")) + + msg = """ +> MxNjQ4Y2RmOTNlMCZyPXNlcmdleS5v**YnlraG92JTQwbWFpbGd1bmhxLmNvbS** + +> Z0PSUyQSZkPWUwY2U + """ + self.assertEqual(msg, utils.preprocess(msg, "\n")) + + # 'On wrote' shouldn't be spread across too many lines + msg = ( + "Hello\n" + "How are you? On Nov 30, 2011, at 12:47 PM,\n " + "Example <\n" + "416ffd3258d4d2fa4c85cfa4c44e1721d66e3e8f4\n" + "@example.org>" + "wrote:\n" + "\n" + "> Hi" + ) + self.assertEqual(msg, utils.preprocess(msg, "\n")) + + msg = "Hello On Nov 30, smb wrote:\n" "Hi\n" "On Nov 29, smb wrote:\n" "hi" + + prepared_msg = ( + "Hello\n" " On Nov 30, smb wrote:\n" "Hi\n" "On Nov 29, smb wrote:\n" "hi" + ) + + self.assertEqual(prepared_msg, utils.preprocess(msg, "\n")) + + def test_preprocess_postprocess_2_links(self): + msg_body = " " + self.assertEqual(msg_body, text.extract_from_plain(msg_body)) + + def test_feedback_below_left_unparsed(self): + msg_body = """Please enter your feedback below. Thank you. + +------------------------------------- +Enter Feedback Below +------------------------------------- + +The user experience was unparallelled. Please continue production. 
+I'm sending payment to ensure +that this line is intact.""" + + parsed = text.extract_from_plain(msg_body) + self.assertEqual(msg_body, parsed) + if __name__ == "__main__": unittest.main(verbosity=2) From d334265ed370109125180413df9893db63df5b6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Orze=C5=82?= Date: Tue, 23 Apr 2024 15:15:59 +0200 Subject: [PATCH 6/6] =?UTF-8?q?Usuni=C4=99cie=20pustych=20linii?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extract_raw_content/constants.py | 1 - extract_raw_content/html_quotations.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/extract_raw_content/constants.py b/extract_raw_content/constants.py index 3e07a94..e5da66b 100644 --- a/extract_raw_content/constants.py +++ b/extract_raw_content/constants.py @@ -1,6 +1,5 @@ import re - MAX_LINES_COUNT = 1000 SPLITTER_MAX_LINES = 6 _MAX_TAGS_COUNT = 419 diff --git a/extract_raw_content/html_quotations.py b/extract_raw_content/html_quotations.py index b361661..f312eac 100644 --- a/extract_raw_content/html_quotations.py +++ b/extract_raw_content/html_quotations.py @@ -1,4 +1,5 @@ import re + from lxml.cssselect import CSSSelector CHECKPOINT_PREFIX = "#!%!"