class EmailReplyParser(object):
    """ Represents a email message that is parsed.

        An instance is bound to one language; the per-language keyword table
        is loaded from languages_support.json, located next to this module.
    """

    def __init__(self, language='en'):
        """ Loads the keyword table and selects the parsing language

            language - Language code; when it is not a key of the loaded
                       table the parser falls back to 'en'

            Raises OSError if languages_support.json cannot be opened and
            ValueError (json.JSONDecodeError) if it is not valid JSON
        """
        # abspath + join instead of `dirname(__file__) + "/..."`: when
        # __file__ is a bare relative name, dirname() returns '' and the
        # concatenation would point at the filesystem root.
        package_dir = os.path.dirname(os.path.abspath(__file__))
        table_path = os.path.join(package_dir, "languages_support.json")
        # JSON text is UTF-8; do not depend on the platform locale encoding.
        with open(table_path, "r", encoding="utf-8") as read_file:
            self.words_map = json.load(read_file)
        self.language = language if language in self.words_map else 'en'

    def read(self, text):
        """ Factory method that splits email into list of fragments

            text - A string email body

            Returns an EmailMessage instance
        """
        return EmailMessage(text, self.language, self.words_map).read()

    def parse_reply(self, text):
        """ Provides the reply portion of email.

            text - A string email body

            Returns reply body message
        """
        return self.read(text).reply
def __init__(self, text, language, words_map):
    """ Prepares parsing state for one email body and builds the regexes

        text - A string email body; '\\r\\n' line endings are replaced by '\\n'
        language - Language code used to index words_map and to look up the
                   matching <language>_support method in set_regex()
        words_map - Mapping of language code to its localized keywords
    """
    self.fragments = []
    self.fragment = None
    self.text = text.replace('\r\n', '\n')
    self.found_visible = False
    # Placeholders so every attribute exists before set_regex() fills them in.
    self.SIG_REGEX = None
    self.QUOTE_HDR_REGEX = None
    self.QUOTED_REGEX = None
    self.HEADER_REGEX = None
    self._MULTI_QUOTE_HDR_REGEX = None
    self.MULTI_QUOTE_HDR_REGEX = None
    self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None
    self.words_map = words_map
    self.language = language
    self.default_language = 'en'
    self.set_regex()


def default_quoted_header(self):
    """ Sets QUOTED_REGEX and the localized HEADER_REGEX

        Reads the 'From', 'Sent' and 'To' keywords of the current language
        from words_map.
    """
    words = self.words_map[self.language]
    self.QUOTED_REGEX = re.compile(r'(>+)')
    # NOTE(review): keywords are interpolated as-is (not re.escape'd);
    # presumably the JSON values are plain words or deliberate regex
    # fragments - confirm against languages_support.json.
    self.HEADER_REGEX = re.compile(
        r'^[* ]?(' + '|'.join((words['From'], words['Sent'], words['To']))
        + r')\s*:\*? .+|.+(mailto:).+'
    )


def warnings(self):
    """ Sets WARNING_REGEX

        The pattern is one of a fixed list of English caution /
        confidentiality phrases, followed by a space and any run of the
        listed text characters.
    """
    self.WARNING_REGEX = re.compile(
        r'(CAUTION:|NOTICE:|Confidentiality Notice:|Please do not reply|This electronic mail'
        r'|Disclaimer: This message is intended'
        r'|This message and any attachments are solely'
        r'|This email contains privileged information'
        r'|The information contained|This email has been scanned|This message and any associated files'
        r'|This email and any files transmitted|This message is for the recipients'
        r'|The information provided within this communication'
        # Parentheses must be escaped to match the literal text; they are
        # optional so the un-parenthesised wording still matches as before.
        r'|This message \(?including any attachments\)? is intended'
        r'|The [cC]ontents are confidential|This communication with its contents'
        r'|Please consider the environment before printing this email) [a-zA-Z0-9:;.,?!()@&/\'\"\“\” \-]*'
    )


def nl_support(self):
    """ Sets the signature and quote-header patterns for Dutch ("Op ... schreef ...>:") """
    sent_from = self.words_map[self.language]['Sent from']
    self.SIG_REGEX = re.compile(
        r'(--|__|-\w)|(^' + sent_from + r'(\w+\s*){1,3})'
    )
    self.QUOTE_HDR_REGEX = re.compile('Op.*schreef.*>:$')
    self._MULTI_QUOTE_HDR_REGEX = r'(?!Op.*Op\s.+?schreef.*>:)(Op\s(.+?)schreef.*>:)'


def de_support(self):
    """ Sets the signature and quote-header patterns for German ("Am ... schrieb ...>:") """
    sent_from = self.words_map[self.language]['Sent from']
    self.SIG_REGEX = re.compile(
        r'(--|__|-\w)|(^' + sent_from + r'(\w+\s*){1,3})'
    )
    self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$')
    self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)'


def fr_support(self):
    """ Sets the signature and quote-header patterns for French ("Le ... a écrit ...:")

        The signature pattern is case-insensitive and additionally treats
        any line containing one of the listed sign-off words as a signature.
    """
    sent_from = self.words_map[self.language]['Sent from']
    self.SIG_REGEX = re.compile(
        r'(--|__|-\w)|(^' + sent_from
        + r'(\w+\s*){1,3})|(.*(cordialement|bonne r[ée]ception|salutations'
        r'|cdlt|cdt|crdt|regards|best regard|bonne journ[ée]e))',
        re.IGNORECASE
    )
    self.QUOTE_HDR_REGEX = re.compile('Le.*a écrit.*[> ]:$')
    self._MULTI_QUOTE_HDR_REGEX = r'(?!Le.*Le\s.+?a écrit[a-zA-Z0-9.:;<>()&@ -]*:)(Le\s(.+?)a écrit[a-zA-Z0-9.:;<>()&@ -]*:)'


def en_support(self):
    """ Sets the signature, quote-header and quote-marker patterns for English ("On ... wrote:") """
    self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from (\w+\s*){1,6})')
    self.QUOTE_HDR_REGEX = re.compile(r'\s*On.*wrote\s*:$')
    # NOTE(review): the second alternative is redundant as written, and
    # set_regex() calls default_quoted_header() afterwards, which replaces
    # QUOTED_REGEX - confirm which pattern is intended.
    self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)')
    self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote\s*:)(On\s(.+?)wrote\s*:)'


def es_support(self):
    """ Sets the signature and quote-header patterns for Spanish ("El ... escribió:") """
    self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Enviado desde (\w+\s*){1,6})')
    self.QUOTE_HDR_REGEX = re.compile(r'\s*El.*escribió\s*:$')
    self._MULTI_QUOTE_HDR_REGEX = r'(?!El.*El\s.+?escribió\s*:)(El\s(.+?)escribió\s*:)'


def ja_support(self):
    """ Sets the signature, quote-header and quote-marker patterns for Japanese

        The quote header is a "<year>年<month>月<day>日" date followed by any
        run of the listed CJK / ASCII characters and an optional colon.
    """
    self.SIG_REGEX = re.compile(r'--|__|-\w')
    self.QUOTE_HDR_REGEX = re.compile(
        r'[0-9]*年[0-9]*月[0-9]*日[\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\uFF00-\uFFEF\u4E00-\u9FAF\u2605-\u2606\u2190-\u2195\u203Ba-zA-Z0-9.:;<>()&@ -]*:?$'
    )
    self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)')
    # Dummy multi-line pattern (the English one): it doesn't work for
    # Japanese because BeautifulSoup inserts new lines before the ":" character.
    self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote\s*:)(On\s(.+?)wrote\s*:)'


def fi_support(self):
    """ Sets the signature, quote-header and quote-marker patterns for Finnish ("... kirjoitti ...:") """
    self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Lähetetty (\w+\s*){1,3})|(^Hanki Outlook for.*)')
    self.QUOTE_HDR_REGEX = re.compile('(.+?kirjoitti(.+?kello.+?)?:)')
    self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)')
    self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?kirjoitti.+?kirjoitti[a-zA-Z0-9.:;<>()&@ -]*:$)((.+?)kirjoitti[a-zA-Z0-9.:;<>()&@ -]*:$)'
+日期: 2020年3月6日 星期五 上午1:04 +收件人: ""zys@hotmail.com"" +主题: xyz.com - Order <111111111> has been shipped + +Shipping Confirmation + +Dear XYZ ZUS, + +Thank you for shopping at ZYS! + +Your order has been shipped. Your shipment details are shown below. + +Please note that it may take up to 24 hours for the shipping carrier to update the tracking information. + +If you have any questions you can contact us at + +ORDER + +SHIPPING diff --git a/test/emails/chinese2.txt b/test/emails/chinese2.txt new file mode 100644 index 0000000..fe6efa6 --- /dev/null +++ b/test/emails/chinese2.txt @@ -0,0 +1,18 @@ +Dear bbb +This is X. I just brought one sneaker and two slippers . Can I cancel my order please thanks + +XYZ Customer Care 于2020年3月9日 周一下午9:10写道: + +Thank you for shopping at SSENSE. Please allow us up to two business days to process your order. You’ll find a copy of your receipt and order information. + +MY ACCOUNT () CUSTOMER SUPPORT () + +Order Confirmation + +Dear XYZ XYZ, + +Thank you for shopping at ZYZ. Please allow us up to two business days to process your order. Once it’s been processed, you’ll receive a shipment confirmation email with your order’s tracking number. + +Below, you’ll find a copy of your receipt and order information. Please keep it for your records. + +\ diff --git a/test/emails/emails.json b/test/emails/emails.json new file mode 100644 index 0000000..fe51488 --- /dev/null +++ b/test/emails/emails.json @@ -0,0 +1 @@ +[] diff --git a/test/emails/forward.txt b/test/emails/forward.txt new file mode 100644 index 0000000..a715ce3 --- /dev/null +++ b/test/emails/forward.txt @@ -0,0 +1,6 @@ +FW: YYY Arrival Notice XYZ - YYY ELA/XYZ ETA: 2020-06-08 +This is a follow-up to your previous request #12345 "RE: XYZ and Manifest amendm..." +Hello team, +Can I get 1 Arrival Notice without PU# and invoice? +Thank you. 
+Best regards diff --git a/test/emails/multi_header.txt b/test/emails/multi_header.txt new file mode 100644 index 0000000..8eeded0 --- /dev/null +++ b/test/emails/multi_header.txt @@ -0,0 +1,41 @@ +No problem. I’ll just start a new order. + +On May 30, 2020, at 4:24 PM, XYZ wrote: + + +Hi XYZ, + +Unfortunately, we are unable to add items to your order, but if you would like we can cancel your order and issue a full refund so that you may order again with your preferred selection of pastries. This may result in a later delivery date, but please let us know if you would like us to cancel your order and we will set that up for you. + + + +On May 30, 2020, 4:15 PM XYZ xyz@xyz.com wrote: + +No worries and thank you. I wanted to add a couple of new items to the same shipment. Would that Be possible or should I just order thru the website? + +On May 30, 2020, at 3:54 PM, XYZ wrote: + + +Hi XYZ, + +We apologize for the incorrect product and for any inconvenience this may have caused. + +We have placed a replacement order of 1 X for delivery on June 4, 2020. You will be receiving an email confirmation for this new order and your tracking number will be emailed 1-2 nights before the delivery date. + +Thank you for your patience! + + + +On May 30, 2020, 10:01 AM XYZ xyz@xyz.com wrote: + +Hello XYZ, + +Thank you for emailing the Team! + +We are writing to you to confirm that we have received your email. + +We apologize for any inconvenience and assure you that we will find a solution for any question, concern, or comment you may have. + +We appreciate your patience during these times. 
def _reply_from_html(parser, html):
    """Convert *html* to text (one text node per line) and return its reply part."""
    plain = BeautifulSoup(html, 'lxml').getText('\n')
    return parser.parse_reply(plain)


def profile():
    """Print timings for parser construction, parsing and HTML stripping.

    Reads 'test.csv' from the working directory and uses the longest entry
    of its ``content`` column as the input.
    """
    df = pd.read_csv('test.csv')
    ground = time.time()
    # Longest message. Replaces np.argmax(...), which raised NameError because
    # the numpy import is commented out; max() also returns the first maximum.
    content = max(df.content.values, key=len)
    start = time.time()
    parser = EmailReplyParser(language='fr')
    print(str(time.time() - start) + 'init parser')
    start = time.time()
    res = parser.parse_reply(content)
    print(str(time.time() - start) + 'parse')
    start = time.time()
    soup = BeautifulSoup(res, 'lxml')
    soup.getText(' ')  # result unused; the call is the work being timed
    print(str(time.time() - start) + 'soup')
    print(f'Total time: {time.time() - ground}')


def verify():
    """Print the Finnish-parser reply for every string in test/emails/emails.json."""
    parser = EmailReplyParser(language='fi')
    # Context manager so the handle is closed; JSON text is UTF-8.
    with open('test/emails/emails.json', encoding='utf-8') as fl:
        texts = [entry for entry in json.load(fl) if isinstance(entry, str)]
    for text in texts:
        print('-' * 100)
        print(_reply_from_html(parser, text))


def parse_df():
    """Add a ``clean`` column with parsed replies to zipwrotetest.csv, then open a REPL.

    The CSV is rewritten in place before the interactive console starts.
    """
    parser = EmailReplyParser(language='en')
    path = 'test/emails/zipwrotetest.csv'
    df = pd.read_csv(path)
    parsed = [_reply_from_html(parser, text) for text in df.sentence.values]
    df = df.assign(clean=parsed)
    df.to_csv(path)
    import code
    code.interact(local=locals())


def parse_json():
    """Parse every message in english.json and open a REPL to inspect ``parsed``."""
    parser = EmailReplyParser(language='en')
    with open('english.json', 'rb') as fl:
        messages = json.load(fl)
    parsed = [_reply_from_html(parser, text) for text in messages]
    import code
    code.interact(local=locals())


def parse_text():
    """Print the reply part of the plain-text fixture test/emails/caution.txt."""
    parser = EmailReplyParser(language='en')
    # Fixtures in this repository include non-ASCII text (e.g. the Chinese
    # samples), so read them as UTF-8 rather than the locale encoding.
    with open('test/emails/caution.txt', 'r', encoding='utf-8') as fl:
        message = fl.read()
    print(parser.parse_reply(message))


if __name__ == '__main__':
    parse_text()