From b919419a12931f3ed46651e2910b2cf8c9bcce55 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 21 Feb 2019 12:42:47 +0000 Subject: [PATCH 01/58] language support --- email_reply_parser/__init__.py | 85 +++++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 17 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 063f65b..cb0f1bf 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -1,54 +1,105 @@ """ email_reply_parser is a python library port of GitHub's Email Reply Parser. - For more information, visit https://github.com/zapier/email-reply-parser + For more information, visit https://github.com/zapier/email_reply_parser """ import re +import json class EmailReplyParser(object): """ Represents a email message that is parsed. """ + def __init__(self, language='en'): + self.language = language - @staticmethod - def read(text): + def read(self, text): """ Factory method that splits email into list of fragments text - A string email body Returns an EmailMessage instance """ - return EmailMessage(text).read() + return EmailMessage(text, self.language).read() - @staticmethod - def parse_reply(text): + def parse_reply(self, text): """ Provides the reply portion of email. text - A string email body Returns reply body message """ - return EmailReplyParser.read(text).reply + return self.read(text).reply class EmailMessage(object): """ An email message represents a parsed email body. """ - - SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') - QUOTE_HDR_REGEX = re.compile('On.*wrote:$') - QUOTED_REGEX = re.compile(r'(>+)') - HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+') - _MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' - MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE) - MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL) - - def __init__(self, text): + def __init__(self, text, language): self.fragments = [] self.fragment = None self.text = text.replace('\r\n', '\n') self.found_visible = False + self.SIG_REGEX = None + self.QUOTE_HDR_REGEX = None + self.QUOTED_REGEX = None + self.HEADER_REGEX = None + self._MULTI_QUOTE_HDR_REGEX = None + self.MULTI_QUOTE_HDR_REGEX = None + self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None + with open("languages_support.json", "r") as read_file: + self.words_diff_languages = json.load(read_file) + self.language = language + self.set_regex() + + def default_quoted_header(self): + self.QUOTED_REGEX = re.compile(r'(>+)') + self.HEADER_REGEX = re.compile( + r'^\*?(' + self.words_diff_languages[self.language]['From'] + + '|' + self.words_diff_languages[self.language]['Sent'] + + '|' + self.words_diff_languages[self.language]['To'] + + '|' + self.words_diff_languages[self.language]['Subject'] + + '):\*? .+' + ) + + def nl_support(self): + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})') + self.QUOTE_HDR_REGEX = re.compile('Op.*schreef.*>:$') + self.default_quoted_header() + self._MULTI_QUOTE_HDR_REGEX = r'(?!Op.*Op\s.+?schreef.*>:)(Op\s(.+?)schreef.*>:)' + + def de_support(self): + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})') + self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$') + self.QUOTED_REGEX = re.compile(r'(>+)') + self.HEADER_REGEX = re.compile( + r'^\*?(' + self.words_diff_languages[self.language]['From'] + + '|' + self.words_diff_languages[self.language]['Sent'] + + '|' + self.words_diff_languages[self.language]['To'] + + '|' + self.words_diff_languages[self.language]['Subject'] + + '):\*? .+' + ) + self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)' + + def en_support(self): + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') + self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$') + self.QUOTED_REGEX = re.compile(r'(>+)') + self.HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+') + self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' + + def set_regex(self): + if hasattr(self, self.language+"_support"): + getattr(self, self.language+"_support")() + else: + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})') + self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_diff_languages[self.language]['wrote'] + ':$') + self.default_quoted_header() + self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_diff_languages[self.language]['wrote'] + \ + ':)(On\s(.+?)' + self.words_diff_languages[self.language]['wrote'] + ':)' + self.MULTI_QUOTE_HDR_REGEX = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE) + self.MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL) def read(self): """ Creates new fragment for each line From 2517a4166f4de5d01a4e193dc558174c6b42fc26 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 21 Feb 2019 15:07:49 +0000 Subject: [PATCH 02/58] json conf --- support/languages_support.json | 162 +++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 support/languages_support.json diff --git a/support/languages_support.json b/support/languages_support.json new file mode 100644 index 0000000..aeedb2a --- /dev/null +++ b/support/languages_support.json @@ -0,0 +1,162 @@ +{ + "vi": { + "Sent from": "\u0110\u01b0\u1ee3c g\u1eedi t\u1eeb", + "From": "T\u1eeb", + "To": "\u0110\u1ebfn", + "wrote": "\u0111\u00e3 vi\u1ebft", + "Sent": "G\u1edfi", + "Subject": "M\u00f4n h\u1ecdc" + }, + "ru": { + "Sent from": "\u041e\u0442\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043e \u0438\u0437", + "From": "\u041e\u0442", + "To": "\u043a", + "wrote": "\u043f\u0438\u0441\u0430\u043b", + "Sent": "\u041e\u0442\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043e", + "Subject": "\u041f\u0440\u0435\u0434\u043c\u0435\u0442" + }, + "fr": { + "Sent from": "Envoy\u00e9 depuis", + "From": "De", + "To": "\u00c0", + "wrote": "a \u00e9crit", + "Sent": "Envoy\u00e9", + "Subject": "Objet" + }, + "en": { + "Sent from": "Sent from", + "From": "From", + "To": "To", + "wrote": "wrote", + "Sent": "Sent", + "Subject": "Subject" + }, + "nl": { + "Sent from": "Verzonden met", + "From": "Van", + "To": "Aan", + "wrote": "schreef", + "Sent": "Verzonden", + "Subject": "Onderwerp" + }, + "pt": { + "Sent from": "Enviado de", + "From": "De", + "To": "Para", + "wrote": "escrevi", + "Sent": "Enviei", + "Subject": "Sujeito" + }, + "ko": { + "Sent from": "\ubd80\ud130 \ubcf4\ub0b4\uc9c4", + "From": "\uc5d0\uc11c", + "To": "\uc5d0", + "wrote": "\uc4f4", + "Sent": "\uc804\uc1a1 \ub428", + "Subject": "\uc81c\ubaa9" + }, + "de": { + "Sent from": "Gesendet von", + "From": "Von", + "To": "An", + "wrote": "schrieb", + "Sent": "geschickt", + "Subject": "Betreff" + }, + "tr": { + "Sent from": "Den g\u00f6nderildi", + "From": "itibaren", + "To": "i\u00e7in", + "wrote": "yazd\u0131", + "Sent": "G\u00f6nderilen", + "Subject": "konu" + }, + "it": { + "Sent from": "Inviato da", + "From": "Da", + "To": "A", + "wrote": "ha scritto", + "Sent": "Inviato", + "Subject": "Oggetto" + }, + "id": { + "Sent from": "Dikirim dari", + "From": "Dari", + "To": "Untuk", + "wrote": "menulis", + "Sent": "Terkirim", + "Subject": "Subyek" + }, + "sk": { + "Sent from": "Odoslan\u00e9 od", + "From": "z", + "To": "na", + "wrote": "nap\u00edsal", + "Sent": "odoslan\u00e9", + "Subject": "predmet" + }, + "ar": { + "Sent from": "\u0627\u0631\u0633\u0644\u062a \u0645\u0646", + "From": "\u0645\u0646 \u0639\u0646\u062f", + "To": "\u0625\u0644\u0649", + "wrote": "\u0643\u062a\u0628", + "Sent": "\u0623\u0631\u0633\u0644\u062a", + "Subject": "\u0645\u0648\u0636\u0648\u0639" + }, + "es": { + "Sent from": "Enviado desde", + "From": "De", + "To": "Para", + "wrote": "escribi\u00f3", + "Sent": "Expedido", + "Subject": "Asunto" + }, + "th": { + "Sent from": "\u0e2a\u0e48\u0e07\u0e08\u0e32\u0e01", + "From": "\u0e08\u0e32\u0e01", + "To": "\u0e44\u0e1b\u0e22\u0e31\u0e07", + "wrote": "\u0e40\u0e02\u0e35\u0e22\u0e19", + "Sent": "\u0e2a\u0e48\u0e07", + "Subject": "\u0e40\u0e23\u0e37\u0e48\u0e2d\u0e07" + }, + "fi": { + "Sent from": "L\u00e4hetetty", + "From": "alkaen", + "To": "jotta", + "wrote": "kirjoitti", + "Sent": "L\u00e4hetetyt", + "Subject": "aihe" + }, + "zh": { + "Sent from": "\u6765\u81ea", + "From": "\u4ece", + "To": "\u81f3", + "wrote": "\u5199", + "Sent": "\u53d1\u9001", + "Subject": "\u5b66\u79d1" + }, + "ja": { + "Sent from": "\u9001\u4fe1\u5143", + "From": "\u304b\u3089", + "To": "\u306b", + "wrote": "\u66f8\u304d\u307e\u3057\u305f", + "Sent": "\u9001\u4fe1\u6e08\u307f", + "Subject": "\u4ef6\u540d" + }, + "pl": { + "Sent from": "Wys\u0142ane z", + "From": "Z", + "To": "Do", + "wrote": "napisa\u0142", + "Sent": "Wys\u0142ane", + "Subject": "Przedmiot" + }, + "he": { + "Sent from": "\u05e0\u05e9\u05dc\u05d7 \u05de", + "From": "\u05de", + "To": "\u05dc", + "wrote": "\u05db\u05ea\u05d1\u05ea\u05d9", + "Sent": "\u05e0\u05e9\u05dc\u05d7", + "Subject": "\u05e0\u05d5\u05e9\u05d0" + } +} \ No newline at end of file From da6ea80619004f514d7b764260b34a4f875a9880 Mon Sep 17 00:00:00 2001 From: atc0m Date: Wed, 6 Mar 2019 12:19:17 +0000 Subject: [PATCH 03/58] languages support path --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index cb0f1bf..745c5f4 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -48,7 +48,7 @@ def __init__(self, text, language): self._MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None - with open("languages_support.json", "r") as read_file: + with open("../support/languages_support.json", "r") as read_file: self.words_diff_languages = json.load(read_file) self.language = language self.set_regex() From 38da7751d454781e5ef220b58a7878fe53f89209 Mon Sep 17 00:00:00 2001 From: atc0m Date: Wed, 6 Mar 2019 12:26:14 +0000 Subject: [PATCH 04/58] move json into parser module --- email_reply_parser/__init__.py | 2 +- {support => email_reply_parser}/languages_support.json | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename {support => email_reply_parser}/languages_support.json (100%) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 745c5f4..cb0f1bf 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -48,7 +48,7 @@ def __init__(self, text, language): self._MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None - with open("../support/languages_support.json", "r") as read_file: + with open("languages_support.json", "r") as read_file: self.words_diff_languages = json.load(read_file) self.language = language self.set_regex() diff --git a/support/languages_support.json b/email_reply_parser/languages_support.json similarity index 100% rename from support/languages_support.json rename to email_reply_parser/languages_support.json From 929e78ccf6a306dd6035dc96245948da06853c20 Mon Sep 17 00:00:00 2001 From: atc0m Date: Wed, 6 Mar 2019 12:30:19 +0000 Subject: [PATCH 05/58] dir_path --- email_reply_parser/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index cb0f1bf..a15a31e 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -48,7 +48,8 @@ def __init__(self, text, language): self._MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None - with open("languages_support.json", "r") as read_file: + dir_path = os.path.dirname(__file__) + with open(dir_path + "/languages_support.json", "r") as read_file: self.words_diff_languages = json.load(read_file) self.language = language self.set_regex() From aa5b02bdbd0365c714f0364c9c37aae7b2bd3d94 Mon Sep 17 00:00:00 2001 From: atc0m Date: Wed, 6 Mar 2019 12:31:39 +0000 Subject: [PATCH 06/58] os --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index a15a31e..9e50417 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -3,7 +3,7 @@ For more information, visit https://github.com/zapier/email_reply_parser """ - +import os import re import json From 7fdecf3227897fa1c665c2581cece17c14c832ba Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 29 Mar 2019 11:32:19 +0000 Subject: [PATCH 07/58] build with language support --- setup.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 5d3078e..6f25115 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,10 @@ version=version.VERSION, description='Email reply parser', packages=['email_reply_parser'], - package_data={'email_reply_parser': ['../VERSION']}, + package_data={ + 'email_reply_parser': ['../VERSION'], + '': ['./languages_support.json'] + }, author='Royce Haynes', author_email='royce.haynes@gmail.com', url='https://github.com/zapier/email-reply-parser', @@ -32,4 +35,4 @@ "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", ] -) \ No newline at end of file +) From ffca4537983509d9235fc9f50dcb7737fa984476 Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 29 Mar 2019 14:36:32 +0000 Subject: [PATCH 08/58] load json once --- email_reply_parser/__init__.py | 37 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 9e50417..51d8829 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -12,6 +12,8 @@ class EmailReplyParser(object): """ Represents a email message that is parsed. """ def __init__(self, language='en'): + with open(dir_path + "/languages_support.json", "r") as read_file: + self.words_map = json.load(read_file) self.language = language def read(self, text): @@ -21,7 +23,7 @@ def read(self, text): Returns an EmailMessage instance """ - return EmailMessage(text, self.language).read() + return EmailMessage(text, self.language, self.words_map).read() def parse_reply(self, text): """ Provides the reply portion of email. @@ -36,7 +38,7 @@ def parse_reply(self, text): class EmailMessage(object): """ An email message represents a parsed email body. """ - def __init__(self, text, language): + def __init__(self, text, language, words_map): self.fragments = [] self.fragment = None self.text = text.replace('\r\n', '\n') @@ -49,36 +51,35 @@ def __init__(self, text, language): self.MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None dir_path = os.path.dirname(__file__) - with open(dir_path + "/languages_support.json", "r") as read_file: - self.words_diff_languages = json.load(read_file) + self.words_map = words_map self.language = language self.set_regex() def default_quoted_header(self): self.QUOTED_REGEX = re.compile(r'(>+)') self.HEADER_REGEX = re.compile( - r'^\*?(' + self.words_diff_languages[self.language]['From'] + - '|' + self.words_diff_languages[self.language]['Sent'] + - '|' + self.words_diff_languages[self.language]['To'] + - '|' + self.words_diff_languages[self.language]['Subject'] + + r'^\*?(' + self.words_map[self.language]['From'] + + '|' + self.words_map[self.language]['Sent'] + + '|' + self.words_map[self.language]['To'] + + '|' + self.words_map[self.language]['Subject'] + '):\*? .+' ) def nl_support(self): - self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})') + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('Op.*schreef.*>:$') self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!Op.*Op\s.+?schreef.*>:)(Op\s(.+?)schreef.*>:)' def de_support(self): - self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})') + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$') self.QUOTED_REGEX = re.compile(r'(>+)') self.HEADER_REGEX = re.compile( - r'^\*?(' + self.words_diff_languages[self.language]['From'] + - '|' + self.words_diff_languages[self.language]['Sent'] + - '|' + self.words_diff_languages[self.language]['To'] + - '|' + self.words_diff_languages[self.language]['Subject'] + + r'^\*?(' + self.words_map[self.language]['From'] + + '|' + self.words_map[self.language]['Sent'] + + '|' + self.words_map[self.language]['To'] + + '|' + self.words_map[self.language]['Subject'] + '):\*? .+' ) self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)' @@ -94,11 +95,11 @@ def set_regex(self): if hasattr(self, self.language+"_support"): getattr(self, self.language+"_support")() else: - self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})') - self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_diff_languages[self.language]['wrote'] + ':$') + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') + self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_map[self.language]['wrote'] + ':$') self.default_quoted_header() - self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_diff_languages[self.language]['wrote'] + \ - ':)(On\s(.+?)' + self.words_diff_languages[self.language]['wrote'] + ':)' + self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_map[self.language]['wrote'] + \ + ':)(On\s(.+?)' + self.words_map[self.language]['wrote'] + ':)' self.MULTI_QUOTE_HDR_REGEX = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE) self.MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL) From 1eb7e86d615f4e45584a80e36da70edc61e012df Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 29 Mar 2019 14:46:00 +0000 Subject: [PATCH 09/58] mailto --- email_reply_parser/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 51d8829..dcfda94 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -80,6 +80,7 @@ def de_support(self): '|' + self.words_map[self.language]['Sent'] + '|' + self.words_map[self.language]['To'] + '|' + self.words_map[self.language]['Subject'] + + '|' + 'mailto' '):\*? .+' ) self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)' @@ -88,7 +89,7 @@ def en_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$') self.QUOTED_REGEX = re.compile(r'(>+)') - self.HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+') + self.HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject|mailto):\*? .+') self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' def set_regex(self): From 0d5085911c0be9c13a3037b889290212bbcf3a50 Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 29 Mar 2019 14:50:50 +0000 Subject: [PATCH 10/58] dir path --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index dcfda94..3663761 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -12,6 +12,7 @@ class EmailReplyParser(object): """ Represents a email message that is parsed. """ def __init__(self, language='en'): + dir_path = os.path.dirname(__file__) with open(dir_path + "/languages_support.json", "r") as read_file: self.words_map = json.load(read_file) self.language = language @@ -50,7 +51,6 @@ def __init__(self, text, language, words_map): self._MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None - dir_path = os.path.dirname(__file__) self.words_map = words_map self.language = language self.set_regex() From ea11d23299c431c0c113f02729fff35db9205e85 Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 29 Mar 2019 15:06:46 +0000 Subject: [PATCH 11/58] mailto --- email_reply_parser/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 3663761..f2b671a 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -80,8 +80,7 @@ def de_support(self): '|' + self.words_map[self.language]['Sent'] + '|' + self.words_map[self.language]['To'] + '|' + self.words_map[self.language]['Subject'] + - '|' + 'mailto' - '):\*? .+' + '):\*? .+)|mailto:.+' ) self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)' From 83b2361425b4795cd1eb9280b50fe8cdca576038 Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 29 Mar 2019 16:19:45 +0000 Subject: [PATCH 12/58] mailto --- email_reply_parser/__init__.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index f2b671a..a4e9aef 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -62,7 +62,7 @@ def default_quoted_header(self): '|' + self.words_map[self.language]['Sent'] + '|' + self.words_map[self.language]['To'] + '|' + self.words_map[self.language]['Subject'] + - '):\*? .+' + '):\*? .+|.+(mailto:).+' ) def nl_support(self): @@ -74,21 +74,13 @@ def nl_support(self): def de_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$') - self.QUOTED_REGEX = re.compile(r'(>+)') - self.HEADER_REGEX = re.compile( - r'^\*?(' + self.words_map[self.language]['From'] + - '|' + self.words_map[self.language]['Sent'] + - '|' + self.words_map[self.language]['To'] + - '|' + self.words_map[self.language]['Subject'] + - '):\*? .+)|mailto:.+' - ) self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)' def en_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$') self.QUOTED_REGEX = re.compile(r'(>+)') - self.HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject|mailto):\*? .+') + self.HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+|.+(mailto:).+') self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' def set_regex(self): From 6393dfe7199cec2bbb737abb7ba221128a405f54 Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 29 Mar 2019 16:21:24 +0000 Subject: [PATCH 13/58] refactor --- email_reply_parser/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index a4e9aef..ddd2bbb 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -74,13 +74,14 @@ def nl_support(self): def de_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$') + self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)' def en_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$') self.QUOTED_REGEX = re.compile(r'(>+)') - self.HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+|.+(mailto:).+') + self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' def set_regex(self): From ba88be340d9e1ce8a6709b5c5b32e00074afba5e Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 2 May 2019 02:23:17 +0100 Subject: [PATCH 14/58] french support --- .gitignore | 5 ++++- email_reply_parser/__init__.py | 11 ++++++++++- test/performance.py | 25 +++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 test/performance.py diff --git a/.gitignore b/.gitignore index 71f7c64..2fd0234 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ tests/.DS_Store .DS_Store *.egg-info .project +env/ +venv/ dist/ dist/* - +*.csv +__pycache__/ diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index ddd2bbb..91782bd 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -66,7 +66,7 @@ def default_quoted_header(self): ) def nl_support(self): - self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Verstuurd vanaf'] + '(\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('Op.*schreef.*>:$') self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!Op.*Op\s.+?schreef.*>:)(Op\s(.+?)schreef.*>:)' @@ -77,6 +77,15 @@ def de_support(self): self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)' + def fr_support(self): + self.SIG_REGEX = re.compile( + r'(--|__|-\w)|(^' + self.words_map[self.language]['Envoy\u00e9 depuis'] \ + + '(\w+\s*){1,3})|(Cordialement)' + ) + self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$') + self.default_quoted_header() + self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)' + def en_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$') diff --git a/test/performance.py b/test/performance.py new file mode 100644 index 0000000..75b6874 --- /dev/null +++ b/test/performance.py @@ -0,0 +1,25 @@ +import pandas as pd +import numpy as np +import time +# from bs4 import BeautifulSoup # requires lxml +from email_reply_parser import EmailReplyParser + + +def profile(): + df = pd.DataFrame.from_csv('test.csv') + ground = time.time() + content = df.content.values[np.argmax([len(d) for d in df.content.values])] + start = time.time() + parser = EmailReplyParser(language='fr') + print(str(time.time() - start) + 'init parser') + start = time.time() + res = parser.parse_reply(content) + print(str(time.time() - start) + 'parse') + start = time.time() + soup = BeautifulSoup(res, 'lxml') + text = soup.getText(' ') + print(str(time.time() - start) + 'soup') + print(f'Total time: {time.time() - ground}') + +if __name__ == '__main__': + profile() From 6227114c1e3c33f13f542c19aa6060ac82b9d6ac Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 2 May 2019 02:27:14 +0100 Subject: [PATCH 15/58] fix key --- email_reply_parser/__init__.py | 4 ++-- email_reply_parser/languages_support.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 91782bd..03aa65a 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -66,7 +66,7 @@ def default_quoted_header(self): ) def nl_support(self): - self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Verstuurd vanaf'] + '(\w+\s*){1,3})') + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('Op.*schreef.*>:$') self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!Op.*Op\s.+?schreef.*>:)(Op\s(.+?)schreef.*>:)' @@ -79,7 +79,7 @@ def de_support(self): def fr_support(self): self.SIG_REGEX = re.compile( - r'(--|__|-\w)|(^' + self.words_map[self.language]['Envoy\u00e9 depuis'] \ + r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] \ + '(\w+\s*){1,3})|(Cordialement)' ) self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$') diff --git a/email_reply_parser/languages_support.json b/email_reply_parser/languages_support.json index aeedb2a..8d46c6a 100644 --- a/email_reply_parser/languages_support.json +++ b/email_reply_parser/languages_support.json @@ -32,7 +32,7 @@ "Subject": "Subject" }, "nl": { - "Sent from": "Verzonden met", + "Sent from": "Verstuurd vanaf", "From": "Van", "To": "Aan", "wrote": "schreef", @@ -159,4 +159,4 @@ "Sent": "\u05e0\u05e9\u05dc\u05d7", "Subject": "\u05e0\u05d5\u05e9\u05d0" } -} \ No newline at end of file +} From 8d3bf59ba974804870c5191db0a3cf0ff9852885 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 2 May 2019 03:15:20 +0100 Subject: [PATCH 16/58] cordialement regex --- email_reply_parser/__init__.py | 6 +++--- test/emails/emails.json | 4 ++++ test/performance.py | 19 ++++++++++++++++--- 3 files changed, 23 insertions(+), 6 deletions(-) create mode 100644 test/emails/emails.json diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 03aa65a..ee0cb7e 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -80,7 +80,7 @@ def de_support(self): def fr_support(self): self.SIG_REGEX = re.compile( r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] \ - + '(\w+\s*){1,3})|(Cordialement)' + + '(\w+\s*){1,3})|(.*[Cc]ordialement)' ) self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$') self.default_quoted_header() @@ -153,8 +153,8 @@ def _scan_line(self, line): is_quoted = self.QUOTED_REGEX.match(line) is not None is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None - if self.fragment and len(line.strip()) == 0: - if self.SIG_REGEX.match(self.fragment.lines[-1].strip()): + if self.fragment: + if self.SIG_REGEX.match(line.strip()): self.fragment.signature = True self._finish_fragment() diff --git a/test/emails/emails.json b/test/emails/emails.json new file mode 100644 index 0000000..13ecf83 --- /dev/null +++ b/test/emails/emails.json @@ -0,0 +1,4 @@ +[ + "
\n

 

\n

 

\n

Bonjour,

\n

 

\n

Merci de trouver, ci-joint, une nouvelle commande.

\n

 

\n

INTERLOCUTEUR : FABRICE GAUDIN

\n

 

\n

Merci d’accuser réception pour la commande référencée.

\n

 

\n

 

\n

Bien cordialement,

\n

 

\n

Person PERSON

\n

Acheteuse

\n

Direct : +111 11 11 11 11

\n

person.person@person.com\n

\n

 

\n

Bonjour - Lifting Businesses
www.konecranes.fr

\n

 

\n

Suivez-nous sur les réseaux sociaux :

\n

\"Twitter\"\"linkedIN\"\"Youtube\"\"Cropped

\n

 

\n

Konecranes France

\n

Route de Paris

\n

Bâtiment B

\n

28100 DREUX

\n

FRANCE

\n

 

\n

 

\n
\n
______________________________________________________________________
This email has been scanned by the Symantec Email Security.cloud service.
______________________________________________________________________
", + "Merci d'imprimer l'Appel d'Offre ci-joint et de le renvoyer signé dans l\n es 3 jours à votre correspondant achats (hors consultation achats Prototype).\n\n Cordialement\n -- Disclaimer ------------------------------------\n Ce message ainsi que les eventuelles pieces jointes constituent une correspondance privee et confidentielle a l'attention exclusive du destinataire designe ci-dessus. Si vous n'etes pas le destinataire du present message ou une personne susceptible de pouvoir le lui delivrer, il vous est signifie que toute divulgation, distribution ou copie de cette transmission est strictement interdite. Si vous avez recu ce message par erreur, nous vous remercions d'en informer l'expediteur par telephone ou de lui retourner le present message, puis d'effacer immediatement ce message de votre systeme.\n\n *** This e-mail and any attachments is a confidential correspondence intended only for use of the individual or entity named above. If you are not the intended recipient or the agent responsible for delivering the message to the intended recipient, you are hereby notified that any disclosure, distribution or copying of this communication is strictly prohibited. If you have received this communication in error, please notify the sender by phone or by replying this message, and then delete this message from your system.\n\n ______________________________________________________________________\n This email has been scanned by the System Email Security.cloud service.\n ______________________________________________________________________" +] diff --git a/test/performance.py b/test/performance.py index 75b6874..a325ceb 100644 --- a/test/performance.py +++ b/test/performance.py @@ -1,10 +1,10 @@ import pandas as pd import numpy as np +import json import time -# from bs4 import BeautifulSoup # requires lxml +from bs4 import BeautifulSoup # requires lxml from email_reply_parser import EmailReplyParser - def profile(): df = pd.DataFrame.from_csv('test.csv') ground = time.time() @@ -21,5 +21,18 @@ def profile(): print(str(time.time() - start) + 'soup') print(f'Total time: {time.time() - ground}') +def verify(): + parser = EmailReplyParser(language='fr') + texts = json.load(open('test/emails/emails.json')) + for text in texts: + # print(text) + soup = BeautifulSoup(text, 'lxml') + text = soup.getText(' ') + text = parser.parse_reply(text) + print(text) + + # print(text) + if __name__ == '__main__': - profile() + # profile() + verify() From c9b1fc7b2baede7ddde12d9ec88883d727bcc65f Mon Sep 17 00:00:00 2001 From: atc0m Date: Mon, 13 May 2019 16:02:15 +0100 Subject: [PATCH 17/58] french signatures --- .gitignore | 1 + email_reply_parser/__init__.py | 26 +++++++++++--------------- test/performance.py | 13 +++++++------ 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 2fd0234..1e9728d 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ dist/ dist/* *.csv __pycache__/ +customer_emails.json diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index ee0cb7e..fffc991 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -1,6 +1,5 @@ """ email_reply_parser is a python library port of GitHub's Email Reply Parser. - For more information, visit https://github.com/zapier/email_reply_parser """ import os @@ -15,25 +14,24 @@ def __init__(self, language='en'): dir_path = os.path.dirname(__file__) with open(dir_path + "/languages_support.json", "r") as read_file: self.words_map = json.load(read_file) - self.language = language + if language in self.words_map: + self.language = language + else: + self.language = 'en' def read(self, text): """ Factory method that splits email into list of fragments - text - A string email body - Returns an EmailMessage instance """ return EmailMessage(text, self.language, self.words_map).read() def parse_reply(self, text): """ Provides the reply portion of email. - text - A string email body - Returns reply body message """ - return self.read(text).reply + return self.read(text.replace('\xa0', ' ')).reply class EmailMessage(object): @@ -62,7 +60,7 @@ def default_quoted_header(self): '|' + self.words_map[self.language]['Sent'] + '|' + self.words_map[self.language]['To'] + '|' + self.words_map[self.language]['Subject'] + - '):\*? .+|.+(mailto:).+' + ')\s*:\*? .+|.+(mailto:).+' ) def nl_support(self): @@ -80,11 +78,13 @@ def de_support(self): def fr_support(self): self.SIG_REGEX = re.compile( r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] \ - + '(\w+\s*){1,3})|(.*[Cc]ordialement)' + + '(\w+\s*){1,3})|(.*(cordialement|bonne r[ée]ception|salutations|cdlt|cdt|crdt|regards|best regard|' + 'bonne journ[ée]e))', + re.IGNORECASE ) - self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$') + self.QUOTE_HDR_REGEX = re.compile('Le.*a écrit.*>:$') self.default_quoted_header() - self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)' + self._MULTI_QUOTE_HDR_REGEX = r'(?!Le.*Le\s.+?a écrit.*>:)(Le\s(.+?)a écrit.*>:)' def en_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') @@ -108,7 +108,6 @@ def set_regex(self): def read(self): """ Creates new fragment for each line and labels as a signature, quote, or hidden. - Returns EmailMessage instance """ @@ -146,7 +145,6 @@ def reply(self): def _scan_line(self, line): """ Reviews each line in email message and determines fragment type - line - a row of text from an email message """ is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None @@ -169,9 +167,7 @@ def _scan_line(self, line): def quote_header(self, line): """ Determines whether line is part of a quoted area - line - a row of the email message - Returns True or False """ return self.QUOTE_HDR_REGEX.match(line[::-1]) is not None diff --git a/test/performance.py b/test/performance.py index a325ceb..ae722be 100644 --- a/test/performance.py +++ b/test/performance.py @@ -1,5 +1,5 @@ -import pandas as pd -import numpy as np +# import pandas as pd +# import numpy as np import json import time from bs4 import BeautifulSoup # requires lxml @@ -24,14 +24,15 @@ def profile(): def verify(): parser = EmailReplyParser(language='fr') texts = json.load(open('test/emails/emails.json')) + texts = list(filter(lambda d: type(d) == str, texts)) + parsed = [] for text in texts: # print(text) soup = BeautifulSoup(text, 'lxml') text = soup.getText(' ') - text = parser.parse_reply(text) - print(text) - - # print(text) + parsed.append(text) + #text = parser.parse_reply(text) + #print(text) if __name__ == '__main__': # profile() From d1915e64ae6c6aaec7ad32e9dcf06bd0082fec1e Mon Sep 17 00:00:00 2001 From: atc0m Date: Tue, 14 May 2019 00:41:39 +0100 Subject: [PATCH 18/58] ignore empty lines --- email_reply_parser/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index fffc991..bef617a 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -24,14 +24,14 @@ def read(self, text): text - A string email body Returns an EmailMessage instance """ - return EmailMessage(text, self.language, self.words_map).read() + return EmailMessage(text.replace('\xa0', ' '), self.language, self.words_map).read() def parse_reply(self, text): """ Provides the reply portion of email. text - A string email body Returns reply body message """ - return self.read(text.replace('\xa0', ' ')).reply + return self.read(text).reply class EmailMessage(object): @@ -125,7 +125,8 @@ def read(self): self.lines.reverse() for line in self.lines: - self._scan_line(line) + if line.strip(): + self._scan_line(line) self._finish_fragment() From 7b28cc1c7e331012132d2cf170971f8f5c2a9c5e Mon Sep 17 00:00:00 2001 From: atc0m Date: Wed, 4 Sep 2019 21:14:38 +0100 Subject: [PATCH 19/58] correct translation --- email_reply_parser/__init__.py | 14 +++++++++----- email_reply_parser/languages_support.json | 8 ++++---- test/emails/emails.json | 2 -- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index bef617a..f078904 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -51,6 +51,7 @@ def __init__(self, text, language, words_map): self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None self.words_map = words_map self.language = language + self.default_language = 'en' self.set_regex() def default_quoted_header(self): @@ -66,13 +67,11 @@ def default_quoted_header(self): def nl_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('Op.*schreef.*>:$') - self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!Op.*Op\s.+?schreef.*>:)(Op\s(.+?)schreef.*>:)' def de_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$') - self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)' def fr_support(self): @@ -83,21 +82,26 @@ def fr_support(self): re.IGNORECASE ) self.QUOTE_HDR_REGEX = re.compile('Le.*a écrit.*>:$') - self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!Le.*Le\s.+?a écrit.*>:)(Le\s(.+?)a écrit.*>:)' def en_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$') self.QUOTED_REGEX = re.compile(r'(>+)') - self.default_quoted_header() + self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' + + def fi_support(self): + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Lähetetty (\w+\s*){1,3})') + self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$') + self.QUOTED_REGEX = re.compile(r'(>+)') self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' def set_regex(self): if hasattr(self, self.language+"_support"): getattr(self, self.language+"_support")() + self.default_quoted_header() else: - self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^(' + self.words_map[self.language]['Sent from'] + '|' self.words_map[self.default_language]['Sent from'] + ')(\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_map[self.language]['wrote'] + ':$') self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_map[self.language]['wrote'] + \ diff --git a/email_reply_parser/languages_support.json b/email_reply_parser/languages_support.json index 8d46c6a..11fbaf4 100644 --- a/email_reply_parser/languages_support.json +++ b/email_reply_parser/languages_support.json @@ -121,11 +121,11 @@ }, "fi": { "Sent from": "L\u00e4hetetty", - "From": "alkaen", - "To": "jotta", + "From": "L\u00e4hett\u00e4j\u00e4", + "To": "Vastaanottaja", "wrote": "kirjoitti", - "Sent": "L\u00e4hetetyt", - "Subject": "aihe" + "Sent": "L\u00e4hetetty", + "Subject": "Aihe" }, "zh": { "Sent from": "\u6765\u81ea", diff --git a/test/emails/emails.json b/test/emails/emails.json index 13ecf83..0d4f101 100644 --- a/test/emails/emails.json +++ b/test/emails/emails.json @@ -1,4 +1,2 @@ [ - "
\n

 

\n

 

\n

Bonjour,

\n

 

\n

Merci de trouver, ci-joint, une nouvelle commande.

\n

 

\n

INTERLOCUTEUR : FABRICE GAUDIN

\n

 

\n

Merci d’accuser réception pour la commande référencée.

\n

 

\n

 

\n

Bien cordialement,

\n

 

\n

Person PERSON

\n

Acheteuse

\n

Direct : +111 11 11 11 11

\n

person.person@person.com\n

\n

 

\n

Bonjour - Lifting Businesses
www.konecranes.fr

\n

 

\n

Suivez-nous sur les réseaux sociaux :

\n

\"Twitter\"\"linkedIN\"\"Youtube\"\"Cropped

\n

 

\n

Konecranes France

\n

Route de Paris

\n

Bâtiment B

\n

28100 DREUX

\n

FRANCE

\n

 

\n

 

\n
\n
______________________________________________________________________
This email has been scanned by the Symantec Email Security.cloud service.
______________________________________________________________________
", - "Merci d'imprimer l'Appel d'Offre ci-joint et de le renvoyer signé dans l\n es 3 jours à votre correspondant achats (hors consultation achats Prototype).\n\n Cordialement\n -- Disclaimer ------------------------------------\n Ce message ainsi que les eventuelles pieces jointes constituent une correspondance privee et confidentielle a l'attention exclusive du destinataire designe ci-dessus. Si vous n'etes pas le destinataire du present message ou une personne susceptible de pouvoir le lui delivrer, il vous est signifie que toute divulgation, distribution ou copie de cette transmission est strictement interdite. Si vous avez recu ce message par erreur, nous vous remercions d'en informer l'expediteur par telephone ou de lui retourner le present message, puis d'effacer immediatement ce message de votre systeme.\n\n *** This e-mail and any attachments is a confidential correspondence intended only for use of the individual or entity named above. If you are not the intended recipient or the agent responsible for delivering the message to the intended recipient, you are hereby notified that any disclosure, distribution or copying of this communication is strictly prohibited. If you have received this communication in error, please notify the sender by phone or by replying this message, and then delete this message from your system.\n\n ______________________________________________________________________\n This email has been scanned by the System Email Security.cloud service.\n ______________________________________________________________________" ] From 6fc5df3ba43735a97c310858f018f68960501a50 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 5 Sep 2019 02:39:56 +0100 Subject: [PATCH 20/58] multiple signatures --- email_reply_parser/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index f078904..44f4059 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -1,6 +1,6 @@ """ - email_reply_parser is a python library port of GitHub's Email Reply Parser. - For more information, visit https://github.com/zapier/email_reply_parser +email_reply_parser is a python library port of GitHub's Email Reply Parser. +For more information, visit https://github.com/zapier/email_reply_parser """ import os import re @@ -91,7 +91,7 @@ def en_support(self): self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' def fi_support(self): - self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Lähetetty (\w+\s*){1,3})') + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Lähetetty (\w+\s*){1,3})|(^Hanki Outlook for.*)') self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$') self.QUOTED_REGEX = re.compile(r'(>+)') self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' @@ -101,7 +101,7 @@ def set_regex(self): getattr(self, self.language+"_support")() self.default_quoted_header() else: - self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^(' + self.words_map[self.language]['Sent from'] + '|' self.words_map[self.default_language]['Sent from'] + ')(\w+\s*){1,3})') + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^(' + self.words_map[self.language]['Sent from'] + '|' + self.words_map[self.default_language]['Sent from'] + ')(\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_map[self.language]['wrote'] + ':$') self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_map[self.language]['wrote'] + \ From 6cc9442e6fd195b5272579330af8f67a9a6836a8 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 5 Sep 2019 22:18:25 +0100 Subject: [PATCH 21/58] multi quote header rgx --- email_reply_parser/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 44f4059..757afbf 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -92,9 +92,9 @@ def en_support(self): def fi_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Lähetetty (\w+\s*){1,3})|(^Hanki Outlook for.*)') - self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$') + self.QUOTE_HDR_REGEX = re.compile('.*kirjoitti:$') self.QUOTED_REGEX = re.compile(r'(>+)') - self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' + self._MULTI_QUOTE_HDR_REGEX = r'(.*kirjoitti:)' def set_regex(self): if hasattr(self, self.language+"_support"): From 2d600205cd92f5a15c4fbd7a2346c2a020561d0d Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 6 Sep 2019 00:04:32 +0100 Subject: [PATCH 22/58] html escaped brackets --- email_reply_parser/__init__.py | 10 +++++----- test/emails/emails.json | 6 ++++-- test/performance.py | 9 +++++---- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 757afbf..225ca6e 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -24,7 +24,7 @@ def read(self, text): text - A string email body Returns an EmailMessage instance """ - return EmailMessage(text.replace('\xa0', ' '), self.language, self.words_map).read() + return EmailMessage(text, self.language, self.words_map).read() def parse_reply(self, text): """ Provides the reply portion of email. @@ -87,14 +87,14 @@ def fr_support(self): def en_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$') - self.QUOTED_REGEX = re.compile(r'(>+)') + self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' def fi_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Lähetetty (\w+\s*){1,3})|(^Hanki Outlook for.*)') - self.QUOTE_HDR_REGEX = re.compile('.*kirjoitti:$') - self.QUOTED_REGEX = re.compile(r'(>+)') - self._MULTI_QUOTE_HDR_REGEX = r'(.*kirjoitti:)' + self.QUOTE_HDR_REGEX = re.compile('(.*kirjoitti:$)|([a-zA-Z0-9.:;<>& ]+?kirjoitti[a-zA-Z0-9.:;<>& ]+?kello(.+?):$)') + self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') + self._MULTI_QUOTE_HDR_REGEX = r'(?![a-zA-Z0-9.:;<>& ]+?kirjoitti(.+?)kirjoitti[a-zA-Z0-9.:;<>& ]*:$)([a-zA-Z0-9.:;<>& ]+?kirjoitti[a-zA-Z0-9.:;<>& ]+?:$)' def set_regex(self): if hasattr(self, self.language+"_support"): diff --git a/test/emails/emails.json b/test/emails/emails.json index 0d4f101..0371ace 100644 --- a/test/emails/emails.json +++ b/test/emails/emails.json @@ -1,2 +1,4 @@ -[ -] +["Hei \n\n T\u00e4m\u00e4 tuote (Lenovo IdeaPad 120S 14\" kannettava) on edelleenkin\n\nk\u00e4ytt\u00f6kelvoton. T\u00e4n\u00e4\u00e4n j\u00e4\u00e4tyi t\u00e4ysin asiakaspalaverissa monta kertaa,\n\nvain virtanapin painaminen pitk\u00e4\u00e4n pohjaan auttoi\n\nuudelleenk\u00e4ynnistykseen. \u00c4\u00e4rimm\u00e4isen kiusallista. \n\n Haluan rahat takaisin t\u00e4st\u00e4 ep\u00e4onnistuneesta ostoksesta. \n\n Arvostaisin, jos kertoisitte miten asiassa edet\u00e4\u00e4n. \n\n Parhain terveisin, \n\n *Matti Tommiska* \n\n CEO, Co-Founder\n\nXiphera Ltd.\n\nOtakaari 5, FIN-02150\n\nEspoo, Finland \n\n +358 40 541 0981\n\nmatti.tommiska@xiphera.com <mailto:matti.tommiska@xiphera.com> \n\n <http://www.xiphera.com>\n\nOn 3.6.2019 15.06, Verkkokauppa.com wrote:\n\n> Huoltotapauksesi on noudettavissa - 59389042\n\n>\n\n>\n\n> Verkkokauppa.com\n\n> <https://www.verkkokauppa.com/?utm_source=headerlink&utm_medium=orderemail&utm_campaign=serviceorder_ready>\n\n>\n\n>\n\n>\n\n> Huoltotapauksesi on noudettavissa - 59389042\n\n>\n\n> Hyv\u00e4 asiakkaamme,\n", +"Hei, \n\n Milloin teille tulee myyntii tuo Elfen Universal Stand -teline kannettaville- sek\u00e4 tablet-tietokoneille? \n\n Terveisin\n\nArto Mehto \n\n Hanki Outlook for Android<https://aka.ms/ghei36>\ntest test\nmore", +"Kiitos ja pahoittelut sekoilusta. \n\n Viikonloppuja! \n\n Minna \n\n Lähetetty iPhonesta \n\n > Verkkokauppa.com asiakaspalvelu <asiakaspalvelu@verkkokauppa.com> kirjoitti 23.8.2019 kello 11.10:", +"T\u00e4m\u00e4 on jatkoa edelliselle pyynn\u00f6lle #2457330 Helkama Saimi \n\n Verkkokauppa.com asiakaspalvelu kirjoitti 20.05.2019 kello 10:26:\n\nHuomenia!\n\nM\u00e4 nytt en en\u00e4\u00e4 tied\u00e4 mit\u00e4 teen tuon py\u00f6r\u00e4n kanssa :(\n"] diff --git a/test/performance.py b/test/performance.py index ae722be..ee6a271 100644 --- a/test/performance.py +++ b/test/performance.py @@ -22,17 +22,18 @@ def profile(): print(f'Total time: {time.time() - ground}') def verify(): - parser = EmailReplyParser(language='fr') + parser = EmailReplyParser(language='fi') texts = json.load(open('test/emails/emails.json')) texts = list(filter(lambda d: type(d) == str, texts)) parsed = [] for text in texts: # print(text) soup = BeautifulSoup(text, 'lxml') - text = soup.getText(' ') + text = soup.getText('\n') + print(text) + text = parser.parse_reply(text) parsed.append(text) - #text = parser.parse_reply(text) - #print(text) + print(text) if __name__ == '__main__': # profile() From 5c2e1cee49da6c113c7846e30d8716e1fd55fb4d Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 6 Sep 2019 00:16:03 +0100 Subject: [PATCH 23/58] multi quote fix --- email_reply_parser/__init__.py | 4 ++-- test/emails/emails.json | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 225ca6e..ac14bcb 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -92,9 +92,9 @@ def en_support(self): def fi_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Lähetetty (\w+\s*){1,3})|(^Hanki Outlook for.*)') - self.QUOTE_HDR_REGEX = re.compile('(.*kirjoitti:$)|([a-zA-Z0-9.:;<>& ]+?kirjoitti[a-zA-Z0-9.:;<>& ]+?kello(.+?):$)') + self.QUOTE_HDR_REGEX = re.compile('(.+?kirjoitti(.+?kello(.+?))?:$)') self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') - self._MULTI_QUOTE_HDR_REGEX = r'(?![a-zA-Z0-9.:;<>& ]+?kirjoitti(.+?)kirjoitti[a-zA-Z0-9.:;<>& ]*:$)([a-zA-Z0-9.:;<>& ]+?kirjoitti[a-zA-Z0-9.:;<>& ]+?:$)' + self._MULTI_QUOTE_HDR_REGEX = r'(?![a-zA-Z0-9.:;<>&@ ]+?kirjoitti(.+?)kirjoitti[a-zA-Z0-9.:;<>&@ ]*:$)([a-zA-Z0-9.:;<>&@ ]+?kirjoitti[a-zA-Z0-9.:;<>&@ ]*:$)' def set_regex(self): if hasattr(self, self.language+"_support"): diff --git a/test/emails/emails.json b/test/emails/emails.json index 0371ace..ebd0e26 100644 --- a/test/emails/emails.json +++ b/test/emails/emails.json @@ -1,4 +1,5 @@ ["Hei \n\n T\u00e4m\u00e4 tuote (Lenovo IdeaPad 120S 14\" kannettava) on edelleenkin\n\nk\u00e4ytt\u00f6kelvoton. T\u00e4n\u00e4\u00e4n j\u00e4\u00e4tyi t\u00e4ysin asiakaspalaverissa monta kertaa,\n\nvain virtanapin painaminen pitk\u00e4\u00e4n pohjaan auttoi\n\nuudelleenk\u00e4ynnistykseen. \u00c4\u00e4rimm\u00e4isen kiusallista. \n\n Haluan rahat takaisin t\u00e4st\u00e4 ep\u00e4onnistuneesta ostoksesta. \n\n Arvostaisin, jos kertoisitte miten asiassa edet\u00e4\u00e4n. \n\n Parhain terveisin, \n\n *Matti Tommiska* \n\n CEO, Co-Founder\n\nXiphera Ltd.\n\nOtakaari 5, FIN-02150\n\nEspoo, Finland \n\n +358 40 541 0981\n\nmatti.tommiska@xiphera.com <mailto:matti.tommiska@xiphera.com> \n\n <http://www.xiphera.com>\n\nOn 3.6.2019 15.06, Verkkokauppa.com wrote:\n\n> Huoltotapauksesi on noudettavissa - 59389042\n\n>\n\n>\n\n> Verkkokauppa.com\n\n> <https://www.verkkokauppa.com/?utm_source=headerlink&utm_medium=orderemail&utm_campaign=serviceorder_ready>\n\n>\n\n>\n\n>\n\n> Huoltotapauksesi on noudettavissa - 59389042\n\n>\n\n> Hyv\u00e4 asiakkaamme,\n", "Hei, \n\n Milloin teille tulee myyntii tuo Elfen Universal Stand -teline kannettaville- sek\u00e4 tablet-tietokoneille? \n\n Terveisin\n\nArto Mehto \n\n Hanki Outlook for Android<https://aka.ms/ghei36>\ntest test\nmore", "Kiitos ja pahoittelut sekoilusta. \n\n Viikonloppuja! \n\n Minna \n\n Lähetetty iPhonesta \n\n > Verkkokauppa.com asiakaspalvelu <asiakaspalvelu@verkkokauppa.com> kirjoitti 23.8.2019 kello 11.10:", -"T\u00e4m\u00e4 on jatkoa edelliselle pyynn\u00f6lle #2457330 Helkama Saimi \n\n Verkkokauppa.com asiakaspalvelu kirjoitti 20.05.2019 kello 10:26:\n\nHuomenia!\n\nM\u00e4 nytt en en\u00e4\u00e4 tied\u00e4 mit\u00e4 teen tuon py\u00f6r\u00e4n kanssa :(\n"] +"T\u00e4m\u00e4 on jatkoa edelliselle pyynn\u00f6lle #2457330 Helkama Saimi \n\n Verkkokauppa.com asiakaspalvelu kirjoitti 20.05.2019 kello 10:26:\n\nHuomenia!\n\nM\u00e4 nytt en en\u00e4\u00e4 tied\u00e4 mit\u00e4 teen tuon py\u00f6r\u00e4n kanssa :(\n", +"Hei,\n\nTilaukseni tulee noutamaan Riikka Eklund.\n\nH\u00e4nell\u00e4 mukana tilausnumero \n\n Kiitos ja terveisin\n\nRobert Eklund \n\n Verkkokauppa.com kirjoitti 08.06.2019 kello 10:52:\n\n> Hyv\u00e4 asiakkaamme,\n\n> \n\n> Tilauksesi 60043384 on nyt valmiina.\n\n> Jos tilausta ei noudeta, peruntuu se automaattisesti lauantaina\n\n> 15.06.2019.\n\n> \n\n> Tilauksen maksutavan ollessa korttimaksu verkossa tilauksen voi\n\n> noutaa my\u00f6s muu kuin tilausvahvistuksessa mainittu henkil\u00f6.\n\n> Ilmoitathan t\u00e4ll\u00f6in meille sen henkil\u00f6n nimen, joka tilauksen noutaa.\n\n> Huomioithan, ett\u00e4 noudettaessa on esitett\u00e4v\u00e4 henkil\u00f6llisyystodistus\n\n> (passi, ajokortti tai henkil\u00f6kortti).\n\n> \n\n> \n\n> Voit seurata tilauksesi tilaa verkkosivuiltamme:\n\n> https://www.verkkokauppa.com/fi/orders/view/60043384\n\n> \n\n> \n\n> ** Tilauksesi tuotteet:\n\n> ------------------------------------------------------------\n\n> \n\n> # Noudettavissa Oulun myym\u00e4l\u00e4st\u00e4:\n\n> \n\n> Gecko Easyclick Cover Lenovo Tab 4 10\" -&gojakotelo, musta"] From 80d0bf8eced8a1dfc7338d37bff13aa6f7a05f14 Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 6 Sep 2019 00:29:13 +0100 Subject: [PATCH 24/58] syntax --- email_reply_parser/__init__.py | 4 +++- test/performance.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index ac14bcb..5527d6f 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -94,7 +94,7 @@ def fi_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Lähetetty (\w+\s*){1,3})|(^Hanki Outlook for.*)') self.QUOTE_HDR_REGEX = re.compile('(.+?kirjoitti(.+?kello(.+?))?:$)') self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') - self._MULTI_QUOTE_HDR_REGEX = r'(?![a-zA-Z0-9.:;<>&@ ]+?kirjoitti(.+?)kirjoitti[a-zA-Z0-9.:;<>&@ ]*:$)([a-zA-Z0-9.:;<>&@ ]+?kirjoitti[a-zA-Z0-9.:;<>&@ ]*:$)' + self._MULTI_QUOTE_HDR_REGEX = r'(?!(.+?)kirjoitti(.+?)kirjoitti.*:$)((.+?)kirjoitti.*:$)' def set_regex(self): if hasattr(self, self.language+"_support"): @@ -119,6 +119,8 @@ def read(self): is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text) if is_multi_quote_header: + import code + code.interact(local=locals()) self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()[0].replace('\n', ''), self.text) # Fix any outlook style replies, with the reply immediately above the signature boundary line diff --git a/test/performance.py b/test/performance.py index ee6a271..5c21f40 100644 --- a/test/performance.py +++ b/test/performance.py @@ -36,5 +36,4 @@ def verify(): print(text) if __name__ == '__main__': - # profile() verify() From 44c3b523da717bd0a8f979212ad83e28e2d8ba30 Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 6 Sep 2019 18:23:12 +0100 Subject: [PATCH 25/58] selected character s --- email_reply_parser/__init__.py | 9 ++++++--- test/emails/emails.json | 3 ++- test/performance.py | 14 +++++++------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 5527d6f..fe2c26f 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -57,7 +57,7 @@ def __init__(self, text, language, words_map): def default_quoted_header(self): self.QUOTED_REGEX = re.compile(r'(>+)') self.HEADER_REGEX = re.compile( - r'^\*?(' + self.words_map[self.language]['From'] + + r'^\[* ]?(' + self.words_map[self.language]['From'] + '|' + self.words_map[self.language]['Sent'] + '|' + self.words_map[self.language]['To'] + '|' + self.words_map[self.language]['Subject'] + @@ -94,7 +94,7 @@ def fi_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Lähetetty (\w+\s*){1,3})|(^Hanki Outlook for.*)') self.QUOTE_HDR_REGEX = re.compile('(.+?kirjoitti(.+?kello(.+?))?:$)') self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') - self._MULTI_QUOTE_HDR_REGEX = r'(?!(.+?)kirjoitti(.+?)kirjoitti.*:$)((.+?)kirjoitti.*:$)' + self._MULTI_QUOTE_HDR_REGEX = r'(?![a-zA-Z0-9.:;<>&@ ]+?kirjoitti(.+?)kirjoitti[a-zA-Z0-9.:;<>&@ ]*?:$)([a-zA-Z0-9.:;<>&@ ]+?kirjoitti[a-zA-Z0-9.:;<>&@ ]*?:$)' def set_regex(self): if hasattr(self, self.language+"_support"): @@ -119,6 +119,7 @@ def read(self): is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text) if is_multi_quote_header: + print('-'*100 + '\nMULTI_QUOTE_HDR_REGEX_MULTILINE\n' + '-'*100) import code code.interact(local=locals()) self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()[0].replace('\n', ''), self.text) @@ -157,7 +158,9 @@ def _scan_line(self, line): is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None is_quoted = self.QUOTED_REGEX.match(line) is not None is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None - + if is_quote_header: + import code + code.interact(local=locals()) if self.fragment: if self.SIG_REGEX.match(line.strip()): self.fragment.signature = True diff --git a/test/emails/emails.json b/test/emails/emails.json index ebd0e26..e6ad778 100644 --- a/test/emails/emails.json +++ b/test/emails/emails.json @@ -2,4 +2,5 @@ "Hei, \n\n Milloin teille tulee myyntii tuo Elfen Universal Stand -teline kannettaville- sek\u00e4 tablet-tietokoneille? \n\n Terveisin\n\nArto Mehto \n\n Hanki Outlook for Android<https://aka.ms/ghei36>\ntest test\nmore", "Kiitos ja pahoittelut sekoilusta. \n\n Viikonloppuja! \n\n Minna \n\n Lähetetty iPhonesta \n\n > Verkkokauppa.com asiakaspalvelu <asiakaspalvelu@verkkokauppa.com> kirjoitti 23.8.2019 kello 11.10:", "T\u00e4m\u00e4 on jatkoa edelliselle pyynn\u00f6lle #2457330 Helkama Saimi \n\n Verkkokauppa.com asiakaspalvelu kirjoitti 20.05.2019 kello 10:26:\n\nHuomenia!\n\nM\u00e4 nytt en en\u00e4\u00e4 tied\u00e4 mit\u00e4 teen tuon py\u00f6r\u00e4n kanssa :(\n", -"Hei,\n\nTilaukseni tulee noutamaan Riikka Eklund.\n\nH\u00e4nell\u00e4 mukana tilausnumero \n\n Kiitos ja terveisin\n\nRobert Eklund \n\n Verkkokauppa.com kirjoitti 08.06.2019 kello 10:52:\n\n> Hyv\u00e4 asiakkaamme,\n\n> \n\n> Tilauksesi 60043384 on nyt valmiina.\n\n> Jos tilausta ei noudeta, peruntuu se automaattisesti lauantaina\n\n> 15.06.2019.\n\n> \n\n> Tilauksen maksutavan ollessa korttimaksu verkossa tilauksen voi\n\n> noutaa my\u00f6s muu kuin tilausvahvistuksessa mainittu henkil\u00f6.\n\n> Ilmoitathan t\u00e4ll\u00f6in meille sen henkil\u00f6n nimen, joka tilauksen noutaa.\n\n> Huomioithan, ett\u00e4 noudettaessa on esitett\u00e4v\u00e4 henkil\u00f6llisyystodistus\n\n> (passi, ajokortti tai henkil\u00f6kortti).\n\n> \n\n> \n\n> Voit seurata tilauksesi tilaa verkkosivuiltamme:\n\n> https://www.verkkokauppa.com/fi/orders/view/60043384\n\n> \n\n> \n\n> ** Tilauksesi tuotteet:\n\n> ------------------------------------------------------------\n\n> \n\n> # Noudettavissa Oulun myym\u00e4l\u00e4st\u00e4:\n\n> \n\n> Gecko Easyclick Cover Lenovo Tab 4 10\" -&gojakotelo, musta"] +"Hei,\n\nTilaukseni tulee noutamaan Riikka Eklund.\n\nH\u00e4nell\u00e4 mukana tilausnumero \n\n Kiitos ja terveisin\n\nRobert Eklund \n\n Verkkokauppa.com kirjoitti 08.06.2019 kello 10:52:\n\n> Hyv\u00e4 asiakkaamme,\n\n> \n\n> Tilauksesi 60043384 on nyt valmiina.\n\n> Jos tilausta ei noudeta, peruntuu se automaattisesti lauantaina\n\n> 15.06.2019.\n\n> \n\n> Tilauksen maksutavan ollessa korttimaksu verkossa tilauksen voi\n\n> noutaa my\u00f6s muu kuin tilausvahvistuksessa mainittu henkil\u00f6.\n\n> Ilmoitathan t\u00e4ll\u00f6in meille sen henkil\u00f6n nimen, joka tilauksen noutaa.\n\n> Huomioithan, ett\u00e4 noudettaessa on esitett\u00e4v\u00e4 henkil\u00f6llisyystodistus\n\n> (passi, ajokortti tai henkil\u00f6kortti).\n\n> \n\n> \n\n> Voit seurata tilauksesi tilaa verkkosivuiltamme:\n\n> https://www.verkkokauppa.com/fi/orders/view/60043384\n\n> \n\n> \n\n> ** Tilauksesi tuotteet:\n\n> ------------------------------------------------------------\n\n> \n\n> # Noudettavissa Oulun myym\u00e4l\u00e4st\u00e4:\n\n> \n\n> Gecko Easyclick Cover Lenovo Tab 4 10\" -&gojakotelo, musta", +"Hei, \n\n Sain tilauksen 60019114 kotiini t\u00e4n\u00e4\u00e4n ja huomasin ett\u00e4 toinen tilaamistani kaapeleista on siten viallinen, ett\u00e4 toinen audiol\u00e4ht\u00f6 ja maa ovat yhteydess\u00e4, eli kaapeli on osittain oikosulussa. Voitteko l\u00e4hett\u00e4\u00e4 minulle uuden Fuj:tech 2,5 mm uros - 2,5 mm naaras -audiokaapelin tuon valmiiksi rikkin\u00e4isen tilalle? \n\n Yst\u00e4v\u00e4llisin terveisin:\n\nKonsta Leino \n\n L\u00e4hett\u00e4j\u00e4: Verkkokauppa.com <asiakaspalvelu@verkkokauppa.com>\n\nL\u00e4hetetty: keskiviikko 5. kes\u00e4kuuta 2019 11.38\n\nVastaanottaja: konsta.leino9@hotmail.com\n\nAihe: Kiitos tilauksestasi - 60019114 \n\n [Verkkokauppa.com] <https://www.verkkokauppa.com/?utm_source=headerlink&utm_medium=orderemail&utm_campaign=order_created>\n\nKiitos tilauksestasi - 60019114 \n\n T\u00e4m\u00e4 on automaattinen vahvistusviesti tilauksestasi. Tilausnumerosi on 60019114. Mik\u00e4li huomaat virheit\u00e4 l\u00e4hetysluettelossa tai sinulla on kysytt\u00e4v\u00e4\u00e4, otathan yhteytt\u00e4 asiakaspalveluumme. \n\n Voit seurata tilauksesi tilaa verkkosivuiltamme:\n\nhttps://www.verkkokauppa.com/fi/orders/view/60019114<https://www.verkkokauppa.com/fi/orders/view/60019114?utm_source=orderlink&utm_medium=orderemail&utm_campaign=order_created> \n\n [https://cdn-b.verkkokauppa.com/45/images/33/2_106137-400x400.jpg]"] diff --git a/test/performance.py b/test/performance.py index 5c21f40..35f2c31 100644 --- a/test/performance.py +++ b/test/performance.py @@ -27,13 +27,13 @@ def verify(): texts = list(filter(lambda d: type(d) == str, texts)) parsed = [] for text in texts: - # print(text) - soup = BeautifulSoup(text, 'lxml') - text = soup.getText('\n') - print(text) - text = parser.parse_reply(text) - parsed.append(text) - print(text) + if text.startswith('Kiitos'): + print('-'*100) + soup = BeautifulSoup(text, 'lxml') + text = soup.getText('\n') + text = parser.parse_reply(text) + parsed.append(text) + print(text) if __name__ == '__main__': verify() From 1a9dc5231ce7a004546d9cf6c046cdd9e7c977da Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 6 Sep 2019 18:25:41 +0100 Subject: [PATCH 26/58] rm --- email_reply_parser/__init__.py | 6 ------ test/emails/emails.json | 7 +------ test/performance.py | 13 ++++++------- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index fe2c26f..ed50e0e 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -119,9 +119,6 @@ def read(self): is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text) if is_multi_quote_header: - print('-'*100 + '\nMULTI_QUOTE_HDR_REGEX_MULTILINE\n' + '-'*100) - import code - code.interact(local=locals()) self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()[0].replace('\n', ''), self.text) # Fix any outlook style replies, with the reply immediately above the signature boundary line @@ -158,9 +155,6 @@ def _scan_line(self, line): is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None is_quoted = self.QUOTED_REGEX.match(line) is not None is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None - if is_quote_header: - import code - code.interact(local=locals()) if self.fragment: if self.SIG_REGEX.match(line.strip()): self.fragment.signature = True diff --git a/test/emails/emails.json b/test/emails/emails.json index e6ad778..fe51488 100644 --- a/test/emails/emails.json +++ b/test/emails/emails.json @@ -1,6 +1 @@ -["Hei \n\n T\u00e4m\u00e4 tuote (Lenovo IdeaPad 120S 14\" kannettava) on edelleenkin\n\nk\u00e4ytt\u00f6kelvoton. T\u00e4n\u00e4\u00e4n j\u00e4\u00e4tyi t\u00e4ysin asiakaspalaverissa monta kertaa,\n\nvain virtanapin painaminen pitk\u00e4\u00e4n pohjaan auttoi\n\nuudelleenk\u00e4ynnistykseen. \u00c4\u00e4rimm\u00e4isen kiusallista. \n\n Haluan rahat takaisin t\u00e4st\u00e4 ep\u00e4onnistuneesta ostoksesta. \n\n Arvostaisin, jos kertoisitte miten asiassa edet\u00e4\u00e4n. \n\n Parhain terveisin, \n\n *Matti Tommiska* \n\n CEO, Co-Founder\n\nXiphera Ltd.\n\nOtakaari 5, FIN-02150\n\nEspoo, Finland \n\n +358 40 541 0981\n\nmatti.tommiska@xiphera.com <mailto:matti.tommiska@xiphera.com> \n\n <http://www.xiphera.com>\n\nOn 3.6.2019 15.06, Verkkokauppa.com wrote:\n\n> Huoltotapauksesi on noudettavissa - 59389042\n\n>\n\n>\n\n> Verkkokauppa.com\n\n> <https://www.verkkokauppa.com/?utm_source=headerlink&utm_medium=orderemail&utm_campaign=serviceorder_ready>\n\n>\n\n>\n\n>\n\n> Huoltotapauksesi on noudettavissa - 59389042\n\n>\n\n> Hyv\u00e4 asiakkaamme,\n", -"Hei, \n\n Milloin teille tulee myyntii tuo Elfen Universal Stand -teline kannettaville- sek\u00e4 tablet-tietokoneille? \n\n Terveisin\n\nArto Mehto \n\n Hanki Outlook for Android<https://aka.ms/ghei36>\ntest test\nmore", -"Kiitos ja pahoittelut sekoilusta. \n\n Viikonloppuja! \n\n Minna \n\n Lähetetty iPhonesta \n\n > Verkkokauppa.com asiakaspalvelu <asiakaspalvelu@verkkokauppa.com> kirjoitti 23.8.2019 kello 11.10:", -"T\u00e4m\u00e4 on jatkoa edelliselle pyynn\u00f6lle #2457330 Helkama Saimi \n\n Verkkokauppa.com asiakaspalvelu kirjoitti 20.05.2019 kello 10:26:\n\nHuomenia!\n\nM\u00e4 nytt en en\u00e4\u00e4 tied\u00e4 mit\u00e4 teen tuon py\u00f6r\u00e4n kanssa :(\n", -"Hei,\n\nTilaukseni tulee noutamaan Riikka Eklund.\n\nH\u00e4nell\u00e4 mukana tilausnumero \n\n Kiitos ja terveisin\n\nRobert Eklund \n\n Verkkokauppa.com kirjoitti 08.06.2019 kello 10:52:\n\n> Hyv\u00e4 asiakkaamme,\n\n> \n\n> Tilauksesi 60043384 on nyt valmiina.\n\n> Jos tilausta ei noudeta, peruntuu se automaattisesti lauantaina\n\n> 15.06.2019.\n\n> \n\n> Tilauksen maksutavan ollessa korttimaksu verkossa tilauksen voi\n\n> noutaa my\u00f6s muu kuin tilausvahvistuksessa mainittu henkil\u00f6.\n\n> Ilmoitathan t\u00e4ll\u00f6in meille sen henkil\u00f6n nimen, joka tilauksen noutaa.\n\n> Huomioithan, ett\u00e4 noudettaessa on esitett\u00e4v\u00e4 henkil\u00f6llisyystodistus\n\n> (passi, ajokortti tai henkil\u00f6kortti).\n\n> \n\n> \n\n> Voit seurata tilauksesi tilaa verkkosivuiltamme:\n\n> https://www.verkkokauppa.com/fi/orders/view/60043384\n\n> \n\n> \n\n> ** Tilauksesi tuotteet:\n\n> ------------------------------------------------------------\n\n> \n\n> # Noudettavissa Oulun myym\u00e4l\u00e4st\u00e4:\n\n> \n\n> Gecko Easyclick Cover Lenovo Tab 4 10\" -&gojakotelo, musta", -"Hei, \n\n Sain tilauksen 60019114 kotiini t\u00e4n\u00e4\u00e4n ja huomasin ett\u00e4 toinen tilaamistani kaapeleista on siten viallinen, ett\u00e4 toinen audiol\u00e4ht\u00f6 ja maa ovat yhteydess\u00e4, eli kaapeli on osittain oikosulussa. Voitteko l\u00e4hett\u00e4\u00e4 minulle uuden Fuj:tech 2,5 mm uros - 2,5 mm naaras -audiokaapelin tuon valmiiksi rikkin\u00e4isen tilalle? \n\n Yst\u00e4v\u00e4llisin terveisin:\n\nKonsta Leino \n\n L\u00e4hett\u00e4j\u00e4: Verkkokauppa.com <asiakaspalvelu@verkkokauppa.com>\n\nL\u00e4hetetty: keskiviikko 5. kes\u00e4kuuta 2019 11.38\n\nVastaanottaja: konsta.leino9@hotmail.com\n\nAihe: Kiitos tilauksestasi - 60019114 \n\n [Verkkokauppa.com] <https://www.verkkokauppa.com/?utm_source=headerlink&utm_medium=orderemail&utm_campaign=order_created>\n\nKiitos tilauksestasi - 60019114 \n\n T\u00e4m\u00e4 on automaattinen vahvistusviesti tilauksestasi. Tilausnumerosi on 60019114. Mik\u00e4li huomaat virheit\u00e4 l\u00e4hetysluettelossa tai sinulla on kysytt\u00e4v\u00e4\u00e4, otathan yhteytt\u00e4 asiakaspalveluumme. \n\n Voit seurata tilauksesi tilaa verkkosivuiltamme:\n\nhttps://www.verkkokauppa.com/fi/orders/view/60019114<https://www.verkkokauppa.com/fi/orders/view/60019114?utm_source=orderlink&utm_medium=orderemail&utm_campaign=order_created> \n\n [https://cdn-b.verkkokauppa.com/45/images/33/2_106137-400x400.jpg]"] +[] diff --git a/test/performance.py b/test/performance.py index 35f2c31..424574e 100644 --- a/test/performance.py +++ b/test/performance.py @@ -27,13 +27,12 @@ def verify(): texts = list(filter(lambda d: type(d) == str, texts)) parsed = [] for text in texts: - if text.startswith('Kiitos'): - print('-'*100) - soup = BeautifulSoup(text, 'lxml') - text = soup.getText('\n') - text = parser.parse_reply(text) - parsed.append(text) - print(text) + print('-'*100) + soup = BeautifulSoup(text, 'lxml') + text = soup.getText('\n') + text = parser.parse_reply(text) + parsed.append(text) + print(text) if __name__ == '__main__': verify() From 15b03cd08e9a868b4a94b124c999c7bf1679efde Mon Sep 17 00:00:00 2001 From: atc0m Date: Mon, 9 Sep 2019 11:41:04 +0100 Subject: [PATCH 27/58] single quote header --- email_reply_parser/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index ed50e0e..5ff0fa3 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -92,9 +92,9 @@ def en_support(self): def fi_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Lähetetty (\w+\s*){1,3})|(^Hanki Outlook for.*)') - self.QUOTE_HDR_REGEX = re.compile('(.+?kirjoitti(.+?kello(.+?))?:$)') + self.QUOTE_HDR_REGEX = re.compile('(.+?kirjoitti(.+?kello.+?)?:)') self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') - self._MULTI_QUOTE_HDR_REGEX = r'(?![a-zA-Z0-9.:;<>&@ ]+?kirjoitti(.+?)kirjoitti[a-zA-Z0-9.:;<>&@ ]*?:$)([a-zA-Z0-9.:;<>&@ ]+?kirjoitti[a-zA-Z0-9.:;<>&@ ]*?:$)' + self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?kirjoitti.+?kirjoitti[a-zA-Z0-9.:;<>()&@ ]*:$)((.+?)kirjoitti[a-zA-Z0-9.:;<>()&@ ]*:$)' def set_regex(self): if hasattr(self, self.language+"_support"): @@ -116,7 +116,6 @@ def read(self): """ self.found_visible = False - is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text) if is_multi_quote_header: self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()[0].replace('\n', ''), self.text) From ee0313954dbc061080f4cb13a1bdef63d95283fd Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 26 Sep 2019 16:52:57 +0100 Subject: [PATCH 28/58] whitespace --- email_reply_parser/__init__.py | 4 ++-- test/performance.py | 21 ++++++++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 5ff0fa3..e27d285 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -86,7 +86,7 @@ def fr_support(self): def en_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') - self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$') + self.QUOTE_HDR_REGEX = re.compile('\s*On.*wrote:$') self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' @@ -129,7 +129,7 @@ def read(self): for line in self.lines: if line.strip(): - self._scan_line(line) + self._scan_line(line.strip()) self._finish_fragment() diff --git a/test/performance.py b/test/performance.py index 424574e..108ddfa 100644 --- a/test/performance.py +++ b/test/performance.py @@ -1,4 +1,4 @@ -# import pandas as pd +import pandas as pd # import numpy as np import json import time @@ -6,7 +6,7 @@ from email_reply_parser import EmailReplyParser def profile(): - df = pd.DataFrame.from_csv('test.csv') + df = pd.read_csv('test.csv') ground = time.time() content = df.content.values[np.argmax([len(d) for d in df.content.values])] start = time.time() @@ -34,5 +34,20 @@ def verify(): parsed.append(text) print(text) +def parse_df(): + parser = EmailReplyParser(language='en') + path = 'test/emails/zipwrotetest.csv' + df = pd.read_csv(path) + parsed = [] + for text in df.sentence.values: + soup = BeautifulSoup(text, 'lxml') + text = soup.getText('\n') + text = parser.parse_reply(text) + parsed.append(text) + df = df.assign(clean=parsed) + df.to_csv(path) + import code + code.interact(local=locals()) + if __name__ == '__main__': - verify() + parse_df() From 4360c6c2f589d70325ac1a6869b13218ed9de634 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 14 Nov 2019 17:10:34 +0000 Subject: [PATCH 29/58] omit sigs, fix header regex --- email_reply_parser/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index e27d285..47ee051 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -57,7 +57,7 @@ def __init__(self, text, language, words_map): def default_quoted_header(self): self.QUOTED_REGEX = re.compile(r'(>+)') self.HEADER_REGEX = re.compile( - r'^\[* ]?(' + self.words_map[self.language]['From'] + + r'^[* ]?(' + self.words_map[self.language]['From'] + '|' + self.words_map[self.language]['Sent'] + '|' + self.words_map[self.language]['To'] + '|' + self.words_map[self.language]['Subject'] + @@ -143,7 +143,7 @@ def reply(self): """ reply = [] for f in self.fragments: - if not (f.hidden or f.quoted): + if not (f.hidden or f.quoted or f.signature): reply.append(f.content) return '\n'.join(reply) From 9b8bf601dba81b492c64ffa6a946b196ce364d19 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 14 Nov 2019 19:51:46 +0000 Subject: [PATCH 30/58] signature appended to hidden fragment --- email_reply_parser/__init__.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 47ee051..5b52f3d 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -85,7 +85,7 @@ def fr_support(self): self._MULTI_QUOTE_HDR_REGEX = r'(?!Le.*Le\s.+?a écrit.*>:)(Le\s(.+?)a écrit.*>:)' def en_support(self): - self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from (\w+\s*){1,6})') self.QUOTE_HDR_REGEX = re.compile('\s*On.*wrote:$') self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' @@ -154,15 +154,12 @@ def _scan_line(self, line): is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None is_quoted = self.QUOTED_REGEX.match(line) is not None is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None - if self.fragment: - if self.SIG_REGEX.match(line.strip()): + if self.fragment and self.SIG_REGEX.match(line.strip()): self.fragment.signature = True - self._finish_fragment() - - if self.fragment \ + self.fragment.lines.append(line) + elif self.fragment \ and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or (self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))): - self.fragment.lines.append(line) else: self._finish_fragment() From e52ba6d7b955c8ba89d2b95b8f08adfdffc27cea Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 15 Nov 2019 16:18:19 +0000 Subject: [PATCH 31/58] finish fragment after signature --- email_reply_parser/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 5b52f3d..79d7c3c 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -132,7 +132,6 @@ def read(self): self._scan_line(line.strip()) self._finish_fragment() - self.fragments.reverse() return self @@ -155,8 +154,9 @@ def _scan_line(self, line): is_quoted = self.QUOTED_REGEX.match(line) is not None is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None if self.fragment and self.SIG_REGEX.match(line.strip()): - self.fragment.signature = True - self.fragment.lines.append(line) + self.fragment.signature = True + self.fragment.lines.append(line) + self._finish_fragment() elif self.fragment \ and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or (self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))): From 915f2c0ffc4fc3954b72f28a7c6a10be4c9caec9 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 23 Jan 2020 15:27:39 +0000 Subject: [PATCH 32/58] test txts --- test/performance.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/test/performance.py b/test/performance.py index 108ddfa..ac7825d 100644 --- a/test/performance.py +++ b/test/performance.py @@ -49,5 +49,26 @@ def parse_df(): import code code.interact(local=locals()) +def parse_json(): + parser = EmailReplyParser(language='en') + with open('english.json', 'rb') as fl: + messages = json.load(fl) + parsed = [] + for text in messages: + soup = BeautifulSoup(text, 'lxml') + text = soup.getText('\n') + text = parser.parse_reply(text) + parsed.append(text) + import code + code.interact(local=locals()) + +def parse_text(): + parser = EmailReplyParser(language='en') + with open('test/emails/email_1_6.txt', 'r') as fl: + message = fl.read() + text = parser.parse_reply(message) + print(text) + if __name__ == '__main__': - parse_df() + parse_json() + # parse_text() From 2851339da519cc440a00520d4b2bf7ac3949a759 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 23 Jan 2020 15:29:07 +0000 Subject: [PATCH 33/58] ignore json --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1e9728d..afc0a75 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,4 @@ dist/ dist/* *.csv __pycache__/ -customer_emails.json +*.json From 6347b2b4df4601b3a8617a34202fc62d157d3553 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 23 Jan 2020 18:54:07 +0000 Subject: [PATCH 34/58] warnings regex --- email_reply_parser/__init__.py | 5 ++++ test/emails/caution.txt | 50 ++++++++++++++++++++++++++++++++++ test/performance.py | 4 +-- 3 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 test/emails/caution.txt diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 79d7c3c..93e785a 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -64,6 +64,9 @@ def default_quoted_header(self): ')\s*:\*? .+|.+(mailto:).+' ) + def warnings(self): + self.WARNING_REGEX = re.compile(r'CAUTION: [a-zA-Z0-9.,?!\' ]*') + def nl_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') self.QUOTE_HDR_REGEX = re.compile('Op.*schreef.*>:$') @@ -106,6 +109,7 @@ def set_regex(self): self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_map[self.language]['wrote'] + \ ':)(On\s(.+?)' + self.words_map[self.language]['wrote'] + ':)' + self.warnings() self.MULTI_QUOTE_HDR_REGEX = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE) self.MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL) @@ -123,6 +127,7 @@ def read(self): # Fix any outlook style replies, with the reply immediately above the signature boundary line # See email_2_2.txt for an example self.text = re.sub('([^\n])(?=\n ?[_-]{7,})', '\\1\n', self.text, re.MULTILINE) + self.text = re.sub(self.WARNING_REGEX, '\n', self.text) self.lines = self.text.split('\n') self.lines.reverse() diff --git a/test/emails/caution.txt b/test/emails/caution.txt new file mode 100644 index 0000000..67c69bb --- /dev/null +++ b/test/emails/caution.txt @@ -0,0 +1,50 @@ +CAUTION: This email originated from outside of this company. Do not click links or open attachments unless you recognize the sender and know the content is safe. + + + +Hi lads's Team, + + +Below THIS and THAT file for 31st August 2019 is still not available on "lads.lads.com" could you please check and advise us ASAP. + +pfg.Zip +pfg02.Zip + + +part01_07.Zip +_part02_07.Zip +_part03_07.Zip +_part04_07.Zip +_part05_07.Zip +_part06_07.Zip +_part07_07.Zip + + +job at our end. + + +Thanks, +___________________ +HAHA LOLO +Markets Application Production Services - Reference Data +ROLE +Bank of LADS - LADS LADS +BUILDING 5B,HAHA - THIS THAT, COUNTRY +Direct: (+00)00-000-0000 Mobile: (+00)000000 +______________________________________________________________________________ + +To report an issue or request for technical assistance with Product Reference Data applications, please send email to LADS SUPPORT. This is the only OO being monitored by the Product Reference Data support team. No other DGs or Mailboxes are being actively monitored. +Please make a note of this to avoid any delays. + +Escalation: LADS MANAGEMENT +___________________________________________________________ + +---------------------------------------------------------------------- +This message w/attachments (message) is intended solely for the use of the intended recipient(s) and may contain information that is privileged, confidential or proprietary. If you are not an intended recipient, please notify the sender, and then please delete and destroy all copies and attachments, and be advised that any review or dissemination of, or the taking of any action in reliance on, the information contained in or attached to this message is prohibited. +Unless specifically indicated, this message is not an offer to sell or a solicitation of any investment products or other financial product or service, an official confirmation of any transaction, or an official statement of Sender. Subject to applicable law, Sender may intercept, monitor, review and retain e-communications (EC) traveling through its networks/systems and may produce any such EC to regulators, law enforcement, in litigation and as required by law. +The laws of the country of each sender/recipient may impact the handling of EC, and EC may be archived, supervised and produced in countries other than the country in which you are located. This message cannot be guaranteed to be secure or free of errors or viruses. Attachments that are part of this EC may have additional important disclosures and disclaimers, which you should read. By messaging with Sender you consent to the foregoing. +----------------------------------------- + +lads's monitors email communications through its networks for regulatory compliance purposes and to protect its customers, employees and business and where allowed to do so by applicable law. The information contained in this e-mail message, and any attachment thereto, is confidential and may not be disclosed without our express permission. If you are not the intended recipient or an employee or agent responsible for delivering this message to the intended recipient, you are hereby notified that you have received this message in error and that any review, dissemination, distribution or copying of this message, or any attachment thereto, in whole or in part, is strictly prohibited. If you have received this message in error, please immediately notify us by telephone, fax or e-mail and delete the message and all of its attachments. Every effort is made to keep our network free from viruses. You should, however, review this e-mail message, as well as any attachment thereto, for viruses. We take no responsibility and have no liability for any computer virus which may be transferred via this e-mail message. + +----------------------------------------- diff --git a/test/performance.py b/test/performance.py index ac7825d..127ea72 100644 --- a/test/performance.py +++ b/test/performance.py @@ -64,11 +64,11 @@ def parse_json(): def parse_text(): parser = EmailReplyParser(language='en') - with open('test/emails/email_1_6.txt', 'r') as fl: + with open('test/emails/caution.txt', 'r') as fl: message = fl.read() text = parser.parse_reply(message) print(text) if __name__ == '__main__': - parse_json() + parse_text() # parse_text() From 667192d1d8a883e608071fecbe324d5c2f5b79b5 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 23 Jan 2020 18:58:33 +0000 Subject: [PATCH 35/58] warnings --- email_reply_parser/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 93e785a..fa713ff 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -127,6 +127,7 @@ def read(self): # Fix any outlook style replies, with the reply immediately above the signature boundary line # See email_2_2.txt for an example self.text = re.sub('([^\n])(?=\n ?[_-]{7,})', '\\1\n', self.text, re.MULTILINE) + self.text = re.sub(self.WARNING_REGEX, '\n', self.text) self.lines = self.text.split('\n') From b68075fa4a83635c86b8f0c17eb88c5ed47ae468 Mon Sep 17 00:00:00 2001 From: atc0m Date: Wed, 29 Jan 2020 02:51:52 +0000 Subject: [PATCH 36/58] notice, do not reply --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index fa713ff..4157515 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -65,7 +65,7 @@ def default_quoted_header(self): ) def warnings(self): - self.WARNING_REGEX = re.compile(r'CAUTION: [a-zA-Z0-9.,?!\' ]*') + self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply) [a-zA-Z0-9.,?!-()@\' ]*') def nl_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') From e25be2235df246a7d1541ad1b7844b902c388b0f Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 30 Jan 2020 12:32:58 +0000 Subject: [PATCH 37/58] more warnings --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 4157515..e13c1a8 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -65,7 +65,7 @@ def default_quoted_header(self): ) def warnings(self): - self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply) [a-zA-Z0-9.,?!-()@\' ]*') + self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail) [a-zA-Z0-9.,?!-()@\' ]*') def nl_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') From 27a92f2113bab38efb682ae388b8d7f7d5ad3159 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 30 Jan 2020 17:11:21 +0000 Subject: [PATCH 38/58] character syntax fix --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index e13c1a8..8c19a22 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -65,7 +65,7 @@ def default_quoted_header(self): ) def warnings(self): - self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail) [a-zA-Z0-9.,?!-()@\' ]*') + self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail) [a-zA-Z0-9.,?!()@/\' \-]*') def nl_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') From a746d245613c009aa4a8805b50a41726fee15b6d Mon Sep 17 00:00:00 2001 From: atc0m Date: Tue, 18 Feb 2020 17:34:13 +0000 Subject: [PATCH 39/58] warnings extended --- email_reply_parser/__init__.py | 2 +- test/emails/caution2.txt | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 test/emails/caution2.txt diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 8c19a22..42ae850 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -65,7 +65,7 @@ def default_quoted_header(self): ) def warnings(self): - self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail) [a-zA-Z0-9.,?!()@/\' \-]*') + self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail|The information contained|This email has been scanned) [a-zA-Z0-9:;.,?!()@/\' \-]*') def nl_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') diff --git a/test/emails/caution2.txt b/test/emails/caution2.txt new file mode 100644 index 0000000..8624943 --- /dev/null +++ b/test/emails/caution2.txt @@ -0,0 +1,12 @@ +Hello, + +I am trying to place an order and it keeps tell me my order cannot be processed at this time. I tried using two different consultants and it still won't work. I am not sure if it's a technical problem. My order is time sensitive. Thank you for your help! + +Person Person. + + +Disclaimer + +The information contained in this communication from the sender is confidential. It is intended solely for use by the recipient and others authorized to receive it. If you are not the recipient, you are hereby notified that any disclosure, copying, distribution or taking action in relation of the contents of this information is strictly prohibited and may be unlawful. + +This email has been scanned for viruses and malware, and may have been automatically archived by Thing Ltd, an innovator in Software as a Service (SaaS) for business. Providing a safer and more useful place for your human generated data. Specializing in; Security, archiving and compliance. To find out more Click Here (http://www.thisthat.com/things/) . From 7f742e6f5919b339ed4c207ccb1bda2844e2efaa Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 5 Mar 2020 03:01:00 +0000 Subject: [PATCH 40/58] extend --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 42ae850..98ae414 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -65,7 +65,7 @@ def default_quoted_header(self): ) def warnings(self): - self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail|The information contained|This email has been scanned) [a-zA-Z0-9:;.,?!()@/\' \-]*') + self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail|The information contained|This email has been scanned|This message and any associated files) [a-zA-Z0-9:;.,?!()@/\'\" \-]*') def nl_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') From ceb175d21be5806f0474f85c7d7ee1d580a73449 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 5 Mar 2020 04:19:40 +0000 Subject: [PATCH 41/58] confidential --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 98ae414..6f7f9d3 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -65,7 +65,7 @@ def default_quoted_header(self): ) def warnings(self): - self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail|The information contained|This email has been scanned|This message and any associated files) [a-zA-Z0-9:;.,?!()@/\'\" \-]*') + self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail|The information contained|This email has been scanned|This message and any associated files|This message is for the recipients|The [cC]ontents are confidential) [a-zA-Z0-9:;.,?!()@/\'\" \-]*') def nl_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') From 36ccc986400e196f6dfa7c7b37985ff67b26512e Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 5 Mar 2020 05:28:01 +0000 Subject: [PATCH 42/58] communication --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 98ae414..692c39b 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -65,7 +65,7 @@ def default_quoted_header(self): ) def warnings(self): - self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail|The information contained|This email has been scanned|This message and any associated files) [a-zA-Z0-9:;.,?!()@/\'\" \-]*') + self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail|The information contained|This email has been scanned|This message and any associated files|This message is for the recipients|The [cC]ontents are confidential|This communication with its contents) [a-zA-Z0-9:;.,?!()@/\'\" \-]*') def nl_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') From b7da5c813a2069a31ed45681254712ef3bed1500 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 5 Mar 2020 05:32:04 +0000 Subject: [PATCH 43/58] extra quote --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 692c39b..c749007 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -65,7 +65,7 @@ def default_quoted_header(self): ) def warnings(self): - self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail|The information contained|This email has been scanned|This message and any associated files|This message is for the recipients|The [cC]ontents are confidential|This communication with its contents) [a-zA-Z0-9:;.,?!()@/\'\" \-]*') + self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail|The information contained|This email has been scanned|This message and any associated files|This message is for the recipients|The [cC]ontents are confidential|This communication with its contents) [a-zA-Z0-9:;.,?!()@/\'\"“ \-]*') def nl_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') From 0017f9fb757d15b6c771ef90d71d36b009a5493d Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 5 Mar 2020 05:40:41 +0000 Subject: [PATCH 44/58] more quote signs --- email_reply_parser/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index c749007..a9c3993 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -65,7 +65,7 @@ def default_quoted_header(self): ) def warnings(self): - self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail|The information contained|This email has been scanned|This message and any associated files|This message is for the recipients|The [cC]ontents are confidential|This communication with its contents) [a-zA-Z0-9:;.,?!()@/\'\"“ \-]*') + self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail|The information contained|This email has been scanned|This message and any associated files|This message is for the recipients|The [cC]ontents are confidential|This communication with its contents) [a-zA-Z0-9:;.,?!()@/\'\"\“\” \-]*') def nl_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') @@ -123,13 +123,13 @@ def read(self): is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text) if is_multi_quote_header: self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()[0].replace('\n', ''), self.text) - + import code + code.interact(local=locals()) # Fix any outlook style replies, with the reply immediately above the signature boundary line # See email_2_2.txt for an example self.text = re.sub('([^\n])(?=\n ?[_-]{7,})', '\\1\n', self.text, re.MULTILINE) self.text = re.sub(self.WARNING_REGEX, '\n', self.text) - self.lines = self.text.split('\n') self.lines.reverse() From dd51d4ec9b093ec63841ba84a0bf3fdab6c05018 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 5 Mar 2020 05:41:54 +0000 Subject: [PATCH 45/58] rm stop --- email_reply_parser/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index a9c3993..5cfe97d 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -123,8 +123,6 @@ def read(self): is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text) if is_multi_quote_header: self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()[0].replace('\n', ''), self.text) - import code - code.interact(local=locals()) # Fix any outlook style replies, with the reply immediately above the signature boundary line # See email_2_2.txt for an example self.text = re.sub('([^\n])(?=\n ?[_-]{7,})', '\\1\n', self.text, re.MULTILINE) From be17320bbfd066862fb008d7cd5bc5c707ccaa39 Mon Sep 17 00:00:00 2001 From: atc0m Date: Tue, 31 Mar 2020 21:19:09 +0100 Subject: [PATCH 46/58] chinese update --- email_reply_parser/__init__.py | 2 +- email_reply_parser/languages_support.json | 6 +++--- test/emails/chinese.txt | 22 ++++++++++++++++++++++ test/emails/chinese2.txt | 18 ++++++++++++++++++ 4 files changed, 44 insertions(+), 4 deletions(-) create mode 100644 test/emails/chinese.txt create mode 100644 test/emails/chinese2.txt diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 5cfe97d..1eccb76 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -108,7 +108,7 @@ def set_regex(self): self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_map[self.language]['wrote'] + ':$') self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_map[self.language]['wrote'] + \ - ':)(On\s(.+?)' + self.words_map[self.language]['wrote'] + ':)' + '\s*:\s*)(On\s(.+?)' + self.words_map[self.language]['wrote'] + ':)' self.warnings() self.MULTI_QUOTE_HDR_REGEX = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE) self.MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL) diff --git a/email_reply_parser/languages_support.json b/email_reply_parser/languages_support.json index 11fbaf4..24c0237 100644 --- a/email_reply_parser/languages_support.json +++ b/email_reply_parser/languages_support.json @@ -129,9 +129,9 @@ }, "zh": { "Sent from": "\u6765\u81ea", - "From": "\u4ece", - "To": "\u81f3", - "wrote": "\u5199", + "From": "\u4ece|\u53d1\u4ef6\u4eba", + "To": "\u81f3|\u6536\u4ef6\u4eba", + "wrote": "\u5199|\u5199\u9053", "Sent": "\u53d1\u9001", "Subject": "\u5b66\u79d1" }, diff --git a/test/emails/chinese.txt b/test/emails/chinese.txt new file mode 100644 index 0000000..340b62e --- /dev/null +++ b/test/emails/chinese.txt @@ -0,0 +1,22 @@ +Hi, I want to cancel my order, could you recall the package and return it? Thank you + +发件人: ""xyz.com"" +日期: 2020年3月6日 星期五 上午1:04 +收件人: ""zys@hotmail.com"" +主题: xyz.com - Order <111111111> has been shipped + +Shipping Confirmation + +Dear XYZ ZUS, + +Thank you for shopping at ZYS! + +Your order has been shipped. Your shipment details are shown below. + +Please note that it may take up to 24 hours for the shipping carrier to update the tracking information. + +If you have any questions you can contact us at + +ORDER + +SHIPPING diff --git a/test/emails/chinese2.txt b/test/emails/chinese2.txt new file mode 100644 index 0000000..fe6efa6 --- /dev/null +++ b/test/emails/chinese2.txt @@ -0,0 +1,18 @@ +Dear bbb +This is X. I just brought one sneaker and two slippers . Can I cancel my order please thanks + +XYZ Customer Care 于2020年3月9日 周一下午9:10写道: + +Thank you for shopping at SSENSE. Please allow us up to two business days to process your order. You’ll find a copy of your receipt and order information. + +MY ACCOUNT () CUSTOMER SUPPORT () + +Order Confirmation + +Dear XYZ XYZ, + +Thank you for shopping at ZYZ. Please allow us up to two business days to process your order. Once it’s been processed, you’ll receive a shipment confirmation email with your order’s tracking number. + +Below, you’ll find a copy of your receipt and order information. Please keep it for your records. + +\ From 696b6d1ca56a8979f0a63794e058f399008ab510 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 2 Apr 2020 17:11:34 +0100 Subject: [PATCH 47/58] french extension --- email_reply_parser/__init__.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 1eccb76..d2447c3 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -84,14 +84,14 @@ def fr_support(self): 'bonne journ[ée]e))', re.IGNORECASE ) - self.QUOTE_HDR_REGEX = re.compile('Le.*a écrit.*>:$') - self._MULTI_QUOTE_HDR_REGEX = r'(?!Le.*Le\s.+?a écrit.*>:)(Le\s(.+?)a écrit.*>:)' + self.QUOTE_HDR_REGEX = re.compile('Le.*a écrit.*[> ]:$') + self._MULTI_QUOTE_HDR_REGEX = r'(?!Le.*Le\s.+?a écrit[a-zA-Z0-9.:;<>()&@ ]*:)(Le\s(.+?)a écrit[a-zA-Z0-9.:;<>()&@ ]*:)' def en_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from (\w+\s*){1,6})') - self.QUOTE_HDR_REGEX = re.compile('\s*On.*wrote:$') + self.QUOTE_HDR_REGEX = re.compile('\s*On.*wrote\s*:$') self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') - self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' + self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote\s*:)(On\s(.+?)wrote\s*:)' def fi_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Lähetetty (\w+\s*){1,3})|(^Hanki Outlook for.*)') @@ -122,6 +122,8 @@ def read(self): self.found_visible = False is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text) if is_multi_quote_header: + import code + code.interact(local=locals()) self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()[0].replace('\n', ''), self.text) # Fix any outlook style replies, with the reply immediately above the signature boundary line # See email_2_2.txt for an example From a0cbc45acfe13a391a5db6b0d1391cc54dc0e942 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 2 Apr 2020 17:13:19 +0100 Subject: [PATCH 48/58] french extension --- email_reply_parser/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index d2447c3..5479fb3 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -122,8 +122,6 @@ def read(self): self.found_visible = False is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text) if is_multi_quote_header: - import code - code.interact(local=locals()) self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()[0].replace('\n', ''), self.text) # Fix any outlook style replies, with the reply immediately above the signature boundary line # See email_2_2.txt for an example From 62a81cab5930bf12b192d32082a149d0bfa2d5e9 Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 3 Apr 2020 13:13:29 +0100 Subject: [PATCH 49/58] japanese quoted header support --- email_reply_parser/__init__.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 5479fb3..78cc51e 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -85,7 +85,7 @@ def fr_support(self): re.IGNORECASE ) self.QUOTE_HDR_REGEX = re.compile('Le.*a écrit.*[> ]:$') - self._MULTI_QUOTE_HDR_REGEX = r'(?!Le.*Le\s.+?a écrit[a-zA-Z0-9.:;<>()&@ ]*:)(Le\s(.+?)a écrit[a-zA-Z0-9.:;<>()&@ ]*:)' + self._MULTI_QUOTE_HDR_REGEX = r'(?!Le.*Le\s.+?a écrit[a-zA-Z0-9.:;<>()&@ -]*:)(Le\s(.+?)a écrit[a-zA-Z0-9.:;<>()&@ -]*:)' def en_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from (\w+\s*){1,6})') @@ -93,11 +93,17 @@ def en_support(self): self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote\s*:)(On\s(.+?)wrote\s*:)' + def ja_support(self): + self.SIG_REGEX = re.compile(r'--|__|-\w') + self.QUOTE_HDR_REGEX = re.compile(r'[0-9]*年[0-9]*月[0-9]*日[\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\uFF00-\uFFEF\u4E00-\u9FAF\u2605-\u2606\u2190-\u2195\u203Ba-zA-Z0-9.:;<>()&@ -]*:?$') + self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') + self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote\s*:)(On\s(.+?)wrote\s*:)' # Dummy multiline: doesnt work for japanese due to BeautifulSoup insreting new lines before ":" character + def fi_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Lähetetty (\w+\s*){1,3})|(^Hanki Outlook for.*)') self.QUOTE_HDR_REGEX = re.compile('(.+?kirjoitti(.+?kello.+?)?:)') self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') - self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?kirjoitti.+?kirjoitti[a-zA-Z0-9.:;<>()&@ ]*:$)((.+?)kirjoitti[a-zA-Z0-9.:;<>()&@ ]*:$)' + self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?kirjoitti.+?kirjoitti[a-zA-Z0-9.:;<>()&@ -]*:$)((.+?)kirjoitti[a-zA-Z0-9.:;<>()&@ -]*:$)' def set_regex(self): if hasattr(self, self.language+"_support"): @@ -105,7 +111,7 @@ def set_regex(self): self.default_quoted_header() else: self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^(' + self.words_map[self.language]['Sent from'] + '|' + self.words_map[self.default_language]['Sent from'] + ')(\w+\s*){1,3})') - self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_map[self.language]['wrote'] + ':$') + self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_map[self.language]['wrote'] + '\s?:$') self.default_quoted_header() self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_map[self.language]['wrote'] + \ '\s*:\s*)(On\s(.+?)' + self.words_map[self.language]['wrote'] + ':)' From f8c51af6d189b21eb4a8aac9e5482f8ab31bdbd5 Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 3 Apr 2020 16:12:32 +0100 Subject: [PATCH 50/58] chinese sent from --- email_reply_parser/languages_support.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/languages_support.json b/email_reply_parser/languages_support.json index 24c0237..4f3962c 100644 --- a/email_reply_parser/languages_support.json +++ b/email_reply_parser/languages_support.json @@ -128,7 +128,7 @@ "Subject": "Aihe" }, "zh": { - "Sent from": "\u6765\u81ea", + "Sent from": "\u83b7\u53d6", "From": "\u4ece|\u53d1\u4ef6\u4eba", "To": "\u81f3|\u6536\u4ef6\u4eba", "wrote": "\u5199|\u5199\u9053", From 55ebfbc0572c03911caeaf8b6ef73eeb4ed4fe53 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 11 Jun 2020 04:44:42 +0100 Subject: [PATCH 51/58] add follow up to quote --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 78cc51e..0cfb6db 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -89,7 +89,7 @@ def fr_support(self): def en_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from (\w+\s*){1,6})') - self.QUOTE_HDR_REGEX = re.compile('\s*On.*wrote\s*:$') + self.QUOTE_HDR_REGEX = re.compile('\s*(On.*wrote\s*:|This is a follow-up to your previous request.*)$') self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote\s*:)(On\s(.+?)wrote\s*:)' From f24576be1cb01897b8905dd3d93960dbcfd0fc34 Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 19 Jun 2020 20:05:02 +0100 Subject: [PATCH 52/58] forward/multi header test --- test/emails/forward.txt | 6 ++++++ test/emails/multi_header.txt | 41 ++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 test/emails/forward.txt create mode 100644 test/emails/multi_header.txt diff --git a/test/emails/forward.txt b/test/emails/forward.txt new file mode 100644 index 0000000..a715ce3 --- /dev/null +++ b/test/emails/forward.txt @@ -0,0 +1,6 @@ +FW: YYY Arrival Notice XYZ - YYY ELA/XYZ ETA: 2020-06-08 +This is a follow-up to your previous request #12345 "RE: XYZ and Manifest amendm..." +Hello team, +Can I get 1 Arrival Notice without PU# and invoice? +Thank you. +Best regards diff --git a/test/emails/multi_header.txt b/test/emails/multi_header.txt new file mode 100644 index 0000000..8eeded0 --- /dev/null +++ b/test/emails/multi_header.txt @@ -0,0 +1,41 @@ +No problem. I’ll just start a new order. + +On May 30, 2020, at 4:24 PM, XYZ wrote: + + +Hi XYZ, + +Unfortunately, we are unable to add items to your order, but if you would like we can cancel your order and issue a full refund so that you may order again with your preferred selection of pastries. This may result in a later delivery date, but please let us know if you would like us to cancel your order and we will set that up for you. + + + +On May 30, 2020, 4:15 PM XYZ xyz@xyz.com wrote: + +No worries and thank you. I wanted to add a couple of new items to the same shipment. Would that Be possible or should I just order thru the website? + +On May 30, 2020, at 3:54 PM, XYZ wrote: + + +Hi XYZ, + +We apologize for the incorrect product and for any inconvenience this may have caused. + +We have placed a replacement order of 1 X for delivery on June 4, 2020. You will be receiving an email confirmation for this new order and your tracking number will be emailed 1-2 nights before the delivery date. + +Thank you for your patience! + + + +On May 30, 2020, 10:01 AM XYZ xyz@xyz.com wrote: + +Hello XYZ, + +Thank you for emailing the Team! + +We are writing to you to confirm that we have received your email. + +We apologize for any inconvenience and assure you that we will find a solution for any question, concern, or comment you may have. + +We appreciate your patience during these times. + +-Team From 4ec9dd75d5d7efba3f51b44f2d4ebfd73db10c72 Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 23 Oct 2020 02:46:50 +0200 Subject: [PATCH 53/58] confidentiality notice variations --- email_reply_parser/__init__.py | 35 +++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 0cfb6db..3bab715 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -65,10 +65,21 @@ def default_quoted_header(self): ) def warnings(self): - self.WARNING_REGEX = re.compile(r'(CAUTION:|Confidentiality Notice:|Please do not reply|This electronic mail|The information contained|This email has been scanned|This message and any associated files|This message is for the recipients|The [cC]ontents are confidential|This communication with its contents) [a-zA-Z0-9:;.,?!()@/\'\"\“\” \-]*') + self.WARNING_REGEX = re.compile( + r'(CAUTION:|NOTICE:|Confidentiality Notice:|Please do not reply|This electronic mail' + r'|Disclaimer: This message is intended' + r'|The information contained|This email has been scanned|This message and any associated files' + r'|This email and any files transmitted|This message is for the recipients' + r'|The information provided within this communication' + r'|This message (including any attachments) is intended' + r'|The [cC]ontents are confidential|This communication with its contents' + r'|Please consider the environment before printing this email) [a-zA-Z0-9:;.,?!()@&/\'\"\“\” \-]*' + ) def nl_support(self): - self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') + self.SIG_REGEX = re.compile( + r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})' + ) self.QUOTE_HDR_REGEX = re.compile('Op.*schreef.*>:$') self._MULTI_QUOTE_HDR_REGEX = r'(?!Op.*Op\s.+?schreef.*>:)(Op\s(.+?)schreef.*>:)' @@ -79,9 +90,9 @@ def de_support(self): def fr_support(self): self.SIG_REGEX = re.compile( - r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] \ - + '(\w+\s*){1,3})|(.*(cordialement|bonne r[ée]ception|salutations|cdlt|cdt|crdt|regards|best regard|' - 'bonne journ[ée]e))', + r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + + '(\w+\s*){1,3})|(.*(cordialement|bonne r[ée]ception|salutations' + r'|cdlt|cdt|crdt|regards|best regard|bonne journ[ée]e))', re.IGNORECASE ) self.QUOTE_HDR_REGEX = re.compile('Le.*a écrit.*[> ]:$') @@ -95,7 +106,9 @@ def en_support(self): def ja_support(self): self.SIG_REGEX = re.compile(r'--|__|-\w') - self.QUOTE_HDR_REGEX = re.compile(r'[0-9]*年[0-9]*月[0-9]*日[\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\uFF00-\uFFEF\u4E00-\u9FAF\u2605-\u2606\u2190-\u2195\u203Ba-zA-Z0-9.:;<>()&@ -]*:?$') + self.QUOTE_HDR_REGEX = re.compile( + r'[0-9]*年[0-9]*月[0-9]*日[\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\uFF00-\uFFEF\u4E00-\u9FAF\u2605-\u2606\u2190-\u2195\u203Ba-zA-Z0-9.:;<>()&@ -]*:?$' + ) self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote\s*:)(On\s(.+?)wrote\s*:)' # Dummy multiline: doesnt work for japanese due to BeautifulSoup insreting new lines before ":" character @@ -110,11 +123,15 @@ def set_regex(self): getattr(self, self.language+"_support")() self.default_quoted_header() else: - self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^(' + self.words_map[self.language]['Sent from'] + '|' + self.words_map[self.default_language]['Sent from'] + ')(\w+\s*){1,3})') + self.SIG_REGEX = re.compile( + r'(--|__|-\w)|(^(' + self.words_map[self.language]['Sent from'] + + '|' + self.words_map[self.default_language]['Sent from'] + + ')(\w+\s*){1,3})' + ) self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_map[self.language]['wrote'] + '\s?:$') self.default_quoted_header() - self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_map[self.language]['wrote'] + \ - '\s*:\s*)(On\s(.+?)' + self.words_map[self.language]['wrote'] + ':)' + self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_map[self.language]['wrote'] + + '\s*:\s*)(On\s(.+?)' + self.words_map[self.language]['wrote'] + ':)' self.warnings() self.MULTI_QUOTE_HDR_REGEX = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE) self.MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL) From 95365d3a7f955fd735f3ac93f7aeeade143a2b5d Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 23 Oct 2020 03:41:26 +0200 Subject: [PATCH 54/58] styling --- email_reply_parser/__init__.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 3bab715..6b08e04 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -57,11 +57,11 @@ def __init__(self, text, language, words_map): def default_quoted_header(self): self.QUOTED_REGEX = re.compile(r'(>+)') self.HEADER_REGEX = re.compile( - r'^[* ]?(' + self.words_map[self.language]['From'] + - '|' + self.words_map[self.language]['Sent'] + - '|' + self.words_map[self.language]['To'] + - '|' + self.words_map[self.language]['Subject'] + - ')\s*:\*? .+|.+(mailto:).+' + r'^[* ]?(' + self.words_map[self.language]['From'] \ + + '|' + self.words_map[self.language]['Sent'] \ + + '|' + self.words_map[self.language]['To'] \ + + '|' + self.words_map[self.language]['Subject'] \ + + ')\s*:\*? .+|.+(mailto:).+' ) def warnings(self): @@ -90,7 +90,7 @@ def de_support(self): def fr_support(self): self.SIG_REGEX = re.compile( - r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] \ + '(\w+\s*){1,3})|(.*(cordialement|bonne r[ée]ception|salutations' r'|cdlt|cdt|crdt|regards|best regard|bonne journ[ée]e))', re.IGNORECASE @@ -124,13 +124,13 @@ def set_regex(self): self.default_quoted_header() else: self.SIG_REGEX = re.compile( - r'(--|__|-\w)|(^(' + self.words_map[self.language]['Sent from'] - + '|' + self.words_map[self.default_language]['Sent from'] + r'(--|__|-\w)|(^(' + self.words_map[self.language]['Sent from'] \ + + '|' + self.words_map[self.default_language]['Sent from'] \ + ')(\w+\s*){1,3})' ) self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_map[self.language]['wrote'] + '\s?:$') self.default_quoted_header() - self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_map[self.language]['wrote'] + self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_map[self.language]['wrote'] \ + '\s*:\s*)(On\s(.+?)' + self.words_map[self.language]['wrote'] + ':)' self.warnings() self.MULTI_QUOTE_HDR_REGEX = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE) From 823534360f9fb9c16366d1809ffa536bd770d2af Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 23 Oct 2020 13:34:35 +0200 Subject: [PATCH 55/58] only replace follow up if it's not the first line --- email_reply_parser/__init__.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 6b08e04..dbda633 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -100,7 +100,7 @@ def fr_support(self): def en_support(self): self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from (\w+\s*){1,6})') - self.QUOTE_HDR_REGEX = re.compile('\s*(On.*wrote\s*:|This is a follow-up to your previous request.*)$') + self.QUOTE_HDR_REGEX = re.compile('\s*On.*wrote\s*:$') self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote\s*:)(On\s(.+?)wrote\s*:)' @@ -133,6 +133,7 @@ def set_regex(self): self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_map[self.language]['wrote'] \ + '\s*:\s*)(On\s(.+?)' + self.words_map[self.language]['wrote'] + ':)' self.warnings() + self.FOLLOW_UP_HDR_REGEX = re.compile(r'(? Date: Fri, 23 Oct 2020 13:37:54 +0200 Subject: [PATCH 56/58] strip whitespace --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index dbda633..2481baa 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -142,7 +142,7 @@ def read(self): and labels as a signature, quote, or hidden. Returns EmailMessage instance """ - + self.text = self.text.strip() self.found_visible = False is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text) if is_multi_quote_header: From 2d4beb10c6f127ddfa5556c78552583ff9ccb18a Mon Sep 17 00:00:00 2001 From: atc0m Date: Wed, 30 Dec 2020 16:15:14 +0100 Subject: [PATCH 57/58] webform subject fix --- email_reply_parser/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 2481baa..0029380 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -59,8 +59,7 @@ def default_quoted_header(self): self.HEADER_REGEX = re.compile( r'^[* ]?(' + self.words_map[self.language]['From'] \ + '|' + self.words_map[self.language]['Sent'] \ - + '|' + self.words_map[self.language]['To'] \ - + '|' + self.words_map[self.language]['Subject'] \ + + '|' + self.words_map[self.language]['To'] + ')\s*:\*? .+|.+(mailto:).+' ) @@ -68,6 +67,8 @@ def warnings(self): self.WARNING_REGEX = re.compile( r'(CAUTION:|NOTICE:|Confidentiality Notice:|Please do not reply|This electronic mail' r'|Disclaimer: This message is intended' + r'|This message and any attachments are solely' + r'|This email contains privileged information' r'|The information contained|This email has been scanned|This message and any associated files' r'|This email and any files transmitted|This message is for the recipients' r'|The information provided within this communication' From aef3fe76a72e4fe7107786a07e00d0b12687ab01 Mon Sep 17 00:00:00 2001 From: mohamedalani Date: Mon, 8 Mar 2021 18:14:04 +0100 Subject: [PATCH 58/58] added spanish stuff --- email_reply_parser/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 0029380..c587c58 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -105,6 +105,11 @@ def en_support(self): self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote\s*:)(On\s(.+?)wrote\s*:)' + def es_support(self): + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Enviado desde (\w+\s*){1,6})') + self.QUOTE_HDR_REGEX = re.compile('\s*El.*escribió\s*:$') + self._MULTI_QUOTE_HDR_REGEX = r'(?!El.*El\s.+?escribió\s*:)(El\s(.+?)escribió\s*:)' + def ja_support(self): self.SIG_REGEX = re.compile(r'--|__|-\w') self.QUOTE_HDR_REGEX = re.compile(