From b919419a12931f3ed46651e2910b2cf8c9bcce55 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 21 Feb 2019 12:42:47 +0000 Subject: [PATCH 1/7] language support --- email_reply_parser/__init__.py | 85 +++++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 17 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 063f65b..cb0f1bf 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -1,54 +1,105 @@ """ email_reply_parser is a python library port of GitHub's Email Reply Parser. - For more information, visit https://github.com/zapier/email-reply-parser + For more information, visit https://github.com/zapier/email_reply_parser """ import re +import json class EmailReplyParser(object): """ Represents a email message that is parsed. """ + def __init__(self, language='en'): + self.language = language - @staticmethod - def read(text): + def read(self, text): """ Factory method that splits email into list of fragments text - A string email body Returns an EmailMessage instance """ - return EmailMessage(text).read() + return EmailMessage(text, self.language).read() - @staticmethod - def parse_reply(text): + def parse_reply(self, text): """ Provides the reply portion of email. text - A string email body Returns reply body message """ - return EmailReplyParser.read(text).reply + return self.read(text).reply class EmailMessage(object): """ An email message represents a parsed email body. """ - - SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') - QUOTE_HDR_REGEX = re.compile('On.*wrote:$') - QUOTED_REGEX = re.compile(r'(>+)') - HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+') - _MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' - MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE) - MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL) - - def __init__(self, text): + def __init__(self, text, language): self.fragments = [] self.fragment = None self.text = text.replace('\r\n', '\n') self.found_visible = False + self.SIG_REGEX = None + self.QUOTE_HDR_REGEX = None + self.QUOTED_REGEX = None + self.HEADER_REGEX = None + self._MULTI_QUOTE_HDR_REGEX = None + self.MULTI_QUOTE_HDR_REGEX = None + self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None + with open("languages_support.json", "r") as read_file: + self.words_diff_languages = json.load(read_file) + self.language = language + self.set_regex() + + def default_quoted_header(self): + self.QUOTED_REGEX = re.compile(r'(>+)') + self.HEADER_REGEX = re.compile( + r'^\*?(' + self.words_diff_languages[self.language]['From'] + + '|' + self.words_diff_languages[self.language]['Sent'] + + '|' + self.words_diff_languages[self.language]['To'] + + '|' + self.words_diff_languages[self.language]['Subject'] + + '):\*? .+' + ) + + def nl_support(self): + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})') + self.QUOTE_HDR_REGEX = re.compile('Op.*schreef.*>:$') + self.default_quoted_header() + self._MULTI_QUOTE_HDR_REGEX = r'(?!Op.*Op\s.+?schreef.*>:)(Op\s(.+?)schreef.*>:)' + + def de_support(self): + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})') + self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$') + self.QUOTED_REGEX = re.compile(r'(>+)') + self.HEADER_REGEX = re.compile( + r'^\*?(' + self.words_diff_languages[self.language]['From'] + + '|' + self.words_diff_languages[self.language]['Sent'] + + '|' + self.words_diff_languages[self.language]['To'] + + '|' + self.words_diff_languages[self.language]['Subject'] + + '):\*? .+' + ) + self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)' + + def en_support(self): + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') + self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$') + self.QUOTED_REGEX = re.compile(r'(>+)') + self.HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+') + self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' + + def set_regex(self): + if hasattr(self, self.language+"_support"): + getattr(self, self.language+"_support")() + else: + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})') + self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_diff_languages[self.language]['wrote'] + ':$') + self.default_quoted_header() + self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_diff_languages[self.language]['wrote'] + \ + ':)(On\s(.+?)' + self.words_diff_languages[self.language]['wrote'] + ':)' + self.MULTI_QUOTE_HDR_REGEX = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE) + self.MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL) def read(self): """ Creates new fragment for each line From 2517a4166f4de5d01a4e193dc558174c6b42fc26 Mon Sep 17 00:00:00 2001 From: atc0m Date: Thu, 21 Feb 2019 15:07:49 +0000 Subject: [PATCH 2/7] json conf --- support/languages_support.json | 162 +++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 support/languages_support.json diff --git a/support/languages_support.json b/support/languages_support.json new file mode 100644 index 0000000..aeedb2a --- /dev/null +++ b/support/languages_support.json @@ -0,0 +1,162 @@ +{ + "vi": { + "Sent from": "\u0110\u01b0\u1ee3c g\u1eedi t\u1eeb", + "From": "T\u1eeb", + "To": "\u0110\u1ebfn", + "wrote": "\u0111\u00e3 vi\u1ebft", + "Sent": "G\u1edfi", + "Subject": "M\u00f4n h\u1ecdc" + }, + "ru": { + "Sent from": "\u041e\u0442\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043e \u0438\u0437", + "From": "\u041e\u0442", + "To": "\u043a", + "wrote": "\u043f\u0438\u0441\u0430\u043b", + "Sent": "\u041e\u0442\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043e", + "Subject": "\u041f\u0440\u0435\u0434\u043c\u0435\u0442" + }, + "fr": { + "Sent from": "Envoy\u00e9 depuis", + "From": "De", + "To": "\u00c0", + "wrote": "a \u00e9crit", + "Sent": "Envoy\u00e9", + "Subject": "Objet" + }, + "en": { + "Sent from": "Sent from", + "From": "From", + "To": "To", + "wrote": "wrote", + "Sent": "Sent", + "Subject": "Subject" + }, + "nl": { + "Sent from": "Verzonden met", + "From": "Van", + "To": "Aan", + "wrote": "schreef", + "Sent": "Verzonden", + "Subject": "Onderwerp" + }, + "pt": { + "Sent from": "Enviado de", + "From": "De", + "To": "Para", + "wrote": "escrevi", + "Sent": "Enviei", + "Subject": "Sujeito" + }, + "ko": { + "Sent from": "\ubd80\ud130 \ubcf4\ub0b4\uc9c4", + "From": "\uc5d0\uc11c", + "To": "\uc5d0", + "wrote": "\uc4f4", + "Sent": "\uc804\uc1a1 \ub428", + "Subject": "\uc81c\ubaa9" + }, + "de": { + "Sent from": "Gesendet von", + "From": "Von", + "To": "An", + "wrote": "schrieb", + "Sent": "geschickt", + "Subject": "Betreff" + }, + "tr": { + "Sent from": "Den g\u00f6nderildi", + "From": "itibaren", + "To": "i\u00e7in", + "wrote": "yazd\u0131", + "Sent": "G\u00f6nderilen", + "Subject": "konu" + }, + "it": { + "Sent from": "Inviato da", + "From": "Da", + "To": "A", + "wrote": "ha scritto", + "Sent": "Inviato", + "Subject": "Oggetto" + }, + "id": { + "Sent from": "Dikirim dari", + "From": "Dari", + "To": "Untuk", + "wrote": "menulis", + "Sent": "Terkirim", + "Subject": "Subyek" + }, + "sk": { + "Sent from": "Odoslan\u00e9 od", + "From": "z", + "To": "na", + "wrote": "nap\u00edsal", + "Sent": "odoslan\u00e9", + "Subject": "predmet" + }, + "ar": { + "Sent from": "\u0627\u0631\u0633\u0644\u062a \u0645\u0646", + "From": "\u0645\u0646 \u0639\u0646\u062f", + "To": "\u0625\u0644\u0649", + "wrote": "\u0643\u062a\u0628", + "Sent": "\u0623\u0631\u0633\u0644\u062a", + "Subject": "\u0645\u0648\u0636\u0648\u0639" + }, + "es": { + "Sent from": "Enviado desde", + "From": "De", + "To": "Para", + "wrote": "escribi\u00f3", + "Sent": "Expedido", + "Subject": "Asunto" + }, + "th": { + "Sent from": "\u0e2a\u0e48\u0e07\u0e08\u0e32\u0e01", + "From": "\u0e08\u0e32\u0e01", + "To": "\u0e44\u0e1b\u0e22\u0e31\u0e07", + "wrote": "\u0e40\u0e02\u0e35\u0e22\u0e19", + "Sent": "\u0e2a\u0e48\u0e07", + "Subject": "\u0e40\u0e23\u0e37\u0e48\u0e2d\u0e07" + }, + "fi": { + "Sent from": "L\u00e4hetetty", + "From": "alkaen", + "To": "jotta", + "wrote": "kirjoitti", + "Sent": "L\u00e4hetetyt", + "Subject": "aihe" + }, + "zh": { + "Sent from": "\u6765\u81ea", + "From": "\u4ece", + "To": "\u81f3", + "wrote": "\u5199", + "Sent": "\u53d1\u9001", + "Subject": "\u5b66\u79d1" + }, + "ja": { + "Sent from": "\u9001\u4fe1\u5143", + "From": "\u304b\u3089", + "To": "\u306b", + "wrote": "\u66f8\u304d\u307e\u3057\u305f", + "Sent": "\u9001\u4fe1\u6e08\u307f", + "Subject": "\u4ef6\u540d" + }, + "pl": { + "Sent from": "Wys\u0142ane z", + "From": "Z", + "To": "Do", + "wrote": "napisa\u0142", + "Sent": "Wys\u0142ane", + "Subject": "Przedmiot" + }, + "he": { + "Sent from": "\u05e0\u05e9\u05dc\u05d7 \u05de", + "From": "\u05de", + "To": "\u05dc", + "wrote": "\u05db\u05ea\u05d1\u05ea\u05d9", + "Sent": "\u05e0\u05e9\u05dc\u05d7", + "Subject": "\u05e0\u05d5\u05e9\u05d0" + } +} \ No newline at end of file From da6ea80619004f514d7b764260b34a4f875a9880 Mon Sep 17 00:00:00 2001 From: atc0m Date: Wed, 6 Mar 2019 12:19:17 +0000 Subject: [PATCH 3/7] languages support path --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index cb0f1bf..745c5f4 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -48,7 +48,7 @@ def __init__(self, text, language): self._MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None - with open("languages_support.json", "r") as read_file: + with open("../support/languages_support.json", "r") as read_file: self.words_diff_languages = json.load(read_file) self.language = language self.set_regex() From 38da7751d454781e5ef220b58a7878fe53f89209 Mon Sep 17 00:00:00 2001 From: atc0m Date: Wed, 6 Mar 2019 12:26:14 +0000 Subject: [PATCH 4/7] move json into parser module --- email_reply_parser/__init__.py | 2 +- {support => email_reply_parser}/languages_support.json | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename {support => email_reply_parser}/languages_support.json (100%) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 745c5f4..cb0f1bf 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -48,7 +48,7 @@ def __init__(self, text, language): self._MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None - with open("../support/languages_support.json", "r") as read_file: + with open("languages_support.json", "r") as read_file: self.words_diff_languages = json.load(read_file) self.language = language self.set_regex() diff --git a/support/languages_support.json b/email_reply_parser/languages_support.json similarity index 100% rename from support/languages_support.json rename to email_reply_parser/languages_support.json From 929e78ccf6a306dd6035dc96245948da06853c20 Mon Sep 17 00:00:00 2001 From: atc0m Date: Wed, 6 Mar 2019 12:30:19 +0000 Subject: [PATCH 5/7] dir_path --- email_reply_parser/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index cb0f1bf..a15a31e 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -48,7 +48,8 @@ def __init__(self, text, language): self._MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX = None self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None - with open("languages_support.json", "r") as read_file: + dir_path = os.path.dirname(__file__) + with open(dir_path + "/languages_support.json", "r") as read_file: self.words_diff_languages = json.load(read_file) self.language = language self.set_regex() From aa5b02bdbd0365c714f0364c9c37aae7b2bd3d94 Mon Sep 17 00:00:00 2001 From: atc0m Date: Wed, 6 Mar 2019 12:31:39 +0000 Subject: [PATCH 6/7] os --- email_reply_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index a15a31e..9e50417 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -3,7 +3,7 @@ For more information, visit https://github.com/zapier/email_reply_parser """ - +import os import re import json From 7fdecf3227897fa1c665c2581cece17c14c832ba Mon Sep 17 00:00:00 2001 From: atc0m Date: Fri, 29 Mar 2019 11:32:19 +0000 Subject: [PATCH 7/7] build with language support --- setup.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 5d3078e..6f25115 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,10 @@ version=version.VERSION, description='Email reply parser', packages=['email_reply_parser'], - package_data={'email_reply_parser': ['../VERSION']}, + package_data={ + 'email_reply_parser': ['../VERSION'], + '': ['./languages_support.json'] + }, author='Royce Haynes', author_email='royce.haynes@gmail.com', url='https://github.com/zapier/email-reply-parser', @@ -32,4 +35,4 @@ "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", ] -) \ No newline at end of file +)