Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
87 commits
Select commit Hold shift + click to select a range
b919419
language support
atc0m Feb 21, 2019
2517a41
json conf
atc0m Feb 21, 2019
54ef8f5
Merge pull request #1 from atc0m/language-support
atc0m Feb 21, 2019
da6ea80
languages support path
atc0m Mar 6, 2019
d5a8581
Merge pull request #2 from atc0m/language-support
atc0m Mar 6, 2019
38da775
move json into parser module
atc0m Mar 6, 2019
ba8f818
Merge pull request #3 from atc0m/language-support
atc0m Mar 6, 2019
929e78c
dir_path
atc0m Mar 6, 2019
c05378d
Merge pull request #4 from atc0m/language-support
atc0m Mar 6, 2019
aa5b02b
os
atc0m Mar 6, 2019
44250ed
Merge pull request #5 from atc0m/language-support
atc0m Mar 6, 2019
7fdecf3
build with language support
atc0m Mar 29, 2019
d8dd270
Merge pull request #6 from atc0m/language-support
atc0m Mar 29, 2019
ffca453
load json once
atc0m Mar 29, 2019
1eb7e86
mailto
atc0m Mar 29, 2019
6569397
Merge pull request #7 from atc0m/opt
atc0m Mar 29, 2019
0d50859
dir path
atc0m Mar 29, 2019
e87cf1e
Merge pull request #8 from atc0m/opt
atc0m Mar 29, 2019
ea11d23
mailto
atc0m Mar 29, 2019
cad4130
Merge pull request #9 from atc0m/opt
atc0m Mar 29, 2019
83b2361
mailto
atc0m Mar 29, 2019
6393dfe
refactor
atc0m Mar 29, 2019
4ca4624
Merge pull request #10 from atc0m/opt
atc0m Mar 29, 2019
ba88be3
french support
atc0m May 2, 2019
6227114
fix key
atc0m May 2, 2019
8d3bf59
cordialement regex
atc0m May 2, 2019
c9b1fc7
french signatures
atc0m May 13, 2019
d1915e6
ignore empty lines
atc0m May 13, 2019
7b28cc1
correct translation
atc0m Sep 4, 2019
6fc5df3
multiple signatures
atc0m Sep 5, 2019
6cc9442
multi quote header rgx
atc0m Sep 5, 2019
2d60020
html escaped brackets
atc0m Sep 5, 2019
5c2e1ce
multi quote fix
atc0m Sep 5, 2019
80d0bf8
syntax
atc0m Sep 5, 2019
44c3b52
selected character
atc0m Sep 6, 2019
1a9dc52
rm
atc0m Sep 6, 2019
5d372b5
Merge pull request #11 from atc0m/finnish
atc0m Sep 6, 2019
15b03cd
single quote header
atc0m Sep 9, 2019
ba3e0c7
Merge pull request #12 from atc0m/finnish-update
atc0m Sep 9, 2019
ee03139
whitespace
atc0m Sep 26, 2019
a75fb66
Merge pull request #13 from atc0m/header-fix
atc0m Sep 26, 2019
4360c6c
omit sigs, fix header regex
atc0m Nov 14, 2019
9b8bf60
signature appended to hidden fragment
atc0m Nov 14, 2019
aae3d44
Merge pull request #14 from atc0m/signature-header-fix
atc0m Nov 15, 2019
e52ba6d
finish fragment after signature
atc0m Nov 15, 2019
33da801
Merge pull request #15 from atc0m/signature-header-fix
atc0m Nov 15, 2019
915f2c0
test txts
atc0m Jan 23, 2020
2851339
ignore json
atc0m Jan 23, 2020
6347b2b
warnings regex
atc0m Jan 23, 2020
667192d
warnings
atc0m Jan 23, 2020
45c38a4
Merge pull request #16 from atc0m/warnings
atc0m Jan 23, 2020
b68075f
notice, do not reply
atc0m Jan 29, 2020
35f8927
Merge pull request #17 from atc0m/caution-extended
atc0m Jan 29, 2020
e25be22
more warnings
atc0m Jan 30, 2020
27a92f2
character syntax fix
atc0m Jan 30, 2020
a746d24
warnings extended
atc0m Feb 18, 2020
dd2d657
Merge pull request #18 from atc0m/warning2
atc0m Feb 18, 2020
7f742e6
extend
atc0m Mar 5, 2020
95c8a9a
Merge pull request #19 from atc0m/legal
atc0m Mar 5, 2020
ceb175d
confidential
atc0m Mar 5, 2020
ecc0f1f
Merge pull request #20 from atc0m/legal
atc0m Mar 5, 2020
36ccc98
communication
atc0m Mar 5, 2020
5fdae32
Merge branch 'master' into legal2
atc0m Mar 5, 2020
ad948c9
Merge pull request #21 from atc0m/legal2
atc0m Mar 5, 2020
b7da5c8
extra quote
atc0m Mar 5, 2020
40b2c25
Merge pull request #22 from atc0m/legal2
atc0m Mar 5, 2020
0017f9f
more quote signs
atc0m Mar 5, 2020
dd51d4e
rm stop
atc0m Mar 5, 2020
1adc119
Merge pull request #23 from atc0m/legal2
atc0m Mar 5, 2020
be17320
chinese update
atc0m Mar 31, 2020
696b6d1
french extension
atc0m Apr 2, 2020
a0cbc45
french extension
atc0m Apr 2, 2020
62a81ca
japanese quoted header support
atc0m Apr 3, 2020
f8c51af
chinese sent from
atc0m Apr 3, 2020
55ebfbc
add follow up to quote
atc0m Jun 11, 2020
7d97bed
Merge pull request #24 from atc0m/zendesk-followup
atc0m Jun 11, 2020
f24576b
forward/multi header test
atc0m Jun 19, 2020
2d4765c
Merge pull request #25 from atc0m/multi-quote
atc0m Jun 19, 2020
4ec9dd7
confidentiality notice variations
atc0m Oct 23, 2020
8a74f85
Merge pull request #26 from atc0m/confidentail_footer
atc0m Oct 23, 2020
95365d3
styling
atc0m Oct 23, 2020
8235343
only replace follow up if it's not the first line
atc0m Oct 23, 2020
00ae300
strip whitespace
atc0m Oct 23, 2020
8ee7a5f
Merge pull request #27 from atc0m/follow-up
atc0m Oct 23, 2020
2d4beb1
webform subject fix
atc0m Dec 30, 2020
35ecf1c
Merge pull request #28 from atc0m/webform
atc0m Dec 30, 2020
aef3fe7
added spanish stuff
mohamedalani Mar 8, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ tests/.DS_Store
.DS_Store
*.egg-info
.project
env/
venv/
dist/
dist/*

*.csv
__pycache__/
*.json
177 changes: 128 additions & 49 deletions email_reply_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,80 +1,172 @@
"""
email_reply_parser is a python library port of GitHub's Email Reply Parser.

For more information, visit https://github.com/zapier/email-reply-parser
email_reply_parser is a python library port of GitHub's Email Reply Parser.
For more information, visit https://github.com/zapier/email_reply_parser
"""

import os
import re
import json


class EmailReplyParser(object):
    """ Language-aware entry point for parsing an email body.

    The per-language word table is loaded from ``languages_support.json``,
    which is looked up in the directory of this module.
    """

    def __init__(self, language='en'):
        """ Load the word table and choose the language to parse with.

        language - A language code; when it is not a key of the loaded
                   table the parser falls back to 'en'

        Raises OSError if the table cannot be opened and ValueError
        (json.JSONDecodeError) if it is not valid JSON.
        """
        # os.path.join keeps the path relative when dirname() is empty;
        # plain concatenation would yield "/languages_support.json".
        map_path = os.path.join(os.path.dirname(__file__), 'languages_support.json')
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(map_path, 'r', encoding='utf-8') as read_file:
            self.words_map = json.load(read_file)
        self.language = language if language in self.words_map else 'en'

    def read(self, text):
        """ Factory method that splits email into list of fragments

        text - A string email body

        Returns an EmailMessage instance
        """
        return EmailMessage(text, self.language, self.words_map).read()

    def parse_reply(self, text):
        """ Provides the reply portion of email.

        text - A string email body

        Returns reply body message
        """
        return self.read(text).reply


class EmailMessage(object):
""" An email message represents a parsed email body.
"""

SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})')
QUOTE_HDR_REGEX = re.compile('On.*wrote:$')
QUOTED_REGEX = re.compile(r'(>+)')
HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
_MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL)

def __init__(self, text):
def __init__(self, text, language, words_map):
self.fragments = []
self.fragment = None
self.text = text.replace('\r\n', '\n')
self.found_visible = False
self.SIG_REGEX = None
self.QUOTE_HDR_REGEX = None
self.QUOTED_REGEX = None
self.HEADER_REGEX = None
self._MULTI_QUOTE_HDR_REGEX = None
self.MULTI_QUOTE_HDR_REGEX = None
self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None
self.words_map = words_map
self.language = language
self.default_language = 'en'
self.set_regex()

def default_quoted_header(self):
    """ Install the default quote-marker and header-line patterns.

    Sets self.QUOTED_REGEX to match a run of '>' characters and builds
    self.HEADER_REGEX from the 'From', 'Sent' and 'To' entries of
    self.words_map[self.language]. The header pattern also accepts any
    line that contains 'mailto:' between other characters.

    Raises KeyError if the language or one of the three entries is missing.
    """
    self.QUOTED_REGEX = re.compile(r'(>+)')
    words = self.words_map[self.language]
    labels = '|'.join((words['From'], words['Sent'], words['To']))
    # Raw literals only: '\s' and '\*' are invalid escapes in a plain string.
    self.HEADER_REGEX = re.compile(
        r'^[* ]?(' + labels + r')\s*:\*? .+|.+(mailto:).+'
    )

def warnings(self):
    """ Compile self.WARNING_REGEX for boilerplate notices and disclaimers.

    The pattern matches one of the fixed opening phrases below, a single
    space, and then the following run of letters, digits, spaces and the
    listed punctuation.
    """
    self.WARNING_REGEX = re.compile(
        r'(CAUTION:|NOTICE:|Confidentiality Notice:|Please do not reply|This electronic mail'
        r'|Disclaimer: This message is intended'
        r'|This message and any attachments are solely'
        r'|This email contains privileged information'
        r'|The information contained|This email has been scanned|This message and any associated files'
        r'|This email and any files transmitted|This message is for the recipients'
        r'|The information provided within this communication'
        # The parentheses are literal text in the disclaimer; unescaped they
        # formed a regex group and the phrase never matched. They are kept
        # optional so the previously matched spelling still matches.
        r'|This message \(?including any attachments\)? is intended'
        r'|The [cC]ontents are confidential|This communication with its contents'
        r'|Please consider the environment before printing this email) [a-zA-Z0-9:;.,?!()@&/\'\"\“\” \-]*'
    )

def nl_support(self):
    """ Install the Dutch signature and quote-header patterns.

    Reads the 'Sent from' entry of self.words_map[self.language] and sets
    self.SIG_REGEX, self.QUOTE_HDR_REGEX and the uncompiled
    self._MULTI_QUOTE_HDR_REGEX pattern string.
    """
    sent_from = self.words_map[self.language]['Sent from']
    # Raw literals only: '\w' and '\s' are invalid escapes in a plain string.
    self.SIG_REGEX = re.compile(
        r'(--|__|-\w)|(^' + sent_from + r'(\w+\s*){1,3})'
    )
    self.QUOTE_HDR_REGEX = re.compile(r'Op.*schreef.*>:$')
    self._MULTI_QUOTE_HDR_REGEX = r'(?!Op.*Op\s.+?schreef.*>:)(Op\s(.+?)schreef.*>:)'

def de_support(self):
    """ Install the German signature and quote-header patterns.

    Reads the 'Sent from' entry of self.words_map[self.language] and sets
    self.SIG_REGEX, self.QUOTE_HDR_REGEX and the uncompiled
    self._MULTI_QUOTE_HDR_REGEX pattern string.
    """
    sent_from = self.words_map[self.language]['Sent from']
    # Raw literals only: '\w' and '\s' are invalid escapes in a plain string.
    self.SIG_REGEX = re.compile(
        r'(--|__|-\w)|(^' + sent_from + r'(\w+\s*){1,3})'
    )
    self.QUOTE_HDR_REGEX = re.compile(r'Am.*schrieb.*>:$')
    self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)'

def fr_support(self):
    """ Install the French signature and quote-header patterns.

    self.SIG_REGEX is case-insensitive and matches a separator ('--',
    '__', '-x'), a line starting with the 'Sent from' entry of
    self.words_map[self.language], or any line containing one of the
    listed closing formulas. Also sets self.QUOTE_HDR_REGEX and the
    uncompiled self._MULTI_QUOTE_HDR_REGEX pattern string.
    """
    sent_from = self.words_map[self.language]['Sent from']
    # Raw literals only: '\w' and '\s' are invalid escapes in a plain string.
    self.SIG_REGEX = re.compile(
        r'(--|__|-\w)|(^' + sent_from
        + r'(\w+\s*){1,3})|(.*(cordialement|bonne r[ée]ception|salutations'
        r'|cdlt|cdt|crdt|regards|best regard|bonne journ[ée]e))',
        re.IGNORECASE
    )
    self.QUOTE_HDR_REGEX = re.compile(r'Le.*a écrit.*[> ]:$')
    self._MULTI_QUOTE_HDR_REGEX = r'(?!Le.*Le\s.+?a écrit[a-zA-Z0-9.:;<>()&@ -]*:)(Le\s(.+?)a écrit[a-zA-Z0-9.:;<>()&@ -]*:)'

def en_support(self):
    """ Install the English signature, quote-header and quote-marker patterns.

    Sets self.SIG_REGEX, self.QUOTE_HDR_REGEX, a self.QUOTED_REGEX that
    matches both '>' and the HTML-escaped '&gt;', and the uncompiled
    self._MULTI_QUOTE_HDR_REGEX pattern string.
    """
    self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from (\w+\s*){1,6})')
    # Raw literal: '\s' is an invalid escape in a plain string.
    self.QUOTE_HDR_REGEX = re.compile(r'\s*On.*wrote\s*:$')
    self.QUOTED_REGEX = re.compile(r'(>+)|((&gt;)+)')
    self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote\s*:)(On\s(.+?)wrote\s*:)'

def es_support(self):
    """ Install the Spanish signature and quote-header patterns.

    Sets self.SIG_REGEX, self.QUOTE_HDR_REGEX and the uncompiled
    self._MULTI_QUOTE_HDR_REGEX pattern string.
    """
    self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Enviado desde (\w+\s*){1,6})')
    # Raw literal: '\s' is an invalid escape in a plain string.
    self.QUOTE_HDR_REGEX = re.compile(r'\s*El.*escribió\s*:$')
    self._MULTI_QUOTE_HDR_REGEX = r'(?!El.*El\s.+?escribió\s*:)(El\s(.+?)escribió\s*:)'

def ja_support(self):
    """ Install the Japanese signature, quote-header and quote-marker patterns.

    The quote header is recognised by a year/month/day date written with
    the 年, 月 and 日 characters, followed by header text and an optional
    trailing colon at the end of the line.
    """
    date_prefix = r'[0-9]*年[0-9]*月[0-9]*日'
    header_text = r'[\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\uFF00-\uFFEF\u4E00-\u9FAF\u2605-\u2606\u2190-\u2195\u203Ba-zA-Z0-9.:;<>()&@ -]*'
    quote_marker = '|'.join((r'(>+)', r'((&gt;)+)'))

    self.QUOTED_REGEX = re.compile(quote_marker)
    self.QUOTE_HDR_REGEX = re.compile(date_prefix + header_text + r':?$')
    self.SIG_REGEX = re.compile(r'--|__|-\w')
    # Placeholder: the English multi-line pattern is reused here. Per the
    # original author, a Japanese one does not work because BeautifulSoup
    # inserts new lines before the ":" character.
    self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote\s*:)(On\s(.+?)wrote\s*:)'

def fi_support(self):
    """ Install the Finnish signature, quote-header and quote-marker patterns.

    Sets self.SIG_REGEX, self.QUOTE_HDR_REGEX, a self.QUOTED_REGEX that
    matches both '>' and '&gt;', and the uncompiled
    self._MULTI_QUOTE_HDR_REGEX pattern string.
    """
    signature_alternatives = (
        r'(--|__|-\w)',
        r'(^Lähetetty (\w+\s*){1,3})',
        r'(^Hanki Outlook for.*)',
    )
    # Shared tail of the multi-line header: the verb, header text, ':' at end of line.
    wrote_tail = r'kirjoitti[a-zA-Z0-9.:;<>()&@ -]*:$'

    self.QUOTED_REGEX = re.compile(r'(>+)|((&gt;)+)')
    self.QUOTE_HDR_REGEX = re.compile('(.+?kirjoitti(.+?kello.+?)?:)')
    self.SIG_REGEX = re.compile('|'.join(signature_alternatives))
    self._MULTI_QUOTE_HDR_REGEX = (
        r'(?!.+?kirjoitti.+?' + wrote_tail + r')((.+?)' + wrote_tail + r')'
    )

def set_regex(self):
    """ Compile every pattern the parser uses for self.language.

    When a '<language>_support' method exists it supplies the signature,
    quote-header and multi-quote patterns; otherwise they are assembled
    from the 'Sent from' and 'wrote' entries of self.words_map. In both
    cases default_quoted_header() and warnings() are called, and the
    follow-up and compiled multi-quote patterns are set afterwards.

    Raises KeyError if a required self.words_map entry is missing.
    """
    language_support = getattr(self, self.language + '_support', None)
    if language_support is not None:
        # Defaults go first so the language hook can override them.
        # en_support, ja_support and fi_support set a QUOTED_REGEX that
        # also matches '&gt;'; running the defaults last discarded it.
        self.default_quoted_header()
        language_support()
    else:
        words = self.words_map[self.language]
        default_words = self.words_map[self.default_language]
        # Raw literals only: '\w' and '\s' are invalid escapes in a plain string.
        self.SIG_REGEX = re.compile(
            r'(--|__|-\w)|(^(' + words['Sent from']
            + '|' + default_words['Sent from']
            + r')(\w+\s*){1,3})'
        )
        self.QUOTE_HDR_REGEX = re.compile('.*' + words['wrote'] + r'\s?:$')
        self.default_quoted_header()
        # NOTE(review): the captured header still starts at the English
        # 'On' - confirm that is intended for other languages.
        self._MULTI_QUOTE_HDR_REGEX = (
            r'(?!.+?' + words['wrote']
            + r'\s*:\s*)(On\s(.+?)' + words['wrote'] + ':)'
        )
    self.warnings()
    # The lookbehind keeps a message that starts with the follow-up notice intact.
    self.FOLLOW_UP_HDR_REGEX = re.compile(r'(?<!^)This is a follow-up to your previous request.*', re.DOTALL)
    self.MULTI_QUOTE_HDR_REGEX = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
    self.MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL)

def read(self):
    """ Creates new fragment for each line
        and labels as a signature, quote, or hidden.

        The text is normalised first: a quote header wrapped over several
        lines is joined onto one line, the follow-up notice and everything
        after it is dropped, a blank line is inserted above Outlook-style
        separator lines, and warning/disclaimer sentences are replaced by
        a line break. Lines are then scanned bottom-up and blank lines
        are skipped.

        Returns EmailMessage instance
    """

    self.text = self.text.strip()
    self.found_visible = False

    multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text)
    if multi_quote_header:
        flattened_header = multi_quote_header.groups()[0].replace('\n', '')
        # A callable replacement is inserted verbatim; as a template string
        # a backslash in the header (e.g. DOMAIN\user) raised re.error.
        self.text = self.MULTI_QUOTE_HDR_REGEX.sub(lambda _match: flattened_header, self.text)

    self.text = self.FOLLOW_UP_HDR_REGEX.sub('', self.text)
    # Fix any outlook style replies, with the reply immediately above the signature boundary line
    # See email_2_2.txt for an example
    # flags must be passed by keyword: the fourth positional argument of
    # re.sub is count, which silently capped this at 8 substitutions.
    self.text = re.sub(r'([^\n])(?=\n ?[_-]{7,})', '\\1\n', self.text, flags=re.MULTILINE)

    self.text = self.WARNING_REGEX.sub('\n', self.text)
    self.lines = self.text.split('\n')
    self.lines.reverse()

    for line in self.lines:
        stripped = line.strip()
        if stripped:
            self._scan_line(stripped)

    self._finish_fragment()

    self.fragments.reverse()

    return self
Expand All @@ -85,42 +177,29 @@ def reply(self):
"""
reply = []
for f in self.fragments:
if not (f.hidden or f.quoted):
if not (f.hidden or f.quoted or f.signature):
reply.append(f.content)
return '\n'.join(reply)

def _scan_line(self, line):
""" Reviews each line in email message and determines fragment type

line - a row of text from an email message
"""
is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None
is_quoted = self.QUOTED_REGEX.match(line) is not None
is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None

if self.fragment and len(line.strip()) == 0:
if self.SIG_REGEX.match(self.fragment.lines[-1].strip()):
self.fragment.signature = True
self._finish_fragment()

if self.fragment \
if self.fragment and self.SIG_REGEX.match(line.strip()):
self.fragment.signature = True
self.fragment.lines.append(line)
self._finish_fragment()
elif self.fragment \
and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or
(self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))):

self.fragment.lines.append(line)
else:
self._finish_fragment()
self.fragment = Fragment(is_quoted, line, headers=is_header)

def quote_header(self, line):
    """ Tests the reversed line against the quote header pattern

    line - a row of the email message

    Returns True when self.QUOTE_HDR_REGEX matches at the start of the
    character-reversed line, False otherwise
    """
    reversed_line = line[::-1]
    if self.QUOTE_HDR_REGEX.match(reversed_line) is None:
        return False
    return True

def _finish_fragment(self):
""" Creates fragment
"""
Expand Down
Loading