From a024b964aacee8f8b38f598c452889654186dd29 Mon Sep 17 00:00:00 2001 From: Alexei Date: Mon, 13 Apr 2020 19:08:09 +0300 Subject: [PATCH 1/9] fix: discard trailing whitespace when analysing reply --- email_reply_parser/__init__.py | 8 +++++--- test/emails/email_1_9.txt | 9 +++++++++ test/test_email_reply_parser.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 test/emails/email_1_9.txt diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 063f65b..fd8554c 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -94,9 +94,11 @@ def _scan_line(self, line): line - a row of text from an email message """ - is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None - is_quoted = self.QUOTED_REGEX.match(line) is not None - is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None + stripped_line = line.strip() + + is_quote_header = self.QUOTE_HDR_REGEX.match(stripped_line) is not None + is_quoted = self.QUOTED_REGEX.match(stripped_line) is not None + is_header = is_quote_header or self.HEADER_REGEX.match(stripped_line) is not None if self.fragment and len(line.strip()) == 0: if self.SIG_REGEX.match(self.fragment.lines[-1].strip()): diff --git a/test/emails/email_1_9.txt b/test/emails/email_1_9.txt new file mode 100644 index 0000000..393ce0b --- /dev/null +++ b/test/emails/email_1_9.txt @@ -0,0 +1,9 @@ +Resource popular local capital doctor. Wish with think north shoulder stand catch. Decade many production food view only green. + +Believe concern floor treatment admit keep maintain put. + On Friday, April 3, 2020, 06:05:24 PM EDT, Vicki Davis wrote: + + +Example myself effect understand miss idea. Tonight work home policy arm time report. + +Against rest concern each hotel. Person care policy sea. Attack realize suggest save all everything scientist. diff --git a/test/test_email_reply_parser.py b/test/test_email_reply_parser.py index 8d2849b..b7e8765 100644 --- a/test/test_email_reply_parser.py +++ b/test/test_email_reply_parser.py @@ -90,6 +90,39 @@ def test_complex_body_with_one_fragment(self): self.assertEqual(1, len(message.fragments)) + def test_whitespace_before_header(self): + '''Header has whitespace at the beginning of the line. + + Seen in Yahoo! Mail (April 2020) with rich text reply. + ''' + + message = self.get_email('email_1_9') + + self.assertEqual( + 3, + len(message.fragments) + ) + + self.assertEqual( + [False, False, False], + [f.quoted for f in message.fragments] + ) + + self.assertEqual( + [False, False, False], + [f.signature for f in message.fragments] + ) + + self.assertEqual( + [False, True, False], + [f.headers for f in message.fragments] + ) + + self.assertEqual( + [False, True, True], + [f.hidden for f in message.fragments] + ) + def test_verify_reads_signature_correct(self): message = self.get_email('correct_sig') self.assertEqual(2, len(message.fragments)) From 2ca72849ff157e82eaf2c02774269678aac1d075 Mon Sep 17 00:00:00 2001 From: Alexei Date: Mon, 13 Apr 2020 19:08:52 +0300 Subject: [PATCH 2/9] test: unquoted quote --- test/emails/email_1_10.txt | 7 +++++++ test/test_email_reply_parser.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 test/emails/email_1_10.txt diff --git a/test/emails/email_1_10.txt b/test/emails/email_1_10.txt new file mode 100644 index 0000000..2b9ac3b --- /dev/null +++ b/test/emails/email_1_10.txt @@ -0,0 +1,7 @@ +Base tax cost environment side. May house most director treatment call heavy. +Forward professional woman institution happen. Tell girl hope to. Wrong perhaps apply anything expert main indeed. + +On Monday, April 13, 2020, 06:49:16 PM GMT+3, Paige Lee wrote: + +Thank experience bag memory hundred understand of. Environmental lose probably majority peace behind. When produce ask tough. +Institution thought system class nice instead speak. diff --git a/test/test_email_reply_parser.py b/test/test_email_reply_parser.py index b7e8765..184ca59 100644 --- a/test/test_email_reply_parser.py +++ b/test/test_email_reply_parser.py @@ -123,6 +123,39 @@ def test_whitespace_before_header(self): [f.hidden for f in message.fragments] ) + def test_quote_not_quoted(self): + '''Original email is not quoted at all. + + Seen in Yahoo! Mail (April 2020) with plain text reply. + ''' + + message = self.get_email('email_1_10') + + self.assertEqual( + 3, + len(message.fragments) + ) + + self.assertEqual( + [False, False, False], + [f.quoted for f in message.fragments] + ) + + self.assertEqual( + [False, False, False], + [f.signature for f in message.fragments] + ) + + self.assertEqual( + [False, True, False], + [f.headers for f in message.fragments] + ) + + self.assertEqual( + [False, True, True], + [f.hidden for f in message.fragments] + ) + def test_verify_reads_signature_correct(self): message = self.get_email('correct_sig') self.assertEqual(2, len(message.fragments)) From d1fe5cb7046518d90ae119ecb07e192b1227878e Mon Sep 17 00:00:00 2001 From: Alexei Date: Mon, 13 Apr 2020 20:29:28 +0300 Subject: [PATCH 3/9] fix: newline handling --- email_reply_parser/__init__.py | 2 +- test/test_email_reply_parser.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index fd8554c..20a5770 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -47,7 +47,7 @@ class EmailMessage(object): def __init__(self, text): self.fragments = [] self.fragment = None - self.text = text.replace('\r\n', '\n') + self.text = '\n'.join(text.splitlines()) self.found_visible = False def read(self): diff --git a/test/test_email_reply_parser.py b/test/test_email_reply_parser.py index 184ca59..a160065 100644 --- a/test/test_email_reply_parser.py +++ b/test/test_email_reply_parser.py @@ -232,17 +232,17 @@ def test_multiple_on(self): self.assertTrue(re.match('^On 9 Jan 2014', message.fragments[1].content)) self.assertEqual( - [False, True, False], + [False, True], [fragment.quoted for fragment in message.fragments] ) self.assertEqual( - [False, False, False], + [False, False], [fragment.signature for fragment in message.fragments] ) self.assertEqual( - [False, True, True], + [False, True], [fragment.hidden for fragment in message.fragments] ) From 8237df9bc40295d34af578d4596275554d27a973 Mon Sep 17 00:00:00 2001 From: Alexei Date: Tue, 14 Apr 2020 18:54:47 +0300 Subject: [PATCH 4/9] fix: use previously declared variable to reduce method calls --- email_reply_parser/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 20a5770..7606701 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -100,14 +100,14 @@ def _scan_line(self, line): is_quoted = self.QUOTED_REGEX.match(stripped_line) is not None is_header = is_quote_header or self.HEADER_REGEX.match(stripped_line) is not None - if self.fragment and len(line.strip()) == 0: + if self.fragment and len(stripped_line) == 0: if self.SIG_REGEX.match(self.fragment.lines[-1].strip()): self.fragment.signature = True self._finish_fragment() if self.fragment \ and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or - (self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))): + (self.fragment.quoted and (is_quote_header or len(stripped_line) == 0))): self.fragment.lines.append(line) else: From af3304d6d9871643ed6670a2a6a1f5990dba3352 Mon Sep 17 00:00:00 2001 From: Alexei Date: Tue, 14 Apr 2020 19:25:03 +0300 Subject: [PATCH 5/9] chore: bump version --- email_reply_parser/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/version.py b/email_reply_parser/version.py index eaf6e6a..e8acd09 100644 --- a/email_reply_parser/version.py +++ b/email_reply_parser/version.py @@ -1 +1 @@ -VERSION = '0.5.11' +VERSION = '0.5.12' From d3bab79e2be7b269e01c94d068da78c45b532c4f Mon Sep 17 00:00:00 2001 From: Alexei Date: Thu, 16 Apr 2020 11:11:28 +0300 Subject: [PATCH 6/9] test: check replies as well --- test/test_email_reply_parser.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/test_email_reply_parser.py b/test/test_email_reply_parser.py index a160065..cefcf6a 100644 --- a/test/test_email_reply_parser.py +++ b/test/test_email_reply_parser.py @@ -123,6 +123,14 @@ def test_whitespace_before_header(self): [f.hidden for f in message.fragments] ) + self.assertEqual( + ("Resource popular local capital doctor. " + "Wish with think north shoulder stand catch. " + "Decade many production food view only green.\n" + "\n" + "Believe concern floor treatment admit keep maintain put."), + message.reply) + def test_quote_not_quoted(self): '''Original email is not quoted at all. @@ -156,6 +164,14 @@ def test_quote_not_quoted(self): [f.hidden for f in message.fragments] ) + self.assertEqual( + ("Base tax cost environment side. " + "May house most director treatment call heavy.\n" + "Forward professional woman institution happen. " + "Tell girl hope to. " + "Wrong perhaps apply anything expert main indeed."), + message.reply) + def test_verify_reads_signature_correct(self): message = self.get_email('correct_sig') self.assertEqual(2, len(message.fragments)) From 44b5538e0543fa959cc1e23c463cedaac17bda74 Mon Sep 17 00:00:00 2001 From: Alexei Date: Thu, 16 Apr 2020 11:12:18 +0300 Subject: [PATCH 7/9] test: include test case for multiline header seen in GMail --- test/emails/email_1_11.txt | 9 ++++++++ test/test_email_reply_parser.py | 41 +++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 test/emails/email_1_11.txt diff --git a/test/emails/email_1_11.txt b/test/emails/email_1_11.txt new file mode 100644 index 0000000..11988f7 --- /dev/null +++ b/test/emails/email_1_11.txt @@ -0,0 +1,9 @@ +Admit high represent movement. +Everything car rest perform late either among. Available help threat across spring necessary. +Develop line class impact pick generation. Join day design simply. + +On Tue, Apr 14, 2020 at 6:13 PM Alexandru via Sailo +wrote: + +Music easy though onto form top run agency. Arrive senior away total help. Foot partner between store energy out. +Water stock garden just. Skill design condition after why ten executive. diff --git a/test/test_email_reply_parser.py b/test/test_email_reply_parser.py index cefcf6a..197fb06 100644 --- a/test/test_email_reply_parser.py +++ b/test/test_email_reply_parser.py @@ -172,6 +172,47 @@ def test_quote_not_quoted(self): "Wrong perhaps apply anything expert main indeed."), message.reply) + def test_header_on_multiple_lines(self): + '''Header is split into multiple lines + + Seen in GMail (April 2020); line length was 78 fwiw + ''' + + message = self.get_email('email_1_11') + + self.assertEqual( + 3, + len(message.fragments) + ) + + self.assertEqual( + [False, False, False], + [f.quoted for f in message.fragments] + ) + + self.assertEqual( + [False, False, False], + [f.signature for f in message.fragments] + ) + + self.assertEqual( + [False, True, False], + [f.headers for f in message.fragments] + ) + + self.assertEqual( + [False, True, True], + [f.hidden for f in message.fragments] + ) + + self.assertEqual( + ("Admit high represent movement.\n" + "Everything car rest perform late either among. " + "Available help threat across spring necessary.\n" + "Develop line class impact pick generation. " + "Join day design simply."), + message.reply) + def test_verify_reads_signature_correct(self): message = self.get_email('correct_sig') self.assertEqual(2, len(message.fragments)) From c97e45e28233e0b5dfd1d6555e79f89863afbd64 Mon Sep 17 00:00:00 2001 From: Alexei Date: Thu, 16 Apr 2020 11:20:40 +0300 Subject: [PATCH 8/9] fix: strip trailing spaces for quote headers only --- email_reply_parser/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 7606701..1623471 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -97,8 +97,8 @@ def _scan_line(self, line): stripped_line = line.strip() is_quote_header = self.QUOTE_HDR_REGEX.match(stripped_line) is not None - is_quoted = self.QUOTED_REGEX.match(stripped_line) is not None - is_header = is_quote_header or self.HEADER_REGEX.match(stripped_line) is not None + is_quoted = self.QUOTED_REGEX.match(line) is not None + is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None if self.fragment and len(stripped_line) == 0: if self.SIG_REGEX.match(self.fragment.lines[-1].strip()): From 6d6a0d7ab8b6fffe285fee6d44370066e151f40b Mon Sep 17 00:00:00 2001 From: Alexei Date: Thu, 16 Apr 2020 11:29:45 +0300 Subject: [PATCH 9/9] chore: bump version --- email_reply_parser/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_reply_parser/version.py b/email_reply_parser/version.py index e8acd09..3139180 100644 --- a/email_reply_parser/version.py +++ b/email_reply_parser/version.py @@ -1 +1 @@ -VERSION = '0.5.12' +VERSION = '0.5.13'