From 75c394520cf33e28ed4f8999467847d2fdd308ca Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sun, 31 Oct 2021 12:00:14 +0700
Subject: [PATCH 1/6] Add maiyamok

---
 docs/api/util.rst           |  1 +
 pythainlp/util/__init__.py  |  1 +
 pythainlp/util/normalize.py | 62 +++++++++++++++++++++++++++++++++++++
 tests/test_util.py          | 11 +++++++
 4 files changed, 75 insertions(+)

diff --git a/docs/api/util.rst b/docs/api/util.rst
index ec70e2463..56ed67e93 100644
--- a/docs/api/util.rst
+++ b/docs/api/util.rst
@@ -23,6 +23,7 @@ Modules
 .. autofunction:: normalize
 .. autofunction:: now_reign_year
 .. autofunction:: num_to_thaiword
+.. autofunction:: maiyamok
 .. autofunction:: rank
 .. autofunction:: reign_year_to_ad
 .. autofunction:: remove_dangling
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index d07cffb5b..c3ae06a91 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -69,6 +69,7 @@
 from pythainlp.util.normalize import (
     delete_tone,
     normalize,
+    maiyamok,
     remove_dangling,
     remove_dup_spaces,
     remove_repeat_vowels,
diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index d2492c50f..d47b55042 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -3,6 +3,7 @@
 Text normalization
 """
 import re
+from typing import List, Union
 import warnings
 
 from pythainlp import thai_above_vowels as above_v
@@ -10,6 +11,7 @@
 from pythainlp import thai_follow_vowels as follow_v
 from pythainlp import thai_lead_vowels as lead_v
 from pythainlp import thai_tonemarks as tonemarks
+from pythainlp.tokenize import word_tokenize
 
 
 _DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
@@ -45,6 +47,24 @@
 
 _RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*")
 
+_list_phrase="""ไฟไหม้
+ในแต่ละวัน
+ในชั่วพริบตา
+เวรกรรม
+กรรมเวร
+วันหนึ่ง
+อ่านหนังสือ
+กินข้าว
+ดีแต่พูด
+กล้วยไม้ป่า
+ออกดอกสะพรั่ง
+สนุกสนาน
+ร่ำรวย
+ก้องกังวาน
+ทำมาหากิน
+มากมาย""".splitlines()
+_maiyamok_rule="|".join(_list_phrase)
+
 
 def _last_char(matchobj):  # to be used with _RE_NOREPEAT_TONEMARKS
     return matchobj.group(0)[-1]
@@ -254,3 +274,45 @@ def delete_tone(text: str) -> str:
         DeprecationWarning,
     )
     return remove_tonemark(text)
+
+
+def maiyamok(sent: Union[str, List[str]]) -> List[str]:
+    """
+    Thai MaiYaMok
+
+    MaiYaMok (ๆ) is the Thai repetition mark: it repeats the preceding word.
+    This function expands MaiYaMok in a Thai sentence into repeated words.
+
+    :param Union[str, List[str]] sent: input sentence (list or str)
+    :return: List of words
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.util import maiyamok
+
+        maiyamok("เด็กๆชอบไปโรงเรียน")
+        # output: ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']
+
+        maiyamok(["ทำไม","คน","ดี"," ","ๆ","ๆ"," ","ถึง","ทำ","ไม่ได้"])
+        # output: ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้']
+    """
+    if isinstance(sent, str):
+        sent = word_tokenize(sent)
+    _list_word = []
+    i=0
+    for j,text in enumerate(sent):
+        if text == " " and sent[j+1] == "ๆ":
+            continue
+        if " ๆ" in text:
+            text = text.replace(" ๆ", "ๆ")
+        if "ๆ" == text:
+            text = _list_word[i-1]
+        elif "ๆ" in text:
+            text = text.replace("ๆ", "")
+            _list_word.append(text)
+            i += 1
+        _list_word.append(text)
+        i += 1
+    return _list_word
diff --git a/tests/test_util.py b/tests/test_util.py
index 0bde5da8c..ae9a5fde4 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -28,6 +28,7 @@
     normalize,
     now_reign_year,
     num_to_thaiword,
+    maiyamok,
     rank,
     reign_year_to_ad,
     remove_dangling,
@@ -532,6 +533,16 @@ def test_normalize(self):
         self.assertEqual(remove_zw("\u200bกา"), "กา")
         self.assertEqual(remove_zw("กา\u200b\u200c\u200b"), "กา")
 
+        # maiyamok
+        self.assertEqual(
+            maiyamok("เด็กๆชอบไปโรงเรียน"),
+            ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']
+        )
+        self.assertEqual(
+            maiyamok(["ทำไม","คน","ดี"," ","ๆ","ๆ"," ","ถึง","ทำ","ไม่ได้"]),
+            ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้']
+        )
+
     # ### pythainlp.util.thai
 
     def test_countthai(self):

From 48a7c6608d399617f2a9cf4c3a59653e36b5e45f Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sun, 31 Oct 2021 12:03:34 +0700
Subject: [PATCH 2/6] Wrap long maiyamok test input across lines

---
 tests/test_util.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/tests/test_util.py b/tests/test_util.py
index ae9a5fde4..67a7fc01c 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -539,8 +539,19 @@ def test_normalize(self):
             ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']
         )
         self.assertEqual(
-            maiyamok(["ทำไม","คน","ดี"," ","ๆ","ๆ"," ","ถึง","ทำ","ไม่ได้"]),
-            ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้']
+            maiyamok([
+                "ทำไม",
+                "คน",
+                "ดี",
+                " ",
+                "ๆ",
+                "ๆ",
+                " ",
+                "ถึง",
+                "ทำ",
+                "ไม่ได้"
+            ]),
+            ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
         )
 
     # ### pythainlp.util.thai

From 65e01e756d266a06a0b104b354e46ca02c4cdfbe Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sun, 31 Oct 2021 12:04:20 +0700
Subject: [PATCH 3/6] Fix PEP 8 spacing in normalize.py

---
 pythainlp/util/normalize.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index d47b55042..bfafe6aad 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -47,7 +47,7 @@
 
 _RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*")
 
-_list_phrase="""ไฟไหม้
+_list_phrase = """ไฟไหม้
 ในแต่ละวัน
 ในชั่วพริบตา
 เวรกรรม
@@ -63,7 +63,7 @@
 ก้องกังวาน
 ทำมาหากิน
 มากมาย""".splitlines()
-_maiyamok_rule="|".join(_list_phrase)
+_maiyamok_rule = "|".join(_list_phrase)
 
 
 def _last_char(matchobj):  # to be used with _RE_NOREPEAT_TONEMARKS
@@ -301,8 +301,8 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
     if isinstance(sent, str):
         sent = word_tokenize(sent)
     _list_word = []
-    i=0
-    for j,text in enumerate(sent):
+    i = 0
+    for j, text in enumerate(sent):
         if text == " " and sent[j+1] == "ๆ":
             continue
         if " ๆ" in text:

From c72fe2dc7bc26d82cbc1bbcd015ba2b8829220ad Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sun, 31 Oct 2021 13:52:29 +0700
Subject: [PATCH 4/6] Add maiyamok tests for tokens with embedded MaiYaMok

---
 tests/test_util.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/tests/test_util.py b/tests/test_util.py
index 67a7fc01c..7bdf2ae47 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -553,6 +553,36 @@ def test_normalize(self):
             ]),
             ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
         )
+        self.assertEqual(
+            maiyamok([
+                "ทำไม",
+                "คน",
+                "ดี",
+                " ",
+                " ๆ",
+                "ๆ",
+                " ",
+                "ถึง",
+                "ทำ",
+                "ไม่ได้"
+            ]),
+            ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
+        )
+        self.assertEqual(
+            maiyamok([
+                "ทำไม",
+                "คน",
+                "ดีๆ",
+                " ",
+                "ๆ",
+                "ๆ",
+                " ",
+                "ถึง",
+                "ทำ",
+                "ไม่ได้"
+            ]),
+            ["ทำไม", "คน", "ดี", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
+        )
 
     # ### pythainlp.util.thai
 

From 8625cc9cc9158b50308c936140e14eea111dc70c Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sun, 31 Oct 2021 13:54:12 +0700
Subject: [PATCH 5/6] Remove _list_phrase and _maiyamok_rule from normalize.py

---
 pythainlp/util/normalize.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index bfafe6aad..4c99e3f16 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -47,24 +47,6 @@
 
 _RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*")
 
-_list_phrase = """ไฟไหม้
-ในแต่ละวัน
-ในชั่วพริบตา
-เวรกรรม
-กรรมเวร
-วันหนึ่ง
-อ่านหนังสือ
-กินข้าว
-ดีแต่พูด
-กล้วยไม้ป่า
-ออกดอกสะพรั่ง
-สนุกสนาน
-ร่ำรวย
-ก้องกังวาน
-ทำมาหากิน
-มากมาย""".splitlines()
-_maiyamok_rule = "|".join(_list_phrase)
-
 
 def _last_char(matchobj):  # to be used with _RE_NOREPEAT_TONEMARKS
     return matchobj.group(0)[-1]

From ead5b8b71fdbe9dce061b1cb7e862561155d6147 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sun, 31 Oct 2021 18:00:40 +0700
Subject: [PATCH 6/6] Fix maiyamok whitespace check before MaiYaMok

---
 pythainlp/util/normalize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index 4c99e3f16..c555c12aa 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -285,7 +285,7 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
     _list_word = []
     i = 0
     for j, text in enumerate(sent):
-        if text == " " and sent[j+1] == "ๆ":
+        if text.isspace() and j + 1 < len(sent) and "ๆ" in sent[j+1]:
             continue
         if " ๆ" in text:
             text = text.replace(" ๆ", "ๆ")