PyThaiNLP · wannaphong · Nov 5, 2021 · Oct 31, 2021 · Oct 31, 2021 · Oct 31, 2021
diff --git a/docs/api/util.rst b/docs/api/util.rst
@@ -23,6 +23,7 @@ Modules
 .. autofunction:: normalize
 .. autofunction:: now_reign_year
 .. autofunction:: num_to_thaiword
+.. autofunction:: maiyamok
 .. autofunction:: rank
 .. autofunction:: reign_year_to_ad
 .. autofunction:: remove_dangling

diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
@@ -69,6 +69,7 @@
 from pythainlp.util.normalize import (
     delete_tone,
     normalize,
+    maiyamok,
     remove_dangling,
     remove_dup_spaces,
     remove_repeat_vowels,

diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
@@ -3,13 +3,15 @@
 Text normalization
 """
 import re
+from typing import List, Union
 import warnings
 
 from pythainlp import thai_above_vowels as above_v
 from pythainlp import thai_below_vowels as below_v
 from pythainlp import thai_follow_vowels as follow_v
 from pythainlp import thai_lead_vowels as lead_v
 from pythainlp import thai_tonemarks as tonemarks
+from pythainlp.tokenize import word_tokenize
 
 
 _DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
@@ -254,3 +256,45 @@ def delete_tone(text: str) -> str:
         DeprecationWarning,
     )
     return remove_tonemark(text)
+
+
+def maiyamok(sent: Union[str, List[str]]) -> List[str]:
+    """
+    Expand Thai MaiYaMok (ๆ), the repetition mark, into repeated words.
+
+    A ``str`` input is tokenized first. Each ๆ is replaced by the preceding
+    output word; whitespace right before a token containing ๆ is dropped.
+
+    :param Union[str, List[str]] sent: input sentence (list or str)
+    :return: List of words
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.util import maiyamok
+
+        maiyamok("เด็กๆชอบไปโรงเรียน")
+        # output: ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']
+
+        maiyamok(["ทำไม","คน","ดี"," ","ๆ","ๆ"," ","ถึง","ทำ","ไม่ได้"])
+        # output: ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้']
+    """
+    if isinstance(sent, str):
+        sent = word_tokenize(sent)
+    words = []
+    for j, text in enumerate(sent):
+        # Guard the look-ahead: a trailing whitespace token has no successor.
+        if text.isspace() and j + 1 < len(sent) and "ๆ" in sent[j + 1]:
+            continue
+        text = text.replace(" ๆ", "ๆ")
+        if text == "ๆ":
+            # A leading ๆ has no word to repeat, so it is kept verbatim.
+            if words:
+                text = words[-1]
+        elif "ๆ" in text:
+            # Word with attached ๆ (e.g. "ดีๆ"): emit the bare word twice.
+            text = text.replace("ๆ", "")
+            words.append(text)
+        words.append(text)
+    return words
diff --git a/tests/test_util.py b/tests/test_util.py
@@ -28,6 +28,7 @@
     normalize,
     now_reign_year,
     num_to_thaiword,
+    maiyamok,
     rank,
     reign_year_to_ad,
     remove_dangling,
@@ -532,6 +533,57 @@ def test_normalize(self):
         self.assertEqual(remove_zw("\u200bกา"), "กา")
         self.assertEqual(remove_zw("กา\u200b\u200c\u200b"), "กา")
 
+        # maiyamok
+        self.assertEqual(
+            maiyamok("เด็กๆชอบไปโรงเรียน"),
+            ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']
+        )
+        self.assertEqual(
+            maiyamok([
+                "ทำไม",
+                "คน",
+                "ดี",
+                " ",
+                "ๆ",
+                "ๆ",
+                " ",
+                "ถึง",
+                "ทำ",
+                "ไม่ได้"
+            ]),
+            ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
+        )
+        self.assertEqual(
+            maiyamok([
+                "ทำไม",
+                "คน",
+                "ดี",
+                " ",
+                " ๆ",
+                "ๆ",
+                " ",
+                "ถึง",
+                "ทำ",
+                "ไม่ได้"
+            ]),
+            ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
+        )
+        self.assertEqual(
+            maiyamok([
+                "ทำไม",
+                "คน",
+                "ดีๆ",
+                " ",
+                "ๆ",
+                "ๆ",
+                " ",
+                "ถึง",
+                "ทำ",
+                "ไม่ได้"
+            ]),
+            ["ทำไม", "คน", "ดี", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
+        )
+
     # ### pythainlp.util.thai
 
     def test_countthai(self):