def maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand mai yamok (ๆ), the Thai word-repetition mark.

    Mai yamok signals that the preceding word is duplicated. This
    function replaces each mai yamok token with a copy of the word it
    follows, tokenizing first when given a raw string.

    :param Union[str, List[str]] sent: input sentence (list or str)
    :return: list of words with mai yamok expanded
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import maiyamok

        maiyamok("เด็กๆชอบไปโรงเรียน")
        # output: ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']

        maiyamok(["ทำไม","คน","ดี"," ","ๆ","ๆ"," ","ถึง","ทำ","ไม่ได้"])
        # output: ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้']
    """
    if isinstance(sent, str):
        sent = word_tokenize(sent)
    result: List[str] = []
    for pos, word in enumerate(sent):
        # Drop whitespace sitting between a word and its mai yamok,
        # e.g. ["ดี", " ", "ๆ"] behaves like ["ดี", "ๆ"].
        # The bounds check fixes an IndexError when the final token
        # is whitespace (original indexed sent[pos + 1] unguarded).
        if (
            word.isspace()
            and pos + 1 < len(sent)
            and "ๆ" in sent[pos + 1]
        ):
            continue
        # Normalize "word ๆ" written inside a single token.
        if " ๆ" in word:
            word = word.replace(" ๆ", "ๆ")
        if word == "ๆ":
            # A bare mai yamok duplicates the previous emitted word.
            # A leading ๆ with nothing before it is dropped instead of
            # crashing (original raised IndexError on an empty list).
            if not result:
                continue
            word = result[-1]
        elif "ๆ" in word:
            # Fused form such as "ดีๆ": emit the base word twice —
            # once here and once via the shared append below.
            word = word.replace("ๆ", "")
            result.append(word)
        result.append(word)
    return result