From 75c394520cf33e28ed4f8999467847d2fdd308ca Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 Oct 2021 12:00:14 +0700 Subject: [PATCH 1/6] Add maiyamok --- docs/api/util.rst | 1 + pythainlp/util/__init__.py | 1 + pythainlp/util/normalize.py | 62 +++++++++++++++++++++++++++++++++++++ tests/test_util.py | 11 +++++++ 4 files changed, 75 insertions(+) diff --git a/docs/api/util.rst b/docs/api/util.rst index ec70e2463..56ed67e93 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -23,6 +23,7 @@ Modules .. autofunction:: normalize .. autofunction:: now_reign_year .. autofunction:: num_to_thaiword +.. autofunction:: maiyamok .. autofunction:: rank .. autofunction:: reign_year_to_ad .. autofunction:: remove_dangling diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index d07cffb5b..c3ae06a91 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -69,6 +69,7 @@ from pythainlp.util.normalize import ( delete_tone, normalize, + maiyamok, remove_dangling, remove_dup_spaces, remove_repeat_vowels, diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index d2492c50f..d47b55042 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -3,6 +3,7 @@ Text normalization """ import re +from typing import List, Union import warnings from pythainlp import thai_above_vowels as above_v @@ -10,6 +11,7 @@ from pythainlp import thai_follow_vowels as follow_v from pythainlp import thai_lead_vowels as lead_v from pythainlp import thai_tonemarks as tonemarks +from pythainlp.tokenize import word_tokenize _DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e" @@ -45,6 +47,24 @@ _RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*") +_list_phrase="""ไฟไหม้ +ในแต่ละวัน +ในชั่วพริบตา +เวรกรรม +กรรมเวร +วันหนึ่ง +อ่านหนังสือ +กินข้าว +ดีแต่พูด +กล้วยไม้ป่า +ออกดอกสะพรั่ง +สนุกสนาน +ร่ำรวย +ก้องกังวาน +ทำมาหากิน +มากมาย""".splitlines() +_maiyamok_rule="|".join(_list_phrase) + def 
_last_char(matchobj): # to be used with _RE_NOREPEAT_TONEMARKS return matchobj.group(0)[-1] @@ -254,3 +274,45 @@ def delete_tone(text: str) -> str: DeprecationWarning, ) return remove_tonemark(text) + + +def maiyamok(sent: Union[str, List[str]]) -> List[str]: + """ + Thai MaiYaMok + + MaiYaMok (ๆ) is the Thai repetition mark: it indicates that the preceding word is repeated. + This function expands each MaiYaMok in a Thai sentence into the word it repeats. + + :param Union[str, List[str]] sent: input sentence (list or str) + :return: List of words + :rtype: List[str] + + :Example: + :: + + from pythainlp.util import maiyamok + + maiyamok("เด็กๆชอบไปโรงเรียน") + # output: ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน'] + + maiyamok(["ทำไม","คน","ดี"," ","ๆ","ๆ"," ","ถึง","ทำ","ไม่ได้"]) + # output: ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้'] + """ + if isinstance(sent, str): + sent = word_tokenize(sent) + _list_word = [] + i=0 + for j,text in enumerate(sent): + if text == " " and sent[j+1] == "ๆ": + continue + if " ๆ" in text: + text = text.replace(" ๆ", "ๆ") + if "ๆ" == text: + text = _list_word[i-1] + elif "ๆ" in text: + text = text.replace("ๆ", "") + _list_word.append(text) + i += 1 + _list_word.append(text) + i += 1 + return _list_word diff --git a/tests/test_util.py b/tests/test_util.py index 0bde5da8c..ae9a5fde4 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -28,6 +28,7 @@ normalize, now_reign_year, num_to_thaiword, + maiyamok, rank, reign_year_to_ad, remove_dangling, @@ -532,6 +533,16 @@ def test_normalize(self): self.assertEqual(remove_zw("\u200bกา"), "กา") self.assertEqual(remove_zw("กา\u200b\u200c\u200b"), "กา") + # maiyamok + self.assertEqual( + maiyamok("เด็กๆชอบไปโรงเรียน"), + ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน'] + ) + self.assertEqual( + maiyamok(["ทำไม","คน","ดี"," ","ๆ","ๆ"," ","ถึง","ทำ","ไม่ได้"]), + ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้'] + ) + # ### pythainlp.util.thai def test_countthai(self): From 48a7c6608d399617f2a9cf4c3a59653e36b5e45f Mon Sep 17
00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 Oct 2021 12:03:34 +0700 Subject: [PATCH 2/6] Update test_util.py --- tests/test_util.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/test_util.py b/tests/test_util.py index ae9a5fde4..67a7fc01c 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -539,8 +539,19 @@ def test_normalize(self): ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน'] ) self.assertEqual( - maiyamok(["ทำไม","คน","ดี"," ","ๆ","ๆ"," ","ถึง","ทำ","ไม่ได้"]), - ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้'] + maiyamok([ + "ทำไม", + "คน", + "ดี", + " ", + "ๆ", + "ๆ", + " ", + "ถึง", + "ทำ", + "ไม่ได้" + ]), + ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"] ) # ### pythainlp.util.thai From 65e01e756d266a06a0b104b354e46ca02c4cdfbe Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 Oct 2021 12:04:20 +0700 Subject: [PATCH 3/6] Update normalize.py --- pythainlp/util/normalize.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index d47b55042..bfafe6aad 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -47,7 +47,7 @@ _RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*") -_list_phrase="""ไฟไหม้ +_list_phrase = """ไฟไหม้ ในแต่ละวัน ในชั่วพริบตา เวรกรรม @@ -63,7 +63,7 @@ ก้องกังวาน ทำมาหากิน มากมาย""".splitlines() -_maiyamok_rule="|".join(_list_phrase) +_maiyamok_rule = "|".join(_list_phrase) def _last_char(matchobj): # to be used with _RE_NOREPEAT_TONEMARKS @@ -301,8 +301,8 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]: if isinstance(sent, str): sent = word_tokenize(sent) _list_word = [] - i=0 - for j,text in enumerate(sent): + i = 0 + for j, text in enumerate(sent): if text == " " and sent[j+1] == "ๆ": continue if " ๆ" in text: From c72fe2dc7bc26d82cbc1bbcd015ba2b8829220ad Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: 
Sun, 31 Oct 2021 13:52:29 +0700 Subject: [PATCH 4/6] Update test_util.py --- tests/test_util.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/test_util.py b/tests/test_util.py index 67a7fc01c..7bdf2ae47 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -553,6 +553,36 @@ def test_normalize(self): ]), ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"] ) + self.assertEqual( + maiyamok([ + "ทำไม", + "คน", + "ดี", + " ", + " ๆ", + "ๆ", + " ", + "ถึง", + "ทำ", + "ไม่ได้" + ]), + ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"] + ) + self.assertEqual( + maiyamok([ + "ทำไม", + "คน", + "ดีๆ", + " ", + "ๆ", + "ๆ", + " ", + "ถึง", + "ทำ", + "ไม่ได้" + ]), + ["ทำไม", "คน", "ดี", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"] + ) # ### pythainlp.util.thai From 8625cc9cc9158b50308c936140e14eea111dc70c Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 Oct 2021 13:54:12 +0700 Subject: [PATCH 5/6] Update normalize.py --- pythainlp/util/normalize.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index bfafe6aad..4c99e3f16 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -47,24 +47,6 @@ _RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*") -_list_phrase = """ไฟไหม้ -ในแต่ละวัน -ในชั่วพริบตา -เวรกรรม -กรรมเวร -วันหนึ่ง -อ่านหนังสือ -กินข้าว -ดีแต่พูด -กล้วยไม้ป่า -ออกดอกสะพรั่ง -สนุกสนาน -ร่ำรวย -ก้องกังวาน -ทำมาหากิน -มากมาย""".splitlines() -_maiyamok_rule = "|".join(_list_phrase) - def _last_char(matchobj): # to be used with _RE_NOREPEAT_TONEMARKS return matchobj.group(0)[-1] From ead5b8b71fdbe9dce061b1cb7e862561155d6147 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 Oct 2021 18:00:40 +0700 Subject: [PATCH 6/6] Fixed maiyamok bug --- pythainlp/util/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/util/normalize.py 
b/pythainlp/util/normalize.py index 4c99e3f16..c555c12aa 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -285,7 +285,7 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]: _list_word = [] i = 0 for j, text in enumerate(sent): - if text == " " and sent[j+1] == "ๆ": + if text.isspace() and "ๆ" in sent[j+1]: continue if " ๆ" in text: text = text.replace(" ๆ", "ๆ")