Skip to content

Commit 40af25d

Browse files
authored
Merge pull request #623 from PyThaiNLP/add-clean-maiyamok
Add maiyamok
2 parents 6135ba5 + ead5b8b commit 40af25d

File tree

4 files changed

+98
-0
lines changed

4 files changed

+98
-0
lines changed

docs/api/util.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Modules
2323
.. autofunction:: normalize
2424
.. autofunction:: now_reign_year
2525
.. autofunction:: num_to_thaiword
26+
.. autofunction:: maiyamok
2627
.. autofunction:: rank
2728
.. autofunction:: reign_year_to_ad
2829
.. autofunction:: remove_dangling

pythainlp/util/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
from pythainlp.util.normalize import (
7070
delete_tone,
7171
normalize,
72+
maiyamok,
7273
remove_dangling,
7374
remove_dup_spaces,
7475
remove_repeat_vowels,

pythainlp/util/normalize.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,15 @@
33
Text normalization
44
"""
55
import re
6+
from typing import List, Union
67
import warnings
78

89
from pythainlp import thai_above_vowels as above_v
910
from pythainlp import thai_below_vowels as below_v
1011
from pythainlp import thai_follow_vowels as follow_v
1112
from pythainlp import thai_lead_vowels as lead_v
1213
from pythainlp import thai_tonemarks as tonemarks
14+
from pythainlp.tokenize import word_tokenize
1315

1416

1517
_DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
@@ -254,3 +256,45 @@ def delete_tone(text: str) -> str:
254256
DeprecationWarning,
255257
)
256258
return remove_tonemark(text)
259+
260+
261+
def maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Thai MaiYaMok (ๆ) repetition marks.

    MaiYaMok (ๆ) marks that the preceding word is repeated.
    This function replaces every mark with a copy of the word it repeats,
    so that downstream processing sees plain words only.

    Whitespace tokens standing directly before a token that contains a
    mark are dropped. A mark that has no preceding word to repeat
    (for example, at the very start of the input) is kept unchanged.

    :param Union[str, List[str]] sent: input sentence; a str is tokenized
        with ``word_tokenize`` first, a list is used as given
    :return: List of words with MaiYaMok expanded
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import maiyamok

        maiyamok("เด็กๆชอบไปโรงเรียน")
        # output: ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']

        maiyamok(["ทำไม","คน","ดี"," ","ๆ","ๆ"," ","ถึง","ทำ","ไม่ได้"])
        # output: ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้']
    """
    if isinstance(sent, str):
        sent = word_tokenize(sent)

    mark = "ๆ"
    words: List[str] = []
    last = len(sent) - 1

    for pos, token in enumerate(sent):
        # Drop whitespace that only separates a word from a following mark.
        # The bounds check keeps a trailing whitespace token from raising
        # IndexError on the look-ahead.
        if token.isspace() and pos < last and mark in sent[pos + 1]:
            continue

        if mark not in token:
            words.append(token)
            continue

        # Treat "word ๆ" / " ๆ" the same as "wordๆ" / "ๆ".
        token = token.replace(" " + mark, mark)
        repeats = token.count(mark)
        base = token.replace(mark, "")

        if base:
            # Word with attached mark(s): the word itself plus one copy
            # per mark.
            words.extend([base] * (repeats + 1))
        elif words:
            # Stand-alone mark(s): repeat the previously emitted word.
            words.extend([words[-1]] * repeats)
        else:
            # Nothing to repeat yet; keep the mark instead of failing.
            words.append(token)

    return words

tests/test_util.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
normalize,
2929
now_reign_year,
3030
num_to_thaiword,
31+
maiyamok,
3132
rank,
3233
reign_year_to_ad,
3334
remove_dangling,
@@ -532,6 +533,57 @@ def test_normalize(self):
532533
self.assertEqual(remove_zw("\u200bกา"), "กา")
533534
self.assertEqual(remove_zw("กา\u200b\u200c\u200b"), "กา")
534535

536+
# maiyamok
537+
self.assertEqual(
538+
maiyamok("เด็กๆชอบไปโรงเรียน"),
539+
['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']
540+
)
541+
self.assertEqual(
542+
maiyamok([
543+
"ทำไม",
544+
"คน",
545+
"ดี",
546+
" ",
547+
"ๆ",
548+
"ๆ",
549+
" ",
550+
"ถึง",
551+
"ทำ",
552+
"ไม่ได้"
553+
]),
554+
["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
555+
)
556+
self.assertEqual(
557+
maiyamok([
558+
"ทำไม",
559+
"คน",
560+
"ดี",
561+
" ",
562+
" ๆ",
563+
"ๆ",
564+
" ",
565+
"ถึง",
566+
"ทำ",
567+
"ไม่ได้"
568+
]),
569+
["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
570+
)
571+
self.assertEqual(
572+
maiyamok([
573+
"ทำไม",
574+
"คน",
575+
"ดีๆ",
576+
" ",
577+
"ๆ",
578+
"ๆ",
579+
" ",
580+
"ถึง",
581+
"ทำ",
582+
"ไม่ได้"
583+
]),
584+
["ทำไม", "คน", "ดี", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
585+
)
586+
535587
# ### pythainlp.util.thai
536588

537589
def test_countthai(self):

0 commit comments

Comments
 (0)