Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api/util.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Modules
.. autofunction:: normalize
.. autofunction:: now_reign_year
.. autofunction:: num_to_thaiword
.. autofunction:: maiyamok
.. autofunction:: rank
.. autofunction:: reign_year_to_ad
.. autofunction:: remove_dangling
Expand Down
1 change: 1 addition & 0 deletions pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
from pythainlp.util.normalize import (
delete_tone,
normalize,
maiyamok,
remove_dangling,
remove_dup_spaces,
remove_repeat_vowels,
Expand Down
44 changes: 44 additions & 0 deletions pythainlp/util/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
Text normalization
"""
import re
from typing import List, Union
import warnings

from pythainlp import thai_above_vowels as above_v
from pythainlp import thai_below_vowels as below_v
from pythainlp import thai_follow_vowels as follow_v
from pythainlp import thai_lead_vowels as lead_v
from pythainlp import thai_tonemarks as tonemarks
from pythainlp.tokenize import word_tokenize


_DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
Expand Down Expand Up @@ -254,3 +256,45 @@ def delete_tone(text: str) -> str:
DeprecationWarning,
)
return remove_tonemark(text)


def maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Thai MaiYaMok (ๆ) repetition marks.

    MaiYaMok (ๆ) is the mark that tells the reader to repeat the
    preceding word. This function preprocesses a sentence by replacing
    every MaiYaMok with a copy of the word it repeats.

    A string input is first split with
    :func:`pythainlp.tokenize.word_tokenize`; a list input is treated as
    already tokenized and is not modified.

    Whitespace tokens that are immediately followed by a token containing
    MaiYaMok are dropped. A MaiYaMok that has no preceding word (for
    example, at the very start of the input) is kept unchanged because
    there is nothing to repeat.

    :param Union[str, List[str]] sent: input sentence (list or str)
    :return: List of words
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import maiyamok

        maiyamok("เด็กๆชอบไปโรงเรียน")
        # output: ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']

        maiyamok(["ทำไม","คน","ดี"," ","ๆ","ๆ"," ","ถึง","ทำ","ไม่ได้"])
        # output: ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้']
    """
    if isinstance(sent, str):
        sent = word_tokenize(sent)

    words = []
    last_index = len(sent) - 1
    for j, text in enumerate(sent):
        # Skip a whitespace token that sits in front of a MaiYaMok token.
        # The bounds check keeps a trailing whitespace token from reading
        # past the end of the input.
        if text.isspace() and j < last_index and "ๆ" in sent[j + 1]:
            continue

        # Normalize " ๆ" (space + mark inside one token) to a bare mark.
        if " ๆ" in text:
            text = text.replace(" ๆ", "ๆ")

        if text == "ๆ":
            # A stand-alone mark repeats the most recently emitted word.
            # With no previous word, the mark is kept as-is.
            if words:
                text = words[-1]
        elif "ๆ" in text:
            # The mark is attached to a word (e.g. "ดีๆ"): strip it and
            # emit the word once here and once more below.
            text = text.replace("ๆ", "")
            words.append(text)

        words.append(text)

    return words
52 changes: 52 additions & 0 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
normalize,
now_reign_year,
num_to_thaiword,
maiyamok,
rank,
reign_year_to_ad,
remove_dangling,
Expand Down Expand Up @@ -532,6 +533,57 @@ def test_normalize(self):
self.assertEqual(remove_zw("\u200bกา"), "กา")
self.assertEqual(remove_zw("กา\u200b\u200c\u200b"), "กา")

# maiyamok
self.assertEqual(
maiyamok("เด็กๆชอบไปโรงเรียน"),
['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']
)
self.assertEqual(
maiyamok([
"ทำไม",
"คน",
"ดี",
" ",
"ๆ",
"ๆ",
" ",
"ถึง",
"ทำ",
"ไม่ได้"
]),
["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
)
self.assertEqual(
maiyamok([
"ทำไม",
"คน",
"ดี",
" ",
" ๆ",
"ๆ",
" ",
"ถึง",
"ทำ",
"ไม่ได้"
]),
["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
)
self.assertEqual(
maiyamok([
"ทำไม",
"คน",
"ดีๆ",
" ",
"ๆ",
"ๆ",
" ",
"ถึง",
"ทำ",
"ไม่ได้"
]),
["ทำไม", "คน", "ดี", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
)

# ### pythainlp.util.thai

def test_countthai(self):
Expand Down