14
14
from pythainlp import thai_lead_vowels as lead_v
15
15
from pythainlp import thai_tonemarks as tonemarks
16
16
from pythainlp .tokenize import word_tokenize
17
+ from pythainlp .tools import warn_deprecation
17
18
18
19
# Character set consumed by _RE_REMOVE_DANGLINGS: the contents of above_v,
# below_v and tonemarks, followed by U+0E3A (PHINTHU), U+0E4C (THANTHAKHAT),
# U+0E4D (NIKHAHIT) and U+0E4E (YAMAKKAN).
# NOTE(review): above_v / below_v are imported outside this view; presumably
# the Thai above/below vowel sets - confirm.
_DANGLING_CHARS = f"{ above_v } { below_v } { tonemarks } \u0e3a \u0e4c \u0e4d \u0e4e "
19
20
# Matches a run of one or more characters from _DANGLING_CHARS anchored at
# the very start of the string (the "^" plus character class with "+").
_RE_REMOVE_DANGLINGS = re .compile (f"^[{ _DANGLING_CHARS } ]+" )
@@ -249,12 +250,13 @@ def normalize(text: str) -> str:
249
250
return text
250
251
251
252
252
- def maiyamok (sent : Union [str , List [str ]]) -> List [str ]:
253
+ def expand_maiyamok (sent : Union [str , List [str ]]) -> List [str ]:
253
254
"""
254
- Thai MaiYaMok
255
+ Expand Maiyamok.
256
+
257
+ Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
258
+ repetition. This function expands each Maiyamok into the word it repeats.
255
259
256
- MaiYaMok (ๆ) is the mark of duplicate word in Thai language.
257
- This function is preprocessing MaiYaMok in Thai sentence.
258
260
259
261
:param Union[str, List[str]] sent: input sentence (list or str)
260
262
:return: list of words
@@ -265,15 +267,12 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
265
267
266
268
from pythainlp.util import expand_maiyamok
267
269
268
- maiyamok("เด็กๆชอบไปโรงเรียน")
269
- # output: ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']
270
-
271
- maiyamok(["ทำไม", "คน", "ดี", " ", "ๆ", "ๆ", " ", "ถึง", "ทำ", "ไม่ได้"])
272
- # output: ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้']
270
+ expand_maiyamok("เด็กๆกิน")
271
+ # output: ['เด็ก', 'เด็ก', 'กิน']
273
272
"""
274
273
if isinstance (sent , str ):
275
274
sent = word_tokenize (sent )
276
- _list_word = []
275
+ _list_word : list [ str ] = []
277
276
i = 0
278
277
for j , text in enumerate (sent ):
279
278
if text .isspace () and "ๆ" in sent [j + 1 ]:
@@ -292,3 +291,28 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
292
291
_list_word .append (text )
293
292
i += 1
294
293
return _list_word
294
+
295
+
296
def maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Maiyamok (deprecated entry point).

    This name is kept for backward compatibility. It hands the old and
    new dotted names to ``warn_deprecation`` and then returns whatever
    :func:`expand_maiyamok` returns for the same input.

    Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
    repetition.

    :param Union[str, List[str]] sent: input sentence (list or str)
    :return: list of words
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import maiyamok

        maiyamok("เด็กๆกิน")
        # output: ['เด็ก', 'เด็ก', 'กิน']
    """
    # Report the rename first, then delegate unchanged to the new function.
    old_name = "pythainlp.util.maiyamok"
    new_name = "pythainlp.util.expand_maiyamok"
    warn_deprecation(old_name, new_name)
    return expand_maiyamok(sent)
0 commit comments