
Commit 036e985

Merge pull request #550 from PyThaiNLP/merge-syllable-subword
Deprecated syllable_tokenize #322
2 parents: 449e9b0 + 9bf1842

2 files changed: 42 additions, 1 deletion


pythainlp/tokenize/core.py

Lines changed: 24 additions & 1 deletion
@@ -4,6 +4,7 @@
 """
 import re
 from typing import Iterable, List, Union
+import warnings

 from pythainlp.tokenize import (
     DEFAULT_SENT_TOKENIZE_ENGINE,
@@ -302,6 +303,8 @@ def subword_tokenize(
         * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
         * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
         * *wangchanberta* - SentencePiece from wangchanberta model.
+        * *dict* - newmm word tokenizer with a syllable dictionary
+        * *ssg* - CRF syllable segmenter for Thai

     :Example:
@@ -346,19 +349,32 @@ def subword_tokenize(
     if not text or not isinstance(text, str):
         return []

+    segments = []
+
     if engine == "tcc":
         from pythainlp.tokenize.tcc import segment
     elif engine == "etcc":
         from pythainlp.tokenize.etcc import segment
     elif engine == "wangchanberta":
         from pythainlp.wangchanberta import segment
+    elif engine == "dict":  # use syllable dictionary
+        words = word_tokenize(text)
+        for word in words:
+            segments.extend(
+                word_tokenize(
+                    text=word, custom_dict=DEFAULT_SYLLABLE_DICT_TRIE
+                )
+            )
+    elif engine == "ssg":
+        from pythainlp.tokenize.ssg import segment
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
             It might be a typo; if not, please consult our document."""
         )

-    segments = segment(text)
+    if segments == []:
+        segments = segment(text)

     if not keep_whitespace:
         segments = [token.strip(" ") for token in segments if token.strip(" ")]
@@ -374,6 +390,8 @@ def syllable_tokenize(
     """
     Syllable tokenizer.

+    **syllable_tokenize is deprecated, use subword_tokenize instead**
+
     Tokenizes text into syllable (Thai: พยางค์), a unit of
     pronunciation having one vowel sound. For example, the word 'รถไฟ'
     contains two syallbles including 'รถ', and 'ไฟ'.
@@ -403,6 +421,11 @@
        ['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
        'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
     """
+    warnings.warn(
+        """syllable_tokenize will be deprecated in PyThaiNLP version 2.4,
+        use subword_tokenize instead""",
+        PendingDeprecationWarning
+    )

     if not text or not isinstance(text, str):
         return []
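For reference, a minimal usage sketch of the two new engines and the deprecation path introduced by this change. The expected outputs shown come from the accompanying test cases; the "ssg" engine is assumed to have its optional ssg dependency installed.

    from pythainlp.tokenize import subword_tokenize, syllable_tokenize

    # "dict": re-tokenize each word against the bundled syllable dictionary
    subword_tokenize("สวัสดีชาวโลก", engine="dict")
    # expected (per the new tests): ['สวัส', 'ดี', 'ชาว', 'โลก']

    # "ssg": CRF syllable segmenter (optional ssg dependency assumed installed)
    subword_tokenize("แมวกินปลา", engine="ssg")
    # expected (per the new tests): ['แมว', 'กิน', 'ปลา']

    # syllable_tokenize still works, but now emits a PendingDeprecationWarning
    syllable_tokenize("รถไฟ")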

tests/test_tokenize.py

Lines changed: 18 additions & 0 deletions
@@ -300,6 +300,24 @@ def test_subword_tokenize(self):
         self.assertFalse(
             " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False)
         )
+        self.assertEqual(
+            subword_tokenize("สวัสดีชาวโลก", engine="dict"), ["สวัส", "ดี", "ชาว", "โลก"]
+        )
+        self.assertFalse("า" in subword_tokenize("สวัสดีชาวโลก", engine="dict"))
+        self.assertEqual(subword_tokenize(None, engine="ssg"), [])
+        self.assertEqual(syllable_tokenize("", engine="ssg"), [])
+        self.assertEqual(
+            subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"]
+        )
+        self.assertTrue(
+            "ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
+        )
+        self.assertFalse(
+            "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
+        )
+        self.assertFalse(
+            " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False)
+        )
         with self.assertRaises(ValueError):
             subword_tokenize("นกแก้ว", engine="XX")  # engine does not exist
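Not part of this commit, but a short sketch of how the newly added PendingDeprecationWarning could be observed; such warnings are silenced by default, so the filter is relaxed explicitly here.

    import warnings

    from pythainlp.tokenize import syllable_tokenize

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")  # surface PendingDeprecationWarning too
        syllable_tokenize("รถไฟสมัยใหม่")

    # at least one recorded warning should be the pending deprecation notice
    assert any(issubclass(w.category, PendingDeprecationWarning) for w in caught)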
