4
4
"""
5
5
import re
6
6
from typing import Iterable , List , Union
7
+ import warnings
7
8
8
9
from pythainlp .tokenize import (
9
10
DEFAULT_SENT_TOKENIZE_ENGINE ,
@@ -302,6 +303,8 @@ def subword_tokenize(
302
303
* *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
303
304
* *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
304
305
* *wangchanberta* - SentencePiece from wangchanberta model.
306
+ * *dict* - newmm word tokenizer with a syllable dictionary
307
+ * *ssg* - CRF syllable segmenter for Thai
305
308
306
309
:Example:
307
310
@@ -346,19 +349,32 @@ def subword_tokenize(
346
349
if not text or not isinstance (text , str ):
347
350
return []
348
351
352
+ segments = []
353
+
349
354
if engine == "tcc" :
350
355
from pythainlp .tokenize .tcc import segment
351
356
elif engine == "etcc" :
352
357
from pythainlp .tokenize .etcc import segment
353
358
elif engine == "wangchanberta" :
354
359
from pythainlp .wangchanberta import segment
360
+ elif engine == "dict" : # use syllable dictionary
361
+ words = word_tokenize (text )
362
+ for word in words :
363
+ segments .extend (
364
+ word_tokenize (
365
+ text = word , custom_dict = DEFAULT_SYLLABLE_DICT_TRIE
366
+ )
367
+ )
368
+ elif engine == "ssg" :
369
+ from pythainlp .tokenize .ssg import segment
355
370
else :
356
371
raise ValueError (
357
372
f"""Tokenizer \" { engine } \" not found.
358
373
It might be a typo; if not, please consult our document."""
359
374
)
360
375
361
- segments = segment (text )
376
+ if segments == []:
377
+ segments = segment (text )
362
378
363
379
if not keep_whitespace :
364
380
segments = [token .strip (" " ) for token in segments if token .strip (" " )]
@@ -374,6 +390,8 @@ def syllable_tokenize(
374
390
"""
375
391
Syllable tokenizer.
376
392
393
+ **syllable_tokenize is deprecated, use subword_tokenize instead**
394
+
377
395
Tokenizes text into syllables (Thai: พยางค์), each a unit of
378
396
pronunciation having one vowel sound. For example, the word 'รถไฟ'
379
397
contains two syllables, 'รถ' and 'ไฟ'.
@@ -403,6 +421,11 @@ def syllable_tokenize(
403
421
['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
404
422
'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
405
423
"""
424
+ warnings .warn (
425
+ """syllable_tokenize will be deprecated in PyThaiNLP version 2.4,
426
+ use subword_tokenize instead""" ,
427
+ PendingDeprecationWarning
428
+ )
406
429
407
430
if not text or not isinstance (text , str ):
408
431
return []
0 commit comments