diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 3fdd66e52..b43e7915e 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -4,6 +4,7 @@ """ import re from typing import Iterable, List, Union +import warnings from pythainlp.tokenize import ( DEFAULT_SENT_TOKENIZE_ENGINE, @@ -302,6 +303,8 @@ def subword_tokenize( * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000) * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001) * *wangchanberta* - SentencePiece from wangchanberta model. + * *dict* - newmm word tokenizer with a syllable dictionary + * *ssg* - CRF syllable segmenter for Thai :Example: @@ -346,19 +349,32 @@ def subword_tokenize( if not text or not isinstance(text, str): return [] + segments = [] + if engine == "tcc": from pythainlp.tokenize.tcc import segment elif engine == "etcc": from pythainlp.tokenize.etcc import segment elif engine == "wangchanberta": from pythainlp.wangchanberta import segment + elif engine == "dict": # use syllable dictionary + words = word_tokenize(text) + for word in words: + segments.extend( + word_tokenize( + text=word, custom_dict=DEFAULT_SYLLABLE_DICT_TRIE + ) + ) + elif engine == "ssg": + from pythainlp.tokenize.ssg import segment else: raise ValueError( f"""Tokenizer \"{engine}\" not found. It might be a typo; if not, please consult our document.""" ) - segments = segment(text) + if segments == []: + segments = segment(text) if not keep_whitespace: segments = [token.strip(" ") for token in segments if token.strip(" ")] @@ -374,6 +390,8 @@ def syllable_tokenize( """ Syllable tokenizer. + **syllable_tokenize is deprecated, use subword_tokenize instead** + Tokenizes text into syllable (Thai: พยางค์), a unit of pronunciation having one vowel sound. For example, the word 'รถไฟ' contains two syallbles including 'รถ', and 'ไฟ'. @@ -403,6 +421,11 @@ def syllable_tokenize( ['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว', 'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า'] """ + warnings.warn( + """syllable_tokenize will be deprecated in PyThaiNLP version 2.4, + use subword_tokenize instead""", + PendingDeprecationWarning + ) if not text or not isinstance(text, str): return [] diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index d163238ce..398a3f322 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -300,6 +300,24 @@ def test_subword_tokenize(self): self.assertFalse( " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False) ) + self.assertEqual( + subword_tokenize("สวัสดีชาวโลก", engine="dict"), ["สวัส", "ดี", "ชาว", "โลก"] + ) + self.assertFalse("า" in subword_tokenize("สวัสดีชาวโลก", engine="dict")) + self.assertEqual(subword_tokenize(None, engine="ssg"), []) + self.assertEqual(syllable_tokenize("", engine="ssg"), []) + self.assertEqual( + subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"] + ) + self.assertTrue( + "ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg") + ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg") + ) + self.assertFalse( + " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False) + ) with self.assertRaises(ValueError): subword_tokenize("นกแก้ว", engine="XX") # engine does not exist