Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion pythainlp/tokenize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""
import re
from typing import Iterable, List, Union
import warnings

from pythainlp.tokenize import (
DEFAULT_SENT_TOKENIZE_ENGINE,
Expand Down Expand Up @@ -302,6 +303,8 @@ def subword_tokenize(
* *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
* *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
* *wangchanberta* - SentencePiece from wangchanberta model.
* *dict* - newmm word tokenizer with a syllable dictionary
* *ssg* - CRF syllable segmenter for Thai

:Example:

Expand Down Expand Up @@ -346,19 +349,32 @@ def subword_tokenize(
if not text or not isinstance(text, str):
return []

segments = []

if engine == "tcc":
from pythainlp.tokenize.tcc import segment
elif engine == "etcc":
from pythainlp.tokenize.etcc import segment
elif engine == "wangchanberta":
from pythainlp.wangchanberta import segment
elif engine == "dict": # use syllable dictionary
words = word_tokenize(text)
for word in words:
segments.extend(
word_tokenize(
text=word, custom_dict=DEFAULT_SYLLABLE_DICT_TRIE
)
)
elif engine == "ssg":
from pythainlp.tokenize.ssg import segment
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
It might be a typo; if not, please consult our document."""
)

segments = segment(text)
if segments == []:
segments = segment(text)

if not keep_whitespace:
segments = [token.strip(" ") for token in segments if token.strip(" ")]
Expand All @@ -374,6 +390,8 @@ def syllable_tokenize(
"""
Syllable tokenizer.

**syllable_tokenize is deprecated, use subword_tokenize instead**

Tokenizes text into syllable (Thai: พยางค์), a unit of
pronunciation having one vowel sound. For example, the word 'รถไฟ'
contains two syllables including 'รถ', and 'ไฟ'.
Expand Down Expand Up @@ -403,6 +421,11 @@ def syllable_tokenize(
['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
"""
warnings.warn(
"""syllable_tokenize will be deprecated in PyThaiNLP version 2.4,
use subword_tokenize instead""",
PendingDeprecationWarning
)

if not text or not isinstance(text, str):
return []
Expand Down
18 changes: 18 additions & 0 deletions tests/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,24 @@ def test_subword_tokenize(self):
self.assertFalse(
" " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False)
)
self.assertEqual(
subword_tokenize("สวัสดีชาวโลก", engine="dict"), ["สวัส", "ดี", "ชาว", "โลก"]
)
self.assertFalse("า" in subword_tokenize("สวัสดีชาวโลก", engine="dict"))
self.assertEqual(subword_tokenize(None, engine="ssg"), [])
self.assertEqual(syllable_tokenize("", engine="ssg"), [])
self.assertEqual(
subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"]
)
self.assertTrue(
"ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
)
self.assertFalse(
"า" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
)
self.assertFalse(
" " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False)
)
with self.assertRaises(ValueError):
subword_tokenize("นกแก้ว", engine="XX") # engine does not exist

Expand Down