44"""
55Generic functions of tokenizers
66"""
7+
78import re
8- import warnings
99from typing import Iterable , List , Union
1010
1111from pythainlp .tokenize import (
2121 rejoin_formatted_num ,
2222 strip_whitespace ,
2323)
24+ from pythainlp .tools import warn_deprecation
2425from pythainlp .util .trie import Trie , dict_trie
2526
2627
@@ -45,13 +46,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
         # ['และ', 'คุณ', 'เล่น', 'มือถือ'],
         # ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
     """
+    warn_deprecation("pythainlp.util.clause_tokenize", "", "5.0.5", "5.1")
     from pythainlp.tokenize.crfcls import segment
 
-    warnings.warn(
-        """
-        clause_tokenize is no longer supported \
-        and will be removed in version 5.1.
-        """, DeprecationWarning)
     return segment(doc)
 
 
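(Aside, not part of the commit: the hunk above replaces the inline warnings.warn() call with the warn_deprecation() helper imported in the first hunk. A minimal sketch of what a caller now observes, assuming warn_deprecation() ultimately issues a standard DeprecationWarning, could look like this:)

    # Sketch only; assumes warn_deprecation() emits a DeprecationWarning.
    import warnings

    from pythainlp.tokenize import clause_tokenize

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        clauses = clause_tokenize(["และ", "คุณ", "เล่น", "มือถือ"])

    print(clauses)                           # clause segmentation is still returned
    print([str(w.message) for w in caught])  # deprecation notice for clause_tokenize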
@@ -71,6 +68,7 @@ def word_detokenize(
     ::
 
         from pythainlp.tokenize import word_detokenize
+
         print(word_detokenize(["เรา", "เล่น"]))
         # output: เราเล่น
     """
@@ -299,18 +297,19 @@ def word_tokenize(
         segments = segment(text)
     elif engine == "nlpo3":
         from pythainlp.tokenize.nlpo3 import segment
+
         # Currently cannot handle custom_dict from inside word_tokenize(),
         # due to difference in type.
-        #if isinstance(custom_dict, str):
+        # if isinstance(custom_dict, str):
         #     segments = segment(text, custom_dict=custom_dict)
-        #elif not isinstance(custom_dict, str) and not custom_dict:
+        # elif not isinstance(custom_dict, str) and not custom_dict:
         #     raise ValueError(
         #         f"""Tokenizer \"{engine}\":
         #         custom_dict must be a str.
         #         It is a dictionary name as assigned with load_dict().
         #         See pythainlp.tokenize.nlpo3.load_dict()"""
         #     )
-        #else:
+        # else:
         #     segments = segment(text)
         segments = segment(text)
     else:
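(Aside, not part of the commit: the commented-out block above records that word_tokenize() cannot forward custom_dict to the "nlpo3" engine, because that backend expects a dictionary name registered through pythainlp.tokenize.nlpo3.load_dict() rather than a Trie. A hedged sketch of the workaround is below; the load_dict() argument names are assumptions, not confirmed by this diff.)

    # Sketch only: using the nlpo3 backend directly when a custom dictionary
    # is needed. The load_dict() argument names below are assumed, not verified.
    from pythainlp import word_tokenize
    from pythainlp.tokenize.nlpo3 import load_dict, segment

    text = "ฉันรักภาษาไทยมาก"

    # Default path: word_tokenize() simply calls segment(text), as in the hunk above.
    print(word_tokenize(text, engine="nlpo3"))

    # Custom dictionary: register it under a name first, then call segment()
    # directly with that name, since word_tokenize() cannot pass it through.
    load_dict("my_words.txt", "my_dict")  # assumed: (file path, dictionary name)
    print(segment(text, custom_dict="my_dict"))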