    DEFAULT_WORD_DICT_TRIE,
    DEFAULT_WORD_TOKENIZE_ENGINE,
)
+from pythainlp.tokenize._utils import (
+    apply_postprocessors,
+    rejoin_formatted_num,
+    strip_whitespace,
+)
from pythainlp.util.trie import Trie, dict_trie


@@ -47,7 +52,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
    return segment(doc)


-def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "str") -> Union[str, List[str]]:
+def word_detokenize(
+    segments: Union[List[List[str]], List[str]], output: str = "str"
+) -> Union[str, List[str]]:
    """
    Word detokenizer.

@@ -62,6 +69,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
    if isinstance(segments[0], str):
        segments = [segments]
    from pythainlp import thai_characters
+
    for i, s in enumerate(segments):
        _list_sents = []
        _add_index = []
@@ -70,7 +78,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
        for j, w in enumerate(s):
            if j > 0:
                # previous word
-                p_w = s[j-1]
+                p_w = s[j - 1]
                # if w is number or other language and not be space
                if (
                    w[0] not in thai_characters
@@ -88,9 +96,9 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
                    if not p_w.isspace():
                        _list_sents.append(" ")
                    _mark_index.append(j)
-                elif w.isspace() and j-1 not in _space_index:
+                elif w.isspace() and j - 1 not in _space_index:
                    _space_index.append(j)
-                elif j-1 in _mark_index:
+                elif j - 1 in _mark_index:
                    _list_sents.append(" ")
            _list_sents.append(w)
        _list_all.append(_list_sents)
@@ -103,14 +111,15 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
        for j in i:
            _temp += j
        _text.append(_temp)
-    return ' '.join(_text)
+    return " ".join(_text)


def word_tokenize(
    text: str,
    custom_dict: Trie = None,
    engine: str = DEFAULT_WORD_TOKENIZE_ENGINE,
    keep_whitespace: bool = True,
+    join_broken_num: bool = True,
) -> List[str]:
    """
    Word tokenizer.
@@ -123,37 +132,47 @@ def word_tokenize(
    :param bool keep_whitespace: True to keep whitespaces, a common mark
                                 for end of phrase in Thai.
                                 Otherwise, whitespaces are omitted.
+    :param bool join_broken_num: True to rejoin formatted numeric that could be wrongly separated.
+                                 Otherwise, formatted numeric could be wrongly separated.
+
    :return: list of words
    :rtype: List[str]
    **Options for engine**
-        * *newmm* (default) - dictionary-based, Maximum Matching +
-          Thai Character Cluster
-        * *newmm-safe* - newmm, with a mechanism to help avoid long
-          processing time for text with continuous ambiguous breaking points
-        * *mm* or *multi_cut* - dictionary-based, Maximum Matching.
-        * *nlpo3* - Python binding for nlpO3. It is newmm engine in Rust.
-        * *longest* - dictionary-based, Longest Matching
-        * *icu* - wrapper for ICU (International Components for Unicode,
-          using PyICU), dictionary-based
        * *attacut* - wrapper for
          `AttaCut <https://github.com/PyThaiNLP/attacut>`_.,
          learning-based approach
        * *deepcut* - wrapper for
          `DeepCut <https://github.com/rkcosmos/deepcut>`_,
          learning-based approach
-        * *nercut* - Dictionary-based maximal matching word segmentation,
+        * *icu* - wrapper for a word tokenizer in
+          `PyICU <https://gitlab.pyicu.org/main/pyicu>`_.,
+          from ICU (International Components for Unicode),
+          dictionary-based
+        * *longest* - dictionary-based, longest matching
+        * *mm* - "multi-cut", dictionary-based, maximum matching
+        * *nercut* - dictionary-based, maximal matching,
          constrained with Thai Character Cluster (TCC) boundaries,
-          and combining tokens that are parts of the same named-entity.
+          combining tokens that are parts of the same named-entity
+        * *newmm* (default) - "new multi-cut",
+          dictionary-based, maximum matching,
+          constrained with Thai Character Cluster (TCC) boundaries
+        * *newmm-safe* - newmm, with a mechanism to avoid long
+          processing time for text with continuous ambiguous breaking points
+        * *nlpo3* - wrapper for a word tokenizer in
+          `nlpO3 <https://github.com/PyThaiNLP/nlpo3>`_.,
+          newmm adaptation in Rust (2.5x faster)
+        * *oskut* - wrapper for
+          `OSKut <https://github.com/mrpeerat/OSKut>`_.,
+          Out-of-domain StacKed cut for Word Segmentation
        * *sefr_cut* - wrapper for
          `SEFR CUT <https://github.com/mrpeerat/SEFR_CUT>`_.,
+          Stacked Ensemble Filter and Refine for Word Segmentation
        * *tltk* - wrapper for
          `TLTK <https://pypi.org/project/tltk/>`_.,
-        * *oskut* - wrapper for
-          `OSKut <https://github.com/mrpeerat/OSKut>`_.,
-
+          maximum collocation approach
    :Note:
-        - The parameter **custom_dict** can be provided as an argument \
-          only for *newmm*, *longest*, and *deepcut* engine.
+        - The **custom_dict** parameter only works for \
+          *deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
    :Example:

    Tokenize text with different tokenizer::
@@ -178,6 +197,19 @@ def word_tokenize(

        word_tokenize(text, engine="newmm", keep_whitespace=False)
        # output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว']
+
+    Join broken formatted numeric (e.g. time, decimals, IP address)::
+
+        text = "เงิน1,234บาท19:32น 127.0.0.1"
+
+        word_tokenize(text, engine="attacut", join_broken_num=False)
+        # output:
+        # ['เงิน', '1', ',', '234', 'บาท', '19', ':', '32น', ' ',
+        # '127', '.', '0', '.', '0', '.', '1']
+
+        word_tokenize(text, engine="attacut", join_broken_num=True)
+        # output:
+        # ['เงิน', '1,234', 'บาท', '19:32น', ' ', '127.0.0.1']

    Tokenize with default and custom dictionary::

@@ -199,8 +231,8 @@ def word_tokenize(

        word_tokenize(text, engine="newmm", custom_dict=trie))
        # output:
-        # ['ชินโซ', ' ', 'อาเบะ',
-        # ' ', 'เกิด', ' ', '21', ' ', 'กันยายน']
+        # ['ชินโซ', ' ', 'อาเบะ', ' ',
+        # 'เกิด', ' ', '21', ' ', 'กันยายน']
    """
    if not text or not isinstance(text, str):
        return []
@@ -257,6 +289,7 @@ def word_tokenize(
        segments = segment(text)
    elif engine == "nlpo3":
        from pythainlp.tokenize.nlpo3 import segment
+
        if isinstance(custom_dict, str):
            segments = segment(text, custom_dict=custom_dict)
        elif not isinstance(custom_dict, str) and custom_dict is not None:
@@ -274,8 +307,14 @@ def word_tokenize(
            It might be a typo; if not, please consult our document."""
        )

+    postprocessors = []
+    if join_broken_num:
+        postprocessors.append(rejoin_formatted_num)
+
    if not keep_whitespace:
-        segments = [token.strip(" ") for token in segments if token.strip(" ")]
+        postprocessors.append(strip_whitespace)
+
+    segments = apply_postprocessors(segments, postprocessors)

    return segments

@@ -297,12 +336,12 @@ def sent_tokenize(
    :rtype: list[str]
    **Options for engine**
        * *crfcut* - (default) split by CRF trained on TED dataset
+        * *thaisum* - The implementation of sentence segmentator from \
+          Nakhun Chumpolsathien, 2020
+        * *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_.,
        * *whitespace+newline* - split by whitespaces and newline.
        * *whitespace* - split by whitespaces. Specifiaclly, with \
          :class:`regex` pattern ``r" +"``
-        * *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_.,
-        * *thaisum* - The implementation of sentence segmentator from \
-          Nakhun Chumpolsathien, 2020
    :Example:

    Split the text based on *whitespace*::
@@ -364,7 +403,10 @@ def sent_tokenize(

        segments = segment(text)
    elif engine == "thaisum":
-        from pythainlp.tokenize.thaisumcut import ThaiSentenceSegmentor as segmentor
+        from pythainlp.tokenize.thaisumcut import (
+            ThaiSentenceSegmentor as segmentor,
+        )
+
        segment = segmentor()
        segments = segment.split_into_sentences(text)
    else:
@@ -374,7 +416,7 @@ def sent_tokenize(
        )

    if not keep_whitespace:
-        segments = [token.strip(" ") for token in segments if token.strip(" ")]
+        segments = strip_whitespace(segments)

    return segments

@@ -405,13 +447,12 @@ def subword_tokenize(
    :return: list of subwords
    :rtype: list[str]
    **Options for engine**
-        * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
-        * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
-        * *wangchanberta* - SentencePiece from wangchanberta model.
        * *dict* - newmm word tokenizer with a syllable dictionary
+        * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
        * *ssg* - CRF syllable segmenter for Thai
+        * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
        * *tltk* - syllable tokenizer from tltk
-
+        * *wangchanberta* - SentencePiece from wangchanberta model
    :Example:

    Tokenize text into subword based on *tcc*::
@@ -485,7 +526,7 @@ def subword_tokenize(
        segments = segment(text)

    if not keep_whitespace:
-        segments = [token.strip(" ") for token in segments if token.strip(" ")]
+        segments = strip_whitespace(segments)

    return segments

@@ -562,6 +603,7 @@ def __init__(
        custom_dict: Union[Trie, Iterable[str], str] = None,
        engine: str = "newmm",
        keep_whitespace: bool = True,
+        join_broken_num: bool = True,
    ):
        """
        Initialize tokenizer object.
@@ -584,9 +626,11 @@ def __init__(
            raise NotImplementedError(
                """
                The Tokenizer class is not support %s for custom tokenizer
-                """ % self.__engine
+                """
+                % self.__engine
            )
        self.__keep_whitespace = keep_whitespace
+        self.__join_broken_num = join_broken_num

    def word_tokenize(self, text: str) -> List[str]:
        """
@@ -601,6 +645,7 @@ def word_tokenize(self, text: str) -> List[str]:
            custom_dict=self.__trie_dict,
            engine=self.__engine,
            keep_whitespace=self.__keep_whitespace,
+            join_broken_num=self.__join_broken_num,
        )

    def set_tokenize_engine(self, engine: str) -> None:
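
Note for reviewers: this diff only shows the call sites of the three helpers imported from pythainlp.tokenize._utils (apply_postprocessors, rejoin_formatted_num, strip_whitespace); the _utils module itself is in another file of the PR. The sketch below is a minimal stand-in consistent with those call sites, not the actual _utils code: strip_whitespace mirrors the list comprehension this diff removes, while the bodies of rejoin_formatted_num and apply_postprocessors are illustrative assumptions.

# Sketch only: names and call signatures come from the diff above;
# the function bodies are assumptions, not the real pythainlp code.
from typing import Callable, List


def rejoin_formatted_num(segments: List[str]) -> List[str]:
    # Merge tokens that were split inside a formatted number, e.g.
    # ['1', ',', '234'] -> ['1,234'] and ['127', '.', '0', ...] -> ['127.0.0.1'].
    out: List[str] = []
    i = 0
    while i < len(segments):
        token = segments[i]
        if (
            token in {",", ".", ":"}
            and out
            and out[-1][-1:].isdigit()
            and i + 1 < len(segments)
            and segments[i + 1][:1].isdigit()
        ):
            # glue the separator and the following token onto the previous token
            out[-1] = out[-1] + token + segments[i + 1]
            i += 2
        else:
            out.append(token)
            i += 1
    return out


def strip_whitespace(segments: List[str]) -> List[str]:
    # Same behaviour as the list comprehension removed by this diff.
    return [token.strip(" ") for token in segments if token.strip(" ")]


def apply_postprocessors(
    segments: List[str],
    postprocessors: List[Callable[[List[str]], List[str]]],
) -> List[str]:
    # Run each postprocessor over the token list, in order.
    for func in postprocessors:
        segments = func(segments)
    return segments


if __name__ == "__main__":
    tokens = ["เงิน", "1", ",", "234", "บาท", " ", "ครับ "]
    print(apply_postprocessors(tokens, [rejoin_formatted_num, strip_whitespace]))
    # ['เงิน', '1,234', 'บาท', 'ครับ']

With helpers shaped like this, word_tokenize only has to build the postprocessors list from its keyword arguments (join_broken_num, keep_whitespace) and pass the segments through apply_postprocessors, which is exactly the wiring the last word_tokenize hunk adds.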