
Commit 2606f85

Merge pull request #723 from noppayut/fix/broken-numeric-data-format
Fix/broken numeric data format (#652)
2 parents 39b814a + c0e48e9 commit 2606f85

File tree

3 files changed: +301 -91 lines changed


pythainlp/tokenize/_utils.py

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+"""
+Utility functions for tokenize module.
+"""
+
+import re
+from typing import List, Callable
+
+_DIGITS_WITH_SEPARATOR = re.compile(r"(\d+[\.\,:])+\d+")
+
+
+def apply_postprocessors(
+    segments: List[str], postprocessors: List[Callable[[List[str]], List[str]]]
+) -> List[str]:
+    """
+    Apply a list of callables (postprocessors) to a raw segmentation result.
+    """
+    for func in postprocessors:
+        segments = func(segments)
+
+    return segments
+
+
+def rejoin_formatted_num(segments: List[str]) -> List[str]:
+    """
+    Rejoin well-known formatted numerics that are over-tokenized.
+    The formatted numerics are numbers separated by ":", ",", or ".",
+    such as times, decimal numbers, comma-separated numbers, and IP addresses.
+
+    :param List[str] segments: result from word tokenizer
+    :return: a list of fixed tokens
+    :rtype: List[str]
+
+    :Example:
+        tokens = ['ขณะ', 'นี้', 'เวลา', ' ', '12', ':', '00น', ' ', 'อัตรา',
+                  'แลกเปลี่ยน', ' ', '1', ',', '234', '.', '5', ' ', 'baht/zeny']
+        rejoin_formatted_num(tokens)
+        # output:
+        # ['ขณะ', 'นี้', 'เวลา', ' ', '12:00น', ' ', 'อัตรา', 'แลกเปลี่ยน', ' ', '1,234.5', ' ', 'baht/zeny']
+
+        tokens = ['IP', ' ', 'address', ' ', 'ของ', 'คุณ', 'คือ', ' ', '127', '.', '0', '.', '0', '.', '1', ' ', 'ครับ']
+        rejoin_formatted_num(tokens)
+        # output:
+        # ['IP', ' ', 'address', ' ', 'ของ', 'คุณ', 'คือ', ' ', '127.0.0.1', ' ', 'ครับ']
+    """
+    original = "".join(segments)
+    matching_results = _DIGITS_WITH_SEPARATOR.finditer(original)
+    tokens_joined = []
+    pos = 0
+    segment_idx = 0
+
+    match = next(matching_results, None)
+    while segment_idx < len(segments) and match:
+        is_span_beginning = pos >= match.start()
+        token = segments[segment_idx]
+        if is_span_beginning:
+            connected_token = ""
+            while pos < match.end() and segment_idx < len(segments):
+                connected_token += segments[segment_idx]
+                pos += len(segments[segment_idx])
+                segment_idx += 1
+
+            tokens_joined.append(connected_token)
+            match = next(matching_results, None)
+        else:
+            tokens_joined.append(token)
+            segment_idx += 1
+            pos += len(token)
+    tokens_joined += segments[segment_idx:]
+    return tokens_joined
+
+
+def strip_whitespace(segments: List[str]) -> List[str]:
+    """
+    Strip whitespace(s) off each token and remove whitespace tokens.
+
+    :param List[str] segments: result from word tokenizer
+    :return: a list of tokens
+    :rtype: List[str]
+
+    :Example:
+        tokens = [" ", "วันนี้ ", "เวลา ", "19.00น"]
+        strip_whitespace(tokens)
+        # ["วันนี้", "เวลา", "19.00น"]
+    """
+    segments = [token.strip(" ") for token in segments if token.strip(" ")]
+    return segments
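
For illustration, here is a minimal sketch of how the new helpers compose when called by hand. It assumes a PyThaiNLP build that includes this commit; note that pythainlp.tokenize._utils is a private module, so the import path is an implementation detail and may change between releases.

    # Minimal sketch: manually composing the new postprocessors.
    # Assumes a PyThaiNLP build containing this commit; _utils is a private module.
    from pythainlp.tokenize._utils import (
        apply_postprocessors,
        rejoin_formatted_num,
        strip_whitespace,
    )

    # Over-tokenized output, taken from the rejoin_formatted_num docstring example.
    tokens = ['ขณะ', 'นี้', 'เวลา', ' ', '12', ':', '00น', ' ', 'อัตรา',
              'แลกเปลี่ยน', ' ', '1', ',', '234', '.', '5', ' ', 'baht/zeny']

    # Rejoin formatted numbers first, then drop whitespace tokens.
    fixed = apply_postprocessors(tokens, [rejoin_formatted_num, strip_whitespace])
    print(fixed)
    # Expected, per the docstrings above:
    # ['ขณะ', 'นี้', 'เวลา', '12:00น', 'อัตรา', 'แลกเปลี่ยน', '1,234.5', 'baht/zeny']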

pythainlp/tokenize/core.py

Lines changed: 80 additions & 35 deletions
@@ -13,6 +13,11 @@
     DEFAULT_WORD_DICT_TRIE,
     DEFAULT_WORD_TOKENIZE_ENGINE,
 )
+from pythainlp.tokenize._utils import (
+    apply_postprocessors,
+    rejoin_formatted_num,
+    strip_whitespace,
+)
 from pythainlp.util.trie import Trie, dict_trie


@@ -47,7 +52,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
     return segment(doc)


-def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "str") -> Union[str, List[str]]:
+def word_detokenize(
+    segments: Union[List[List[str]], List[str]], output: str = "str"
+) -> Union[str, List[str]]:
     """
     Word detokenizer.

@@ -62,6 +69,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
     if isinstance(segments[0], str):
         segments = [segments]
     from pythainlp import thai_characters
+
     for i, s in enumerate(segments):
         _list_sents = []
         _add_index = []
@@ -70,7 +78,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
         for j, w in enumerate(s):
             if j > 0:
                 # previous word
-                p_w = s[j-1]
+                p_w = s[j - 1]
                 # if w is number or other language and not be space
                 if (
                     w[0] not in thai_characters
@@ -88,9 +96,9 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
                     if not p_w.isspace():
                         _list_sents.append(" ")
                     _mark_index.append(j)
-            elif w.isspace() and j-1 not in _space_index:
+            elif w.isspace() and j - 1 not in _space_index:
                 _space_index.append(j)
-            elif j-1 in _mark_index:
+            elif j - 1 in _mark_index:
                 _list_sents.append(" ")
             _list_sents.append(w)
         _list_all.append(_list_sents)
@@ -103,14 +111,15 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
             for j in i:
                 _temp += j
             _text.append(_temp)
-        return ' '.join(_text)
+        return " ".join(_text)


 def word_tokenize(
     text: str,
     custom_dict: Trie = None,
     engine: str = DEFAULT_WORD_TOKENIZE_ENGINE,
     keep_whitespace: bool = True,
+    join_broken_num: bool = True,
 ) -> List[str]:
     """
     Word tokenizer.
@@ -123,37 +132,47 @@ def word_tokenize(
     :param bool keep_whitespace: True to keep whitespaces, a common mark
                                  for end of phrase in Thai.
                                  Otherwise, whitespaces are omitted.
+    :param bool join_broken_num: True to rejoin formatted numeric that could be wrongly separated.
+                                 Otherwise, formatted numeric could be wrongly separated.
+
     :return: list of words
     :rtype: List[str]
     **Options for engine**
-        * *newmm* (default) - dictionary-based, Maximum Matching +
-          Thai Character Cluster
-        * *newmm-safe* - newmm, with a mechanism to help avoid long
-          processing time for text with continuous ambiguous breaking points
-        * *mm* or *multi_cut* - dictionary-based, Maximum Matching.
-        * *nlpo3* - Python binding for nlpO3. It is newmm engine in Rust.
-        * *longest* - dictionary-based, Longest Matching
-        * *icu* - wrapper for ICU (International Components for Unicode,
-          using PyICU), dictionary-based
         * *attacut* - wrapper for
          `AttaCut <https://github.com/PyThaiNLP/attacut>`_.,
          learning-based approach
        * *deepcut* - wrapper for
          `DeepCut <https://github.com/rkcosmos/deepcut>`_,
          learning-based approach
-        * *nercut* - Dictionary-based maximal matching word segmentation,
+        * *icu* - wrapper for a word tokenizer in
+          `PyICU <https://gitlab.pyicu.org/main/pyicu>`_.,
+          from ICU (International Components for Unicode),
+          dictionary-based
+        * *longest* - dictionary-based, longest matching
+        * *mm* - "multi-cut", dictionary-based, maximum matching
+        * *nercut* - dictionary-based, maximal matching,
          constrained with Thai Character Cluster (TCC) boundaries,
-          and combining tokens that are parts of the same named-entity.
+          combining tokens that are parts of the same named-entity
+        * *newmm* (default) - "new multi-cut",
+          dictionary-based, maximum matching,
+          constrained with Thai Character Cluster (TCC) boundaries
+        * *newmm-safe* - newmm, with a mechanism to avoid long
+          processing time for text with continuous ambiguous breaking points
+        * *nlpo3* - wrapper for a word tokenizer in
+          `nlpO3 <https://github.com/PyThaiNLP/nlpo3>`_.,
+          newmm adaptation in Rust (2.5x faster)
+        * *oskut* - wrapper for
+          `OSKut <https://github.com/mrpeerat/OSKut>`_.,
+          Out-of-domain StacKed cut for Word Segmentation
        * *sefr_cut* - wrapper for
          `SEFR CUT <https://github.com/mrpeerat/SEFR_CUT>`_.,
+          Stacked Ensemble Filter and Refine for Word Segmentation
        * *tltk* - wrapper for
          `TLTK <https://pypi.org/project/tltk/>`_.,
-        * *oskut* - wrapper for
-          `OSKut <https://github.com/mrpeerat/OSKut>`_.,
-
+          maximum collocation approach
    :Note:
-        - The parameter **custom_dict** can be provided as an argument \
-          only for *newmm*, *longest*, and *deepcut* engine.
+        - The **custom_dict** parameter only works for \
+          *deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
    :Example:

    Tokenize text with different tokenizer::
@@ -178,6 +197,19 @@ def word_tokenize(

        word_tokenize(text, engine="newmm", keep_whitespace=False)
        # output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว']
+
+    Join broken formatted numeric (e.g. time, decimals, IP address)::
+
+        text = "เงิน1,234บาท19:32น 127.0.0.1"
+
+        word_tokenize(text, engine="attacut", join_broken_num=False)
+        # output:
+        # ['เงิน', '1', ',', '234', 'บาท', '19', ':', '32น', ' ',
+        #  '127', '.', '0', '.', '0', '.', '1']
+
+        word_tokenize(text, engine="attacut", join_broken_num=True)
+        # output:
+        # ['เงิน', '1,234', 'บาท', '19:32น', ' ', '127.0.0.1']

    Tokenize with default and custom dictionary::

@@ -199,8 +231,8 @@ def word_tokenize(

        word_tokenize(text, engine="newmm", custom_dict=trie))
        # output:
-        # ['ชินโซ', ' ', 'อาเบะ',
-        #   ' ', 'เกิด', ' ', '21', ' ', 'กันยายน']
+        # ['ชินโซ', ' ', 'อาเบะ', ' ',
+        #   'เกิด', ' ', '21', ' ', 'กันยายน']
    """
    if not text or not isinstance(text, str):
        return []
@@ -257,6 +289,7 @@ def word_tokenize(
        segments = segment(text)
    elif engine == "nlpo3":
        from pythainlp.tokenize.nlpo3 import segment
+
        if isinstance(custom_dict, str):
            segments = segment(text, custom_dict=custom_dict)
        elif not isinstance(custom_dict, str) and custom_dict is not None:
@@ -274,8 +307,14 @@ def word_tokenize(
            It might be a typo; if not, please consult our document."""
        )

+    postprocessors = []
+    if join_broken_num:
+        postprocessors.append(rejoin_formatted_num)
+
    if not keep_whitespace:
-        segments = [token.strip(" ") for token in segments if token.strip(" ")]
+        postprocessors.append(strip_whitespace)
+
+    segments = apply_postprocessors(segments, postprocessors)

    return segments

@@ -297,12 +336,12 @@ def sent_tokenize(
    :rtype: list[str]
    **Options for engine**
        * *crfcut* - (default) split by CRF trained on TED dataset
+        * *thaisum* - The implementation of sentence segmentator from \
+          Nakhun Chumpolsathien, 2020
+        * *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_.,
        * *whitespace+newline* - split by whitespaces and newline.
        * *whitespace* - split by whitespaces. Specifically, with \
          :class:`regex` pattern ``r" +"``
-        * *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_.,
-        * *thaisum* - The implementation of sentence segmentator from \
-          Nakhun Chumpolsathien, 2020
    :Example:

    Split the text based on *whitespace*::
@@ -364,7 +403,10 @@ def sent_tokenize(

        segments = segment(text)
    elif engine == "thaisum":
-        from pythainlp.tokenize.thaisumcut import ThaiSentenceSegmentor as segmentor
+        from pythainlp.tokenize.thaisumcut import (
+            ThaiSentenceSegmentor as segmentor,
+        )
+
        segment = segmentor()
        segments = segment.split_into_sentences(text)
    else:
@@ -374,7 +416,7 @@ def sent_tokenize(
        )

    if not keep_whitespace:
-        segments = [token.strip(" ") for token in segments if token.strip(" ")]
+        segments = strip_whitespace(segments)

    return segments

@@ -405,13 +447,12 @@ def subword_tokenize(
    :return: list of subwords
    :rtype: list[str]
    **Options for engine**
-        * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
-        * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
-        * *wangchanberta* - SentencePiece from wangchanberta model.
        * *dict* - newmm word tokenizer with a syllable dictionary
+        * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
        * *ssg* - CRF syllable segmenter for Thai
+        * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
        * *tltk* - syllable tokenizer from tltk
-
+        * *wangchanberta* - SentencePiece from wangchanberta model
    :Example:

    Tokenize text into subword based on *tcc*::
@@ -485,7 +526,7 @@ def subword_tokenize(
        segments = segment(text)

    if not keep_whitespace:
-        segments = [token.strip(" ") for token in segments if token.strip(" ")]
+        segments = strip_whitespace(segments)

    return segments

@@ -562,6 +603,7 @@ def __init__(
        custom_dict: Union[Trie, Iterable[str], str] = None,
        engine: str = "newmm",
        keep_whitespace: bool = True,
+        join_broken_num: bool = True,
    ):
        """
        Initialize tokenizer object.
@@ -584,9 +626,11 @@ def __init__(
            raise NotImplementedError(
                """
                The Tokenizer class does not support %s for custom tokenizer
-                """ % self.__engine
+                """
+                % self.__engine
            )
        self.__keep_whitespace = keep_whitespace
+        self.__join_broken_num = join_broken_num

    def word_tokenize(self, text: str) -> List[str]:
        """
@@ -601,6 +645,7 @@ def word_tokenize(self, text: str) -> List[str]:
            custom_dict=self.__trie_dict,
            engine=self.__engine,
            keep_whitespace=self.__keep_whitespace,
+            join_broken_num=self.__join_broken_num,
        )

    def set_tokenize_engine(self, engine: str) -> None:
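
Taken together, the core.py changes expose the new behaviour through the join_broken_num flag on both word_tokenize() and the Tokenizer class. The sketch below is a hedged usage example, not part of the commit: it assumes a PyThaiNLP release containing this change, the input string is an invented sample, and exact token output depends on the chosen engine and the dictionary shipped with your installation.

    # Usage sketch of the join_broken_num flag added in this commit.
    # Assumes a PyThaiNLP release containing this change; token output depends on
    # the engine and its dictionary, so no exact output is claimed here.
    from pythainlp.tokenize import Tokenizer, word_tokenize

    text = "เงิน1,234บาท 19:32น 127.0.0.1"  # invented sample input

    # Default: rejoin_formatted_num runs as a postprocessor and re-glues
    # numbers, times, and IP-like spans that the engine may have split apart.
    print(word_tokenize(text, engine="newmm", join_broken_num=True))

    # Opting out skips that postprocessor, reproducing the pre-commit behaviour.
    print(word_tokenize(text, engine="newmm", join_broken_num=False))

    # The Tokenizer class stores the flag and forwards it on every call.
    tokenizer = Tokenizer(engine="newmm", keep_whitespace=False, join_broken_num=True)
    print(tokenizer.word_tokenize(text))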
