diff --git a/pythainlp/tokenize/_utils.py b/pythainlp/tokenize/_utils.py
new file mode 100644
index 000000000..63d54bf8c
--- /dev/null
+++ b/pythainlp/tokenize/_utils.py
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+"""
+Utility functions for tokenize module.
+"""
+
+import re
+from typing import List, Callable
+
+_DIGITS_WITH_SEPARATOR = re.compile(r"(\d+[\.\,:])+\d+")
+
+
+def apply_postprocessors(
+    segments: List[str], postprocessors: List[Callable[[List[str]], List[str]]]
+) -> List[str]:
+    """
+    Apply a chain of postprocessors to a raw segmentation result.
+    """
+    for func in postprocessors:
+        segments = func(segments)
+
+    return segments
+
+
+def rejoin_formatted_num(segments: List[str]) -> List[str]:
+    """
+    Rejoin well-known formatted numeric strings that are over-tokenized.
+    They are sequences of digits separated by ":", ",", or ".",
+    such as times, decimals, comma-separated numbers, and IP addresses.
+
+    :param List[str] segments: result from word tokenizer
+    :return: a list of fixed tokens
+    :rtype: List[str]
+
+    :Example:
+        tokens = ['ขณะ', 'นี้', 'เวลา', ' ', '12', ':', '00น', ' ', 'อัตรา',
+            'แลกเปลี่ยน', ' ', '1', ',', '234', '.', '5', ' ', 'baht/zeny']
+        rejoin_formatted_num(tokens)
+        # output:
+        # ['ขณะ', 'นี้', 'เวลา', ' ', '12:00น', ' ', 'อัตรา', 'แลกเปลี่ยน', ' ', '1,234.5', ' ', 'baht/zeny']
+
+        tokens = ['IP', ' ', 'address', ' ', 'ของ', 'คุณ', 'คือ', ' ', '127', '.', '0', '.', '0', '.', '1', ' ', 'ครับ']
+        rejoin_formatted_num(tokens)
+        # output:
+        # ['IP', ' ', 'address', ' ', 'ของ', 'คุณ', 'คือ', ' ', '127.0.0.1', ' ', 'ครับ']
+    """
+    original = "".join(segments)
+    matching_results = _DIGITS_WITH_SEPARATOR.finditer(original)
+    tokens_joined = []
+    pos = 0
+    segment_idx = 0
+
+    match = next(matching_results, None)
+    while segment_idx < len(segments) and match:
+        is_span_beginning = pos >= match.start()
+        token = segments[segment_idx]
+        if is_span_beginning:
+            connected_token = ""
+            while pos < match.end() and segment_idx < len(segments):
+                connected_token += segments[segment_idx]
+                pos += len(segments[segment_idx])
+                segment_idx += 1
+
+            tokens_joined.append(connected_token)
+            match = next(matching_results, None)
+        else:
+            tokens_joined.append(token)
+            segment_idx += 1
+            pos += len(token)
+    tokens_joined += segments[segment_idx:]
+    return tokens_joined
+
+
+def strip_whitespace(segments: List[str]) -> List[str]:
+    """
+    Strip whitespace(s) off each token and remove whitespace tokens.
+    :param List[str] segments: result from word tokenizer
+    :return: a list of tokens
+    :rtype: List[str]
+
+    :Example:
+    tokens = [" ", "วันนี้ ", "เวลา ", "19.00น"]
+    strip_whitespace(tokens)
+    # ["วันนี้", "เวลา", "19.00น"]
+
+    """
+    segments = [token.strip(" ") for token in segments if token.strip(" ")]
+    return segments
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 6b918be12..2846e7a90 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -13,6 +13,11 @@
     DEFAULT_WORD_DICT_TRIE,
     DEFAULT_WORD_TOKENIZE_ENGINE,
 )
+from pythainlp.tokenize._utils import (
+    apply_postprocessors,
+    rejoin_formatted_num,
+    strip_whitespace,
+)
 from pythainlp.util.trie import Trie, dict_trie


@@ -47,7 +52,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
     return segment(doc)


-def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "str") -> Union[str, List[str]]:
+def word_detokenize(
+    segments: Union[List[List[str]], List[str]], output: str = "str"
+) -> Union[str, List[str]]:
     """
     Word detokenizer.
@@ -62,6 +69,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = " if isinstance(segments[0], str): segments = [segments] from pythainlp import thai_characters + for i, s in enumerate(segments): _list_sents = [] _add_index = [] @@ -70,7 +78,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = " for j, w in enumerate(s): if j > 0: # previous word - p_w = s[j-1] + p_w = s[j - 1] # if w is number or other language and not be space if ( w[0] not in thai_characters @@ -88,9 +96,9 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = " if not p_w.isspace(): _list_sents.append(" ") _mark_index.append(j) - elif w.isspace() and j-1 not in _space_index: + elif w.isspace() and j - 1 not in _space_index: _space_index.append(j) - elif j-1 in _mark_index: + elif j - 1 in _mark_index: _list_sents.append(" ") _list_sents.append(w) _list_all.append(_list_sents) @@ -103,7 +111,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = " for j in i: _temp += j _text.append(_temp) - return ' '.join(_text) + return " ".join(_text) def word_tokenize( @@ -111,6 +119,7 @@ def word_tokenize( custom_dict: Trie = None, engine: str = DEFAULT_WORD_TOKENIZE_ENGINE, keep_whitespace: bool = True, + join_broken_num: bool = True, ) -> List[str]: """ Word tokenizer. @@ -123,37 +132,47 @@ def word_tokenize( :param bool keep_whitespace: True to keep whitespaces, a common mark for end of phrase in Thai. Otherwise, whitespaces are omitted. + :param bool join_broken_num: True to rejoin formatted numeric that could be wrongly separated. + Otherwise, formatted numeric could be wrongly separated. + :return: list of words :rtype: List[str] **Options for engine** - * *newmm* (default) - dictionary-based, Maximum Matching + - Thai Character Cluster - * *newmm-safe* - newmm, with a mechanism to help avoid long - processing time for text with continuous ambiguous breaking points - * *mm* or *multi_cut* - dictionary-based, Maximum Matching. - * *nlpo3* - Python binding for nlpO3. It is newmm engine in Rust. - * *longest* - dictionary-based, Longest Matching - * *icu* - wrapper for ICU (International Components for Unicode, - using PyICU), dictionary-based * *attacut* - wrapper for `AttaCut `_., learning-based approach * *deepcut* - wrapper for `DeepCut `_, learning-based approach - * *nercut* - Dictionary-based maximal matching word segmentation, + * *icu* - wrapper for a word tokenizer in + `PyICU `_., + from ICU (International Components for Unicode), + dictionary-based + * *longest* - dictionary-based, longest matching + * *mm* - "multi-cut", dictionary-based, maximum matching + * *nercut* - dictionary-based, maximal matching, constrained with Thai Character Cluster (TCC) boundaries, - and combining tokens that are parts of the same named-entity. 
+ combining tokens that are parts of the same named-entity + * *newmm* (default) - "new multi-cut", + dictionary-based, maximum matching, + constrained with Thai Character Cluster (TCC) boundaries + * *newmm-safe* - newmm, with a mechanism to avoid long + processing time for text with continuous ambiguous breaking points + * *nlpo3* - wrapper for a word tokenizer in + `nlpO3 `_., + newmm adaptation in Rust (2.5x faster) + * *oskut* - wrapper for + `OSKut `_., + Out-of-domain StacKed cut for Word Segmentation * *sefr_cut* - wrapper for `SEFR CUT `_., + Stacked Ensemble Filter and Refine for Word Segmentation * *tltk* - wrapper for `TLTK `_., - * *oskut* - wrapper for - `OSKut `_., - + maximum collocation approach :Note: - - The parameter **custom_dict** can be provided as an argument \ - only for *newmm*, *longest*, and *deepcut* engine. + - The **custom_dict** parameter only works for \ + *deepcut*, *longest*, *newmm*, and *newmm-safe* engines. :Example: Tokenize text with different tokenizer:: @@ -178,6 +197,19 @@ def word_tokenize( word_tokenize(text, engine="newmm", keep_whitespace=False) # output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว'] + + Join broken formatted numeric (e.g. time, decimals, IP address):: + + text = "เงิน1,234บาท19:32น 127.0.0.1" + + word_tokenize(text, engine="attacut", join_broken_num=False) + # output: + # ['เงิน', '1', ',', '234', 'บาท', '19', ':', '32น', ' ', + # '127', '.', '0', '.', '0', '.', '1'] + + word_tokenize(text, engine="attacut", join_broken_num=True) + # output: + # ['เงิน', '1,234', 'บาท', '19:32น', ' ', '127.0.0.1'] Tokenize with default and custom dictionary:: @@ -199,8 +231,8 @@ def word_tokenize( word_tokenize(text, engine="newmm", custom_dict=trie)) # output: - # ['ชินโซ', ' ', 'อาเบะ', - # ' ', 'เกิด', ' ', '21', ' ', 'กันยายน'] + # ['ชินโซ', ' ', 'อาเบะ', ' ', + # 'เกิด', ' ', '21', ' ', 'กันยายน'] """ if not text or not isinstance(text, str): return [] @@ -257,6 +289,7 @@ def word_tokenize( segments = segment(text) elif engine == "nlpo3": from pythainlp.tokenize.nlpo3 import segment + if isinstance(custom_dict, str): segments = segment(text, custom_dict=custom_dict) elif not isinstance(custom_dict, str) and custom_dict is not None: @@ -274,8 +307,14 @@ def word_tokenize( It might be a typo; if not, please consult our document.""" ) + postprocessors = [] + if join_broken_num: + postprocessors.append(rejoin_formatted_num) + if not keep_whitespace: - segments = [token.strip(" ") for token in segments if token.strip(" ")] + postprocessors.append(strip_whitespace) + + segments = apply_postprocessors(segments, postprocessors) return segments @@ -297,12 +336,12 @@ def sent_tokenize( :rtype: list[str] **Options for engine** * *crfcut* - (default) split by CRF trained on TED dataset + * *thaisum* - The implementation of sentence segmentator from \ + Nakhun Chumpolsathien, 2020 + * *tltk* - split by `TLTK `_., * *whitespace+newline* - split by whitespaces and newline. * *whitespace* - split by whitespaces. 
Specifiaclly, with \ :class:`regex` pattern ``r" +"`` - * *tltk* - split by `TLTK `_., - * *thaisum* - The implementation of sentence segmentator from \ - Nakhun Chumpolsathien, 2020 :Example: Split the text based on *whitespace*:: @@ -364,7 +403,10 @@ def sent_tokenize( segments = segment(text) elif engine == "thaisum": - from pythainlp.tokenize.thaisumcut import ThaiSentenceSegmentor as segmentor + from pythainlp.tokenize.thaisumcut import ( + ThaiSentenceSegmentor as segmentor, + ) + segment = segmentor() segments = segment.split_into_sentences(text) else: @@ -374,7 +416,7 @@ def sent_tokenize( ) if not keep_whitespace: - segments = [token.strip(" ") for token in segments if token.strip(" ")] + segments = strip_whitespace(segments) return segments @@ -405,13 +447,12 @@ def subword_tokenize( :return: list of subwords :rtype: list[str] **Options for engine** - * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000) - * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001) - * *wangchanberta* - SentencePiece from wangchanberta model. * *dict* - newmm word tokenizer with a syllable dictionary + * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001) * *ssg* - CRF syllable segmenter for Thai + * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000) * *tltk* - syllable tokenizer from tltk - + * *wangchanberta* - SentencePiece from wangchanberta model :Example: Tokenize text into subword based on *tcc*:: @@ -485,7 +526,7 @@ def subword_tokenize( segments = segment(text) if not keep_whitespace: - segments = [token.strip(" ") for token in segments if token.strip(" ")] + segments = strip_whitespace(segments) return segments @@ -562,6 +603,7 @@ def __init__( custom_dict: Union[Trie, Iterable[str], str] = None, engine: str = "newmm", keep_whitespace: bool = True, + join_broken_num: bool = True, ): """ Initialize tokenizer object. 
@@ -584,9 +626,11 @@ def __init__( raise NotImplementedError( """ The Tokenizer class is not support %s for custom tokenizer - """ % self.__engine + """ + % self.__engine ) self.__keep_whitespace = keep_whitespace + self.__join_broken_num = join_broken_num def word_tokenize(self, text: str) -> List[str]: """ @@ -601,6 +645,7 @@ def word_tokenize(self, text: str) -> List[str]: custom_dict=self.__trie_dict, engine=self.__engine, keep_whitespace=self.__keep_whitespace, + join_broken_num=self.__join_broken_num, ) def set_tokenize_engine(self, engine: str) -> None: diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index c1cf41340..d0999d771 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -242,48 +242,78 @@ def test_sent_tokenize(self): ] self.assertEqual( - sent_tokenize(sent_1, engine="crfcut"), sent_1_toks, + sent_tokenize(sent_1, engine="crfcut"), + sent_1_toks, ) self.assertEqual( - sent_tokenize(sent_2, engine="crfcut"), sent_2_toks, + sent_tokenize(sent_2, engine="crfcut"), + sent_2_toks, ) self.assertEqual( - sent_tokenize(sent_3, engine="crfcut"), sent_3_toks, + sent_tokenize(sent_3, engine="crfcut"), + sent_3_toks, ) self.assertEqual( - sent_tokenize(sent_1), sent_1_toks, + sent_tokenize(sent_1), + sent_1_toks, ) self.assertEqual( - sent_tokenize(sent_2), sent_2_toks, + sent_tokenize(sent_2), + sent_2_toks, ) self.assertEqual( - sent_tokenize(sent_3), sent_3_toks, + sent_tokenize(sent_3), + sent_3_toks, ) self.assertIsNotNone( - sent_tokenize(sent_1, keep_whitespace=False, engine="whitespace",), + sent_tokenize( + sent_1, + keep_whitespace=False, + engine="whitespace", + ), ) self.assertIsNotNone( - sent_tokenize(sent_1, engine="tltk",), + sent_tokenize( + sent_1, + engine="tltk", + ), ) self.assertIsNotNone( - sent_tokenize(sent_2, engine="tltk",), + sent_tokenize( + sent_2, + engine="tltk", + ), ) self.assertIsNotNone( - sent_tokenize(sent_3, engine="tltk",), + sent_tokenize( + sent_3, + engine="tltk", + ), ) self.assertIsNotNone( - sent_tokenize(sent_1, engine="thaisum",), + sent_tokenize( + sent_1, + engine="thaisum", + ), ) self.assertIsNotNone( - sent_tokenize(sent_2, engine="thaisum",), + sent_tokenize( + sent_2, + engine="thaisum", + ), ) self.assertIsNotNone( - sent_tokenize(sent_3, engine="thaisum",), + sent_tokenize( + sent_3, + engine="thaisum", + ), ) self.assertFalse( " " in sent_tokenize( - sent_1, engine="whitespace", keep_whitespace=False, + sent_1, + engine="whitespace", + keep_whitespace=False, ) ) with self.assertRaises(ValueError): @@ -322,9 +352,12 @@ def test_subword_tokenize(self): " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False) ) self.assertEqual( - subword_tokenize("สวัสดีชาวโลก", engine="dict"), ["สวัส", "ดี", "ชาว", "โลก"] + subword_tokenize("สวัสดีชาวโลก", engine="dict"), + ["สวัส", "ดี", "ชาว", "โลก"], + ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีชาวโลก", engine="dict") ) - self.assertFalse("า" in subword_tokenize("สวัสดีชาวโลก", engine="dict")) self.assertEqual(subword_tokenize(None, engine="ssg"), []) self.assertEqual( subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"] @@ -346,9 +379,7 @@ def test_subword_tokenize(self): self.assertFalse( "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tltk") ) - self.assertIsInstance( - subword_tokenize("โควิด19", engine="tltk"), list - ) + self.assertIsInstance(subword_tokenize("โควิด19", engine="tltk"), list) with self.assertRaises(ValueError): subword_tokenize("นกแก้ว", engine="XX") # engine does not exist @@ -436,20 +467,18 @@ def 
test_tltk(self): self.assertEqual(tltk.segment(None), []) self.assertEqual(tltk.segment(""), []) self.assertEqual( - tltk.syllable_tokenize( - "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย" - ), + tltk.syllable_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), [ - 'ฉัน', - 'รัก', - 'ภา', - 'ษา', - 'ไทย', - 'เพราะ', - 'ฉัน', - 'เป็น', - 'คน', - 'ไทย' + "ฉัน", + "รัก", + "ภา", + "ษา", + "ไทย", + "เพราะ", + "ฉัน", + "เป็น", + "คน", + "ไทย", ], ) self.assertEqual(tltk.syllable_tokenize(None), []) @@ -471,7 +500,8 @@ def test_longest(self): ["ปวด", "เฉียบพลัน"], ) self.assertEqual( - longest_tokenizer.word_tokenize("เฉียบพลัน"), ["เฉียบพลัน"], + longest_tokenizer.word_tokenize("เฉียบพลัน"), + ["เฉียบพลัน"], ) def test_mm(self): @@ -486,15 +516,15 @@ def test_mm(self): ) self.assertEqual( word_tokenize("19...", engine="mm"), - ['19', '...'], + ["19", "..."], ) self.assertEqual( word_tokenize("19.", engine="mm"), - ['19', '.'], + ["19", "."], ) self.assertEqual( word_tokenize("19.84", engine="mm"), - ['19.84'], + ["19.84"], ) self.assertEqual( word_tokenize("127.0.0.1", engine="mm"), @@ -502,7 +532,7 @@ def test_mm(self): ) self.assertEqual( word_tokenize("USD1,984.42", engine="mm"), - ['USD', '1,984.42'], + ["USD", "1,984.42"], ) self.assertIsNotNone(multi_cut.mmcut("ทดสอบ")) @@ -521,15 +551,15 @@ def test_newmm(self): ) self.assertEqual( word_tokenize("19...", engine="newmm"), - ['19', '...'], + ["19", "..."], ) self.assertEqual( word_tokenize("19.", engine="newmm"), - ['19', '.'], + ["19", "."], ) self.assertEqual( word_tokenize("19.84", engine="newmm"), - ['19.84'], + ["19.84"], ) self.assertEqual( word_tokenize("127.0.0.1", engine="newmm"), @@ -537,7 +567,7 @@ def test_newmm(self): ) self.assertEqual( word_tokenize("USD1,984.42", engine="newmm"), - ['USD', '1,984.42'], + ["USD", "1,984.42"], ) self.assertEqual( word_tokenize( @@ -561,7 +591,11 @@ def test_newmm(self): ["จุ๋ม", "ง่วง"], ) self.assertFalse( - " " in word_tokenize("จุ๋มง่วง", keep_whitespace=False,) + " " + in word_tokenize( + "จุ๋มง่วง", + keep_whitespace=False, + ) ) def test_newmm_longtext(self): @@ -596,13 +630,12 @@ def test_nercut(self): self.assertEqual(nercut.segment(None), []) self.assertEqual(nercut.segment(""), []) self.assertIsNotNone(nercut.segment("ทดสอบ")) - self.assertEqual(nercut.segment("ทันแน่ๆ"), ['ทัน', 'แน่ๆ']) - self.assertEqual(nercut.segment("%1ครั้ง"), ['%', '1', 'ครั้ง']) - self.assertEqual(nercut.segment("ทุ๊กกโคนน"), ['ทุ๊กกโคนน']) - self.assertEqual(nercut.segment("อือหือ"), ['อือหือ']) + self.assertEqual(nercut.segment("ทันแน่ๆ"), ["ทัน", "แน่ๆ"]) + self.assertEqual(nercut.segment("%1ครั้ง"), ["%", "1", "ครั้ง"]) + self.assertEqual(nercut.segment("ทุ๊กกโคนน"), ["ทุ๊กกโคนน"]) + self.assertEqual(nercut.segment("อือหือ"), ["อือหือ"]) self.assertEqual( - nercut.segment("อย่าลืมอัพการ์ดนะจ๊ะ"), - ['อย่าลืมอัพการ์ดนะจ๊ะ'] + nercut.segment("อย่าลืมอัพการ์ดนะจ๊ะ"), ["อย่าลืมอัพการ์ดนะจ๊ะ"] ) self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut")) @@ -644,29 +677,74 @@ def test_oskut(self): def test_word_detokenize(self): self.assertEqual( - word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), - "ผมเลี้ยง 5 ตัว" + word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), "ผมเลี้ยง 5 ตัว" ) - self.assertEqual(word_detokenize( - ["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"), - [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]] + self.assertEqual( + word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"), + [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]], ) self.assertEqual( word_detokenize( ["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"] ), - 
"ผมเลี้ยง 5 10 ตัว ๆ คนดี" + "ผมเลี้ยง 5 10 ตัว ๆ คนดี", ) self.assertEqual( word_detokenize( ["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"] ), - "ผมเลี้ยง 5 ตัว ๆ คนดี" + "ผมเลี้ยง 5 ตัว ๆ คนดี", ) self.assertTrue( isinstance(word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), str) ) self.assertEqual( word_detokenize(["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"]), - "ม่ายย ผมเลี้ยง 5 ตัว" + "ม่ายย ผมเลี้ยง 5 ตัว", + ) + + def test_numeric_data_format(self): + engines = ["attacut", "deepcut", "newmm", "sefr_cut"] + + for engine in engines: + self.assertIn( + "127.0.0.1", + word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine), + ) + + tokens = word_tokenize( + "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine + ) + self.assertTrue( + any([value in tokens for value in ["12:12pm", "12:12"]]), + msg=f"{engine}: {tokens}", + ) + self.assertIn("11.11", tokens) + + self.assertIn( + "1,234,567.89", + word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine), + ) + + tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine) + self.assertIn("2.5:1", tokens) + self.assertIn("5:2", tokens) + + # try turning off `join_broken_num` + engine = "attacut" + self.assertNotIn( + "127.0.0.1", + word_tokenize( + "ไอพีของคุณคือ 127.0.0.1 ครับ", + engine=engine, + join_broken_num=False, + ), + ) + self.assertNotIn( + "1,234,567.89", + word_tokenize( + "รางวัลมูลค่า 1,234,567.89 บาท", + engine=engine, + join_broken_num=False, + ), )