diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
index 41952d748..5fe02fdc2 100644
--- a/docs/api/tokenize.rst
+++ b/docs/api/tokenize.rst
@@ -44,6 +44,8 @@ Modules
 The `Tokenizer` class is a versatile tool for customizing tokenization
 processes and managing tokenization models. It provides various methods and
 attributes to fine-tune tokenization according to your specific needs.
 
+.. autofunction:: display_cell_tokenize
+
 Tokenization Engines
 --------------------
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 40a83d369..e9727a351 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -16,6 +16,7 @@
     "syllable_tokenize",
     "word_detokenize",
     "word_tokenize",
+    "display_cell_tokenize",
 ]
 
 from pythainlp.corpus import thai_syllables, thai_words
@@ -38,6 +39,7 @@
     syllable_tokenize,
     word_detokenize,
     word_tokenize,
+    display_cell_tokenize,
 )
 
 from pythainlp.corpus import get_corpus as _get_corpus
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 1f8304c42..c3e73649d 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -733,6 +733,51 @@ def syllable_tokenize(
     )
 
 
+def display_cell_tokenize(text: str) -> List[str]:
+    """
+    Display cell tokenizer.
+
+    Tokenizes Thai text into display cells, keeping tone marks and
+    above/below vowel characters attached to their base characters.
+
+    :param str text: text to be tokenized
+    :return: list of display cells
+    :rtype: List[str]
+    :Example:
+
+    Tokenize Thai text into display cells::
+
+        from pythainlp.tokenize import display_cell_tokenize
+
+        text = "แม่น้ำอยู่ที่ไหน"
+        display_cell_tokenize(text)
+        # output: ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    display_cells = []
+    current_cell = ""
+    # Decompose SARA AM (ำ) into NIKHAHIT (ํ) + SARA AA (า) so the
+    # nikhahit stays in the same display cell as its base character.
+    text = text.replace("ำ", "ํา")
+
+    for char in text:
+        # Above/below vowels, MAI HAN-AKAT, and tone marks are
+        # combining characters and belong to the current cell.
+        if re.match(r"[\u0E31\u0E34-\u0E3A\u0E47-\u0E4E]", char):
+            current_cell += char
+        else:
+            if current_cell:
+                display_cells.append(current_cell)
+            current_cell = char
+
+    if current_cell:
+        display_cells.append(current_cell)
+
+    return display_cells
+
+
 class Tokenizer:
     """
     Tokenizer class for a custom tokenizer.
diff --git a/tests/core/test_tokenize.py b/tests/core/test_tokenize.py
index 054a2ab7b..f752a1b94 100644
--- a/tests/core/test_tokenize.py
+++ b/tests/core/test_tokenize.py
@@ -19,6 +19,7 @@
     tcc_p,
     word_detokenize,
     word_tokenize,
+    display_cell_tokenize,
 )
 
 from pythainlp.util import dict_trie
@@ -604,3 +605,16 @@ def test_tcc_p(self):
         # )
         self.assertEqual(list(tcc_p.tcc("")), [])
         self.assertEqual(tcc_p.tcc_pos(""), set())
+
+    def test_display_cell_tokenize(self):
+        self.assertEqual(display_cell_tokenize(""), [])
+        self.assertEqual(
+            display_cell_tokenize("แม่น้ำอยู่ที่ไหน"),
+            ["แ", "ม่", "น้ํ", "า", "อ", "ยู่", "ที่", "ไ", "ห", "น"],
+        )
+        self.assertEqual(display_cell_tokenize("สวัสดี"), ["ส", "วั", "ส", "ดี"])
+        self.assertEqual(display_cell_tokenize("ทดสอบ"), ["ท", "ด", "ส", "อ", "บ"])
+        self.assertEqual(
+            display_cell_tokenize("ภาษาไทย"),
+            ["ภ", "า", "ษ", "า", "ไ", "ท", "ย"],
+        )
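
For reference, a minimal usage sketch of the new function, assuming the patch
above has been applied; the expected outputs follow the docstring and tests in
the diff::

    from pythainlp.tokenize import display_cell_tokenize

    # "น้ำ" (water) is three code points: consonant, tone mark, sara am.
    text = "น้ำ"
    print(list(text))                   # ['น', '้', 'ำ'] -- raw code points
    print(display_cell_tokenize(text))  # ['น้ํ', 'า'] -- two display cells

    # Combining marks stay attached to their base characters.
    print(display_cell_tokenize("แม่น้ำอยู่ที่ไหน"))
    # ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']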