diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst index 098be97b1..ced072da4 100644 --- a/docs/api/tokenize.rst +++ b/docs/api/tokenize.rst @@ -110,6 +110,14 @@ tcc .. autofunction:: pythainlp.tokenize.tcc.tcc .. autofunction:: pythainlp.tokenize.tcc.tcc_pos +tcc+ ++++ +.. automodule:: pythainlp.tokenize.tcc_p + +.. autofunction:: pythainlp.tokenize.tcc_p.segment +.. autofunction:: pythainlp.tokenize.tcc_p.tcc +.. autofunction:: pythainlp.tokenize.tcc_p.tcc_pos + etcc ++++ .. automodule:: pythainlp.tokenize.etcc diff --git a/notebooks/test_tcc.ipynb b/notebooks/test_tcc.ipynb new file mode 100644 index 000000000..ae934979b --- /dev/null +++ b/notebooks/test_tcc.ipynb @@ -0,0 +1,285 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pythainlp.tokenize import subword_tokenize" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**พิสูจน์ได้ค่ะ (TCC paper)**\n", + "\n", + "should be พิ/สูจน์/ได้/ค่ะ" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['พิ', 'สูจน์', 'ได้', 'ค่ะ']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subword_tokenize(\"พิสูจน์ได้ค่ะ\",engine=\"tcc\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['พิ', 'สูจน์', 'ได้', 'ค่ะ']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subword_tokenize(\"พิสูจน์ได้ค่ะ\",engine=\"tcc_p\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**เรือน้อยลอยอยู่ (ETCC paper)**\n", + "\n", + "should be เรื/อ/น้/อ/ย/ล/อ/ย/อ/ยู่" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['เรื', 'อ', 'น้', 'อ', 'ย', 'ล', 'อ', 
'ย', 'อ', 'ยู่']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subword_tokenize(\"เรือน้อยลอยอยู่\",engine=\"tcc\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['เรือ', 'น้', 'อ', 'ย', 'ล', 'อ', 'ย', 'อ', 'ยู่']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subword_tokenize(\"เรือน้อยลอยอยู่\",engine=\"tcc_p\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**ประสานงานกับลูกค้า (ETCC paper)**\n", + "\n", + "should be ป/ระ/สา/น/งา/น/กั/บ/ลู/ก/ค้า" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ป', 'ระ', 'สา', 'น', 'งา', 'น', 'กั', 'บ', 'ลู', 'ก', 'ค้า']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subword_tokenize(\"ประสานงานกับลูกค้า\",engine=\"tcc\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ป', 'ระ', 'สา', 'น', 'งา', 'น', 'กับ', 'ลู', 'ก', 'ค้า']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subword_tokenize(\"ประสานงานกับลูกค้า\",engine=\"tcc_p\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**ประกันภัยสัมพันธ์ (ETCC paper)**\n", + "\n", + "should be ป/ระ/กั/น/ภั/ย/สั/ม/พั/น/ธ์" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ป', 'ระ', 'กั', 'น', 'ภั', 'ย', 'สั', 'ม', 'พั', 'นธ์']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subword_tokenize(\"ประกันภัยสัมพันธ์\",engine=\"tcc\")" + ] + }, + { + "cell_type": "code", + "execution_count": 
9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ป', 'ระ', 'กัน', 'ภัย', 'สัม', 'พันธ์']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subword_tokenize(\"ประกันภัยสัมพันธ์\",engine=\"tcc_p\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**ตากลม (ETCC paper)**\n", + "\n", + "should be ตา/ก/ล/ม" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ตา', 'ก', 'ล', 'ม']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subword_tokenize(\"ตากลม\",engine=\"tcc\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ตา', 'ก', 'ล', 'ม']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subword_tokenize(\"ตากลม\",engine=\"tcc_p\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.12 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "48b90c76b600d2ec6cf3e350b23a5df9176e3eef7b22ad90377f14c1de9c1bf6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index e27a6a601..6a7647f14 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -145,6 +145,7 @@ def word_tokenize( * *newmm* (default) - "new multi-cut", dictionary-based, maximum matching, constrained with Thai Character Cluster (TCC) boundaries + with improve the TCC rule that 
used in newmm. * *newmm-safe* - newmm, with a mechanism to avoid long processing time for text with continuous ambiguous breaking points * *nlpo3* - wrapper for a word tokenizer in @@ -440,6 +441,7 @@ def subword_tokenize( * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001) * *ssg* - CRF syllable segmenter for Thai * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000) + * *tcc_p* - Thai Character Cluster with the improved rules used in newmm * *tltk* - syllable tokenizer from tltk * *wangchanberta* - SentencePiece from wangchanberta model :Example: @@ -489,6 +491,8 @@ def subword_tokenize( if engine == "tcc": from pythainlp.tokenize.tcc import segment + elif engine == "tcc_p": + from pythainlp.tokenize.tcc_p import segment elif engine == "etcc": from pythainlp.tokenize.etcc import segment elif engine == "wangchanberta": diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 720ff4dad..0f7db70be 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ Dictionary-based maximal matching word segmentation, constrained with -Thai Character Cluster (TCC) boundaries. +Thai Character Cluster (TCC) boundaries, using improved TCC rules. The code is based on the notebooks created by Korakot Chaovavanich, with heuristic graph size limit added to avoid exponential wait time. @@ -20,7 +20,7 @@ from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE from pythainlp.util import Trie -from pythainlp.tokenize.tcc import tcc_pos +from pythainlp.tokenize.tcc_p import tcc_pos # match non-Thai tokens _PAT_NONTHAI = re.compile( diff --git a/pythainlp/tokenize/tcc.py b/pythainlp/tokenize/tcc.py index 2c5a1b199..43719136c 100644 --- a/pythainlp/tokenize/tcc.py +++ b/pythainlp/tokenize/tcc.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- """ The implementation of tokenizer accorinding to Thai Character Clusters (TCCs) rules purposed by `Theeramunkong et al. 2000.
# -*- coding: utf-8 -*-
"""
Thai Character Cluster (TCC) tokenizer with the improved rules used by newmm.

The rules follow the Thai Character Cluster definition proposed by
Theeramunkong et al. (2000).  On top of them, most rules here accept an
optional tail (placeholder ``k`` below): one or two consonants, optionally
followed by sara u / sara uu / sara i, and closed by thanthakhat.

Credits:
    * TCC: Jakkrit TeCho
    * Grammar: Wittawat Jitkrittum
    * Python code: Korakot Chaovavanich
"""
import re
from itertools import accumulate
from typing import Iterator, List, Set

# One rule per whitespace-separated token.  Single-letter placeholders are
# expanded by the ``replace`` chain below, in this exact order:
#   k -> optional tail "(cc?[dิ]?[์])?" (must go first: it contains c and d)
#   c -> any Thai consonant
#   t -> optional tone mark
#   d -> lower vowels (sara u, sara uu)
# The rules are joined with "|", so earlier rules take precedence.
_RE_TCC = (
    """\
เc็ck
เcctาะk
เccีtยะk
เccีtย(?=[เ-ไก-ฮ]|$)k
เcc็ck
เcิc์ck
เcิtck
เcีtยะ?k
เcืtอะ?k
เc[ิีุู]tย(?=[เ-ไก-ฮ]|$)k
เctา?ะ?k
cัtวะk
c[ัื]tc[ุิะ]?k
c[ิุู]์
c[ะ-ู]tk
cรรc์
c็
ct[ะาำ]?k
ck
แc็c
แcc์
แctะ
แcc็c
แccc์
โctะ
[เ-ไ]ct
ก็
อึ
หึ
""".replace("k", "(cc?[dิ]?[์])?")
    .replace("c", "[ก-ฮ]")
    .replace("t", "[่-๋]?")
    # The vowels are typed on a dummy base consonant for readability; the
    # inner replace strips that base and leaves only the two vowel signs.
    .replace("d", "อูอุ".replace("อ", ""))  # DSara: lower vowel
    .split()
)

_PAT_TCC = re.compile("|".join(_RE_TCC))


def tcc(text: str) -> Iterator[str]:
    """
    TCC generator, generates Thai Character Clusters

    :param str text: text to be tokenized to character clusters
    :return: subwords (character clusters), yielded in text order
    :rtype: Iterator[str]
    """
    if not text or not isinstance(text, str):
        # Empty or non-string input yields nothing.
        return

    pos = 0
    end = len(text)
    while pos < end:
        # Match in place at ``pos`` instead of slicing ``text[pos:]``: the
        # rules use no "^" or look-behind, so the result is the same, but
        # the remaining text is not copied for every cluster.
        matched = _PAT_TCC.match(text, pos)
        # A character that starts no rule (e.g. non-Thai text) forms a
        # cluster on its own.
        stop = matched.end() if matched else pos + 1
        yield text[pos:stop]
        pos = stop


def tcc_pos(text: str) -> Set[int]:
    """
    TCC positions

    :param str text: text to be tokenized to character clusters
    :return: set of the end positions (offsets into ``text``) of subwords
    :rtype: set[int]
    """
    if not text or not isinstance(text, str):
        return set()

    # Running sum of cluster lengths == end offset of each cluster.
    return set(accumulate(len(cluster) for cluster in tcc(text)))


def segment(text: str) -> List[str]:
    """
    Subword segmentation

    :param str text: text to be tokenized to character clusters
    :return: list of subwords (character clusters), tokenized from the text
    :rtype: list[str]
    """
    return list(tcc(text))
tcc.segment("ประเทศไทย"), ["ป", "ระ", "เท", "ศ", "ไท", "ย"] ) + self.assertEqual( + tcc.segment("พิสูจน์ได้ค่ะ"), ['พิ', 'สูจน์', 'ได้', 'ค่ะ'] + ) + self.assertEqual( + tcc.segment("หอมรดกไทย"), ['ห', 'อ', 'ม', 'ร', 'ด', 'ก', 'ไท', 'ย'] + ) + self.assertEqual( + tcc.segment("เรือน้อยลอยอยู่"), ['เรื', 'อ', 'น้', 'อ', 'ย', 'ล', 'อ', 'ย', 'อ', 'ยู่'] + ) + # Not implementation + self.assertEqual( + tcc.segment("ประสานงานกับลูกค้า"), ['ป', 'ระ', 'สา', 'น', 'งา', 'น', 'กั', 'บ', 'ลู', 'ก', 'ค้า'] + ) + self.assertEqual( + tcc.segment("ประกันภัยสัมพันธ์"), ['ป', 'ระ', 'กั', 'น', 'ภั', 'ย', 'สั', 'ม', 'พั','นธ์'] # It don't look like TCC in ETCC paper + ) + self.assertEqual( + tcc.segment("ตากลม"), ['ตา', 'ก', 'ล', 'ม'] + ) + self.assertEqual( + tcc.segment("เครื่องมือสื่อสารมีหลายชนิด"), + [ + 'เค', + 'รื่อ', + 'ง', + 'มือ', + 'สื่อ', + 'สา', + 'ร', + 'มี', + 'ห', + 'ลา', + 'ย', + 'ช', + 'นิ', + 'ด' + ] + ) self.assertEqual(list(tcc.tcc("")), []) self.assertEqual(tcc.tcc_pos(""), set()) + def test_tcc_p(self): + self.assertEqual(tcc_p.segment(None), []) + self.assertEqual(tcc_p.segment(""), []) + self.assertEqual( + tcc_p.segment("ประเทศไทย"), ["ป", "ระ", "เท", "ศ", "ไท", "ย"] + ) + self.assertEqual( + tcc_p.segment("พิสูจน์ได้ค่ะ"), ['พิ', 'สูจน์', 'ได้', 'ค่ะ'] + ) + self.assertEqual( + tcc_p.segment("หอมรดกไทย"), ['ห', 'อ', 'ม', 'ร', 'ด', 'ก', 'ไท', 'ย'] + ) + self.assertEqual( + tcc_p.segment("เรือน้อยลอยอยู่"), ['เรือ', 'น้', 'อ', 'ย', 'ล', 'อ', 'ย', 'อ', 'ยู่'] + ) + # Not implementation + # self.assertEqual( + # tcc.segment("ประสานงานกับลูกค้า"), ['ป', 'ระ', 'สา', 'น', 'งา', 'น', 'กั', 'บ', 'ลู', 'ก', 'ค้า'] + # ) + # self.assertEqual( + # tcc.segment("ประกันภัยสัมพันธ์"), ['ป', 'ระ', 'กั', 'น', 'ภั', 'ย', 'สั', 'ม', 'พั','น','ธ์'] + # ) + # self.assertEqual( + # tcc.segment("ตากลม"), ['ตา', 'ก', 'ล', 'ม'] + # ) + self.assertEqual(list(tcc_p.tcc("")), []) + self.assertEqual(tcc_p.tcc_pos(""), set()) + def test_sefr_cut(self): 
self.assertEqual(sefr_cut.segment(None), []) self.assertEqual(sefr_cut.segment(""), [])