Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/api/tokenize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,14 @@ tcc
.. autofunction:: pythainlp.tokenize.tcc.tcc
.. autofunction:: pythainlp.tokenize.tcc.tcc_pos

tcc+
++++
.. automodule:: pythainlp.tokenize.tcc_p

.. autofunction:: pythainlp.tokenize.tcc_p.segment
.. autofunction:: pythainlp.tokenize.tcc_p.tcc
.. autofunction:: pythainlp.tokenize.tcc_p.tcc_pos

etcc
++++
.. automodule:: pythainlp.tokenize.etcc
Expand Down
285 changes: 285 additions & 0 deletions notebooks/test_tcc.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from pythainlp.tokenize import subword_tokenize"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**พิสูจน์ได้ค่ะ (TCC paper)**\n",
"\n",
"should be พิ/สูจน์/ได้/ค่ะ"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['พิ', 'สูจน์', 'ได้', 'ค่ะ']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"พิสูจน์ได้ค่ะ\",engine=\"tcc\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['พิ', 'สูจน์', 'ได้', 'ค่ะ']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"พิสูจน์ได้ค่ะ\",engine=\"tcc_p\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**เรือน้อยลอยอยู่ (ETCC paper)**\n",
"\n",
"should be เรื/อ/น้/อ/ย/ล/อ/ย/อ/ยู่"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['เรื', 'อ', 'น้', 'อ', 'ย', 'ล', 'อ', 'ย', 'อ', 'ยู่']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"เรือน้อยลอยอยู่\",engine=\"tcc\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['เรือ', 'น้', 'อ', 'ย', 'ล', 'อ', 'ย', 'อ', 'ยู่']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"เรือน้อยลอยอยู่\",engine=\"tcc_p\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**ประสานงานกับลูกค้า (ETCC paper)**\n",
"\n",
"should be ป/ระ/สา/น/งา/น/กั/บ/ลู/ก/ค้า"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ป', 'ระ', 'สา', 'น', 'งา', 'น', 'กั', 'บ', 'ลู', 'ก', 'ค้า']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"ประสานงานกับลูกค้า\",engine=\"tcc\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ป', 'ระ', 'สา', 'น', 'งา', 'น', 'กับ', 'ลู', 'ก', 'ค้า']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"ประสานงานกับลูกค้า\",engine=\"tcc_p\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**ประกันภัยสัมพันธ์ (ETCC paper)**\n",
"\n",
"should be ป/ระ/กั/น/ภั/ย/สั/ม/พั/น/ธ์"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ป', 'ระ', 'กั', 'น', 'ภั', 'ย', 'สั', 'ม', 'พั', 'นธ์']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"ประกันภัยสัมพันธ์\",engine=\"tcc\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ป', 'ระ', 'กัน', 'ภัย', 'สัม', 'พันธ์']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"ประกันภัยสัมพันธ์\",engine=\"tcc_p\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**ตากลม (ETCC paper)**\n",
"\n",
"should be ตา/ก/ล/ม"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ตา', 'ก', 'ล', 'ม']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"ตากลม\",engine=\"tcc\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ตา', 'ก', 'ล', 'ม']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"ตากลม\",engine=\"tcc_p\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.12 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "48b90c76b600d2ec6cf3e350b23a5df9176e3eef7b22ad90377f14c1de9c1bf6"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
4 changes: 4 additions & 0 deletions pythainlp/tokenize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def word_tokenize(
* *newmm* (default) - "new multi-cut",
dictionary-based, maximum matching,
constrained with Thai Character Cluster (TCC) boundaries
with improved TCC rules, as used in newmm.
* *newmm-safe* - newmm, with a mechanism to avoid long
processing time for text with continuous ambiguous breaking points
* *nlpo3* - wrapper for a word tokenizer in
Expand Down Expand Up @@ -440,6 +441,7 @@ def subword_tokenize(
* *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
* *ssg* - CRF syllable segmenter for Thai
* *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
* *tcc_p* - Thai Character Cluster with improved rules, as used in newmm
* *tltk* - syllable tokenizer from tltk
* *wangchanberta* - SentencePiece from wangchanberta model
:Example:
Expand Down Expand Up @@ -489,6 +491,8 @@ def subword_tokenize(

if engine == "tcc":
from pythainlp.tokenize.tcc import segment
elif engine == "tcc_p":
from pythainlp.tokenize.tcc_p import segment
elif engine == "etcc":
from pythainlp.tokenize.etcc import segment
elif engine == "wangchanberta":
Expand Down
4 changes: 2 additions & 2 deletions pythainlp/tokenize/newmm.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""
Dictionary-based maximal matching word segmentation, constrained with
Thai Character Cluster (TCC) boundaries.
Thai Character Cluster (TCC) boundaries, using improved TCC rules.

The code is based on the notebooks created by Korakot Chaovavanich,
with heuristic graph size limit added to avoid exponential wait time.
Expand All @@ -20,7 +20,7 @@
from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
from pythainlp.util import Trie

from pythainlp.tokenize.tcc import tcc_pos
from pythainlp.tokenize.tcc_p import tcc_pos

# match non-Thai tokens
_PAT_NONTHAI = re.compile(
Expand Down
Loading