Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/api/tokenize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,14 @@ tcc
.. autofunction:: pythainlp.tokenize.tcc.tcc
.. autofunction:: pythainlp.tokenize.tcc.tcc_pos

tcc+
++++
.. automodule:: pythainlp.tokenize.tcc_p

.. autofunction:: pythainlp.tokenize.tcc_p.segment
.. autofunction:: pythainlp.tokenize.tcc_p.tcc
.. autofunction:: pythainlp.tokenize.tcc_p.tcc_pos

etcc
++++
.. automodule:: pythainlp.tokenize.etcc
Expand Down
285 changes: 285 additions & 0 deletions notebooks/test_tcc.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from pythainlp.tokenize import subword_tokenize"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**พิสูจน์ได้ค่ะ (TCC paper)**\n",
"\n",
"should be พิ/สูจน์/ได้/ค่ะ"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['พิ', 'สูจน์', 'ได้', 'ค่ะ']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"พิสูจน์ได้ค่ะ\",engine=\"tcc\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['พิ', 'สูจน์', 'ได้', 'ค่ะ']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"พิสูจน์ได้ค่ะ\",engine=\"tcc_p\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**เรือน้อยลอยอยู่ (ETCC paper)**\n",
"\n",
"should be เรื/อ/น้/อ/ย/ล/อ/ย/อ/ยู่"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['เรื', 'อ', 'น้', 'อ', 'ย', 'ล', 'อ', 'ย', 'อ', 'ยู่']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"เรือน้อยลอยอยู่\",engine=\"tcc\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['เรือ', 'น้', 'อ', 'ย', 'ล', 'อ', 'ย', 'อ', 'ยู่']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"เรือน้อยลอยอยู่\",engine=\"tcc_p\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**ประสานงานกับลูกค้า (ETCC paper)**\n",
"\n",
"should be ป/ระ/สา/น/งา/น/กั/บ/ลู/ก/ค้า"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ป', 'ระ', 'สา', 'น', 'งา', 'น', 'กั', 'บ', 'ลู', 'ก', 'ค้า']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"ประสานงานกับลูกค้า\",engine=\"tcc\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ป', 'ระ', 'สา', 'น', 'งา', 'น', 'กับ', 'ลู', 'ก', 'ค้า']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"ประสานงานกับลูกค้า\",engine=\"tcc_p\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**ประกันภัยสัมพันธ์ (ETCC paper)**\n",
"\n",
"should be ป/ระ/กั/น/ภั/ย/สั/ม/พั/น/ธ์"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ป', 'ระ', 'กั', 'น', 'ภั', 'ย', 'สั', 'ม', 'พั', 'นธ์']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"ประกันภัยสัมพันธ์\",engine=\"tcc\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ป', 'ระ', 'กัน', 'ภัย', 'สัม', 'พันธ์']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"ประกันภัยสัมพันธ์\",engine=\"tcc_p\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**ตากลม (ETCC paper)**\n",
"\n",
"should be ตา/ก/ล/ม"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ตา', 'ก', 'ล', 'ม']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"ตากลม\",engine=\"tcc\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ตา', 'ก', 'ล', 'ม']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subword_tokenize(\"ตากลม\",engine=\"tcc_p\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.12 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "48b90c76b600d2ec6cf3e350b23a5df9176e3eef7b22ad90377f14c1de9c1bf6"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
4 changes: 4 additions & 0 deletions pythainlp/tokenize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def word_tokenize(
* *newmm* (default) - "new multi-cut",
dictionary-based, maximum matching,
constrained with Thai Character Cluster (TCC) boundaries
with improved TCC rules, as used in newmm.
* *newmm-safe* - newmm, with a mechanism to avoid long
processing time for text with continuous ambiguous breaking points
* *nlpo3* - wrapper for a word tokenizer in
Expand Down Expand Up @@ -440,6 +441,7 @@ def subword_tokenize(
* *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
* *ssg* - CRF syllable segmenter for Thai
* *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
* *tcc_p* - Thai Character Cluster with improved rules, as used in newmm
* *tltk* - syllable tokenizer from tltk
* *wangchanberta* - SentencePiece from wangchanberta model
:Example:
Expand Down Expand Up @@ -489,6 +491,8 @@ def subword_tokenize(

if engine == "tcc":
from pythainlp.tokenize.tcc import segment
elif engine == "tcc_p":
from pythainlp.tokenize.tcc_p import segment
elif engine == "etcc":
from pythainlp.tokenize.etcc import segment
elif engine == "wangchanberta":
Expand Down
4 changes: 2 additions & 2 deletions pythainlp/tokenize/newmm.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""
Dictionary-based maximal matching word segmentation, constrained with
Thai Character Cluster (TCC) boundaries.
Thai Character Cluster (TCC) boundaries, using improved TCC rules.

The code is based on the notebooks created by Korakot Chaovavanich,
with heuristic graph size limit added to avoid exponential wait time.
Expand All @@ -20,7 +20,7 @@
from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
from pythainlp.util import Trie

from pythainlp.tokenize.tcc import tcc_pos
from pythainlp.tokenize.tcc_p import tcc_pos

# match non-Thai tokens
_PAT_NONTHAI = re.compile(
Expand Down
Loading