Binary file added pythainlp/corpus/han_solo.crfsuite
Binary file not shown.
11 changes: 9 additions & 2 deletions pythainlp/tokenize/core.py
@@ -530,10 +530,15 @@ def subword_tokenize(
**Options for engine**
* *dict* - newmm word tokenizer with a syllable dictionary
* *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
* *ssg* - CRF syllable segmenter for Thai
* *han_solo* - CRF syllable segmenter for Thai that can work in the \
Thai social media domain. See `PyThaiNLP/Han-solo \
<https://github.com/PyThaiNLP/Han-solo>`_.
* *ssg* - CRF syllable segmenter for Thai. See `ponrawee/ssg \
<https://github.com/ponrawee/ssg>`_.
* *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
* *tcc_p* - Thai Character Cluster + improved rules as used in newmm
* *tltk* - syllable tokenizer from tltk
* *tltk* - syllable tokenizer from tltk. See `tltk \
<https://pypi.org/project/tltk/>`_.
* *wangchanberta* - SentencePiece from wangchanberta model
:Example:

@@ -600,6 +605,8 @@ def subword_tokenize(
from pythainlp.tokenize.ssg import segment
elif engine == "tltk":
from pythainlp.tokenize.tltk import syllable_tokenize as segment
elif engine == "han_solo":
from pythainlp.tokenize.han_solo import segment
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
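For context, a minimal usage sketch of the new engine option (not part of the diff itself; the expected output mirrors the assertions added in tests/test_tokenize.py below):

from pythainlp.tokenize import subword_tokenize

# Pick the new Han-solo CRF syllable segmenter via the engine argument.
print(subword_tokenize("แมวกินปลา", engine="han_solo"))
# Expected per the new tests: ['แมว', 'กิน', 'ปลา']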
144 changes: 144 additions & 0 deletions pythainlp/tokenize/han_solo.py
@@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
🪿 Han-solo: Thai syllable segmenter
GitHub: https://github.com/PyThaiNLP/Han-solo
"""
from typing import List
from pythainlp.corpus import path_pythainlp_corpus
try:
    import pycrfsuite
except ImportError:
    raise ImportError(
        "pycrfsuite is not installed. Install it with: pip install python-crfsuite"
    )

tagger = pycrfsuite.Tagger()
tagger.open(path_pythainlp_corpus('han_solo.crfsuite'))


class Featurizer:
    # This class is adapted from ssg at https://github.com/ponrawee/ssg.
# Copyright 2019 Ponrawee Prasertsom

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# {
# "0 (current anchor)|+1 (the character on the right from anchor)|A (character)" : 1
# }

    def __init__(self, N=2, sequence_size=1, delimiter=None):
        self.N = N
        self.delimiter = delimiter
        self.radius = N + sequence_size

def pad(self, sentence, padder='#'):
return padder * (self.radius) + sentence + padder * (self.radius)

def featurize(self, sentence, padding=True, indiv_char=True, return_type='list'):
if padding:
sentence = self.pad(sentence)
all_features = []
all_labels = []
skip_next = False
for current_position in range(self.radius, len(sentence) - self.radius + 1):
if skip_next:
skip_next = False
continue
features = {}
if return_type == 'list':
features = []
cut = 0
char = sentence[current_position]
if char == self.delimiter:
cut = 1
skip_next = True
counter = 0
chars_left = ''
chars_right = ''
chars = ''
            abs_index_left = current_position  # left scan starts at -1
            abs_index_right = current_position - 1  # right scan starts at 0
while counter < self.radius:
                abs_index_left -= 1  # e.g. from position 0 this yields -1, -2, -3, -4, -5 (radius = 5)
char_left = sentence[abs_index_left]
while char_left == self.delimiter:
abs_index_left -= 1
char_left = sentence[abs_index_left]
relative_index_left = -counter - 1
                # collect the character
                chars_left = char_left + chars_left
                # add it to the feature set
if indiv_char:
left_key = '|'.join([str(relative_index_left), char_left])
if return_type == 'dict':
features[left_key] = 1
else:
features.append(left_key)

                abs_index_right += 1  # e.g. from position 0 this yields 0, 1, 2, 3, 4 (radius = 5)
char_right = sentence[abs_index_right]
while char_right == self.delimiter:
abs_index_right += 1
char_right = sentence[abs_index_right]
relative_index_right = counter
chars_right += char_right
if indiv_char:
right_key = '|'.join([str(relative_index_right), char_right])
if return_type == 'dict':
features[right_key] = 1
else:
features.append(right_key)

counter += 1

chars = chars_left + chars_right
for i in range(0, len(chars) - self.N + 1):
ngram = chars[i:i + self.N]
ngram_key = '|'.join([str(i - self.radius), ngram])
if return_type == 'dict':
features[ngram_key] = 1
else:
features.append(ngram_key)
all_features.append(features)
            if return_type == 'list':
cut = str(cut)
all_labels.append(cut)

return {
'X': all_features,
'Y': all_labels
}
_to_feature = Featurizer()


def segment(text: str) -> List[str]:
    """Segment Thai text into syllables with the Han-solo CRF model."""
    x = _to_feature.featurize(text)["X"]
    y_pred = tagger.tag(x)
    list_cut = []
    for char, label in zip(text, y_pred):
        if label == "1":  # "1" marks the start of a new syllable
            list_cut.append(char)
        else:
            list_cut[-1] += char
    return list_cut
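For readers wondering how a model such as han_solo.crfsuite can be produced with the Featurizer class defined above, here is a hedged training sketch using python-crfsuite; the delimiter character, corpus file name, and hyperparameters are illustrative assumptions, not the actual recipe used for the bundled model:

import pycrfsuite

# Assumption: each training line marks syllable boundaries with '~', e.g. "แมว~กิน~ปลา".
featurizer = Featurizer(delimiter='~')

trainer = pycrfsuite.Trainer(verbose=False)
with open('syllable_corpus.txt', encoding='utf-8') as corpus:  # hypothetical corpus file
    for line in corpus:
        data = featurizer.featurize(line.strip())
        trainer.append(data['X'], data['Y'])  # per-character features and 0/1 boundary labels

trainer.set_params({'c1': 1.0, 'c2': 1e-3, 'max_iterations': 100})  # illustrative values
trainer.train('han_solo.crfsuite')  # writes out the CRF model file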
10 changes: 10 additions & 0 deletions tests/test_tokenize.py
@@ -399,6 +399,7 @@ def test_subword_tokenize(self):
"า" in subword_tokenize("สวัสดีชาวโลก", engine="dict")
)
self.assertEqual(subword_tokenize(None, engine="ssg"), [])
self.assertEqual(subword_tokenize(None, engine="han_solo"), [])
self.assertEqual(
subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"]
)
@@ -408,6 +409,15 @@ def test_subword_tokenize(self):
self.assertFalse(
"า" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
)
self.assertEqual(
subword_tokenize("แมวกินปลา", engine="han_solo"), ["แมว", "กิน", "ปลา"]
)
self.assertTrue(
"ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo")
)
self.assertFalse(
"า" in subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo")
)
self.assertFalse(
" " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False)
)