diff --git a/pythainlp/corpus/han_solo.crfsuite b/pythainlp/corpus/han_solo.crfsuite
new file mode 100644
index 000000000..98fe82ee7
Binary files /dev/null and b/pythainlp/corpus/han_solo.crfsuite differ
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index a476d22f4..d72462672 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -530,10 +530,15 @@ def subword_tokenize(
     **Options for engine**
         * *dict* - newmm word tokenizer with a syllable dictionary
         * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
-        * *ssg* - CRF syllable segmenter for Thai
+        * *han_solo* - CRF syllable segmenter for Thai that can also handle \
+          the Thai social media domain. See `PyThaiNLP/Han-solo \
+          <https://github.com/PyThaiNLP/Han-solo>`_.
+        * *ssg* - CRF syllable segmenter for Thai. See `ponrawee/ssg \
+          <https://github.com/ponrawee/ssg>`_.
         * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
         * *tcc_p* - Thai Character Cluster + improve the rule that used in newmm
-        * *tltk* - syllable tokenizer from tltk
+        * *tltk* - syllable tokenizer from tltk. See `tltk \
+          <https://pypi.org/project/tltk/>`_.
         * *wangchanberta* - SentencePiece from wangchanberta model

     :Example:
@@ -600,6 +605,8 @@ def subword_tokenize(
         from pythainlp.tokenize.ssg import segment
     elif engine == "tltk":
         from pythainlp.tokenize.tltk import syllable_tokenize as segment
+    elif engine == "han_solo":
+        from pythainlp.tokenize.han_solo import segment
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
diff --git a/pythainlp/tokenize/han_solo.py b/pythainlp/tokenize/han_solo.py
new file mode 100644
index 000000000..63053eca0
--- /dev/null
+++ b/pythainlp/tokenize/han_solo.py
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+🊿 Han-solo: Thai syllable segmenter
+GitHub: https://github.com/PyThaiNLP/Han-solo
+"""
+from typing import List
+from pythainlp.corpus import path_pythainlp_corpus
+try:
+    import pycrfsuite
+except ImportError:
+    raise ImportError(
+        "python-crfsuite is not installed. Install it with: pip install python-crfsuite"
+    )
+
+tagger = pycrfsuite.Tagger()
+tagger.open(path_pythainlp_corpus('han_solo.crfsuite'))
+
+
+class Featurizer:
+    # This class is from ssg at https://github.com/ponrawee/ssg.
+    # Copyright 2019 Ponrawee Prasertsom
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    #     http://www.apache.org/licenses/LICENSE-2.0
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    # Feature keys have the form:
+    # {
+    #     "0 (current anchor)|+1 (the character to the right of the anchor)|A (character)": 1
+    # }
+
+    def __init__(self, N=2, sequence_size=1, delimiter=None):
+        self.N = N
+        self.delimiter = delimiter
+        self.radius = N + sequence_size
+
+    def pad(self, sentence, padder='#'):
+        return padder * self.radius + sentence + padder * self.radius
+
+    def featurize(self, sentence, padding=True, indiv_char=True, return_type='list'):
+        if padding:
+            sentence = self.pad(sentence)
+        all_features = []
+        all_labels = []
+        skip_next = False
+        for current_position in range(self.radius, len(sentence) - self.radius + 1):
+            if skip_next:
+                skip_next = False
+                continue
+            features = {}
+            if return_type == 'list':
+                features = []
+            cut = 0
+            char = sentence[current_position]
+            if char == self.delimiter:
+                cut = 1
+                skip_next = True
+            counter = 0
+            chars_left = ''
+            chars_right = ''
+            chars = ''
+            abs_index_left = current_position  # left offsets start at -1
+            abs_index_right = current_position - 1  # right offsets start at 0
+            while counter < self.radius:
+                # From the anchor (position 0), left offsets run -1, -2, -3, -4, -5 (radius = 5)
+                abs_index_left -= 1
+                char_left = sentence[abs_index_left]
+                while char_left == self.delimiter:
+                    abs_index_left -= 1
+                    char_left = sentence[abs_index_left]
+                relative_index_left = -counter - 1
+                # Collect the character
+                chars_left = char_left + chars_left
+                # Add it as a feature
+                if indiv_char:
+                    left_key = '|'.join([str(relative_index_left), char_left])
+                    if return_type == 'dict':
+                        features[left_key] = 1
+                    else:
+                        features.append(left_key)
+
+                # From the anchor (position 0), right offsets run 0, 1, 2, 3, 4 (radius = 5)
+                abs_index_right += 1
+                char_right = sentence[abs_index_right]
+                while char_right == self.delimiter:
+                    abs_index_right += 1
+                    char_right = sentence[abs_index_right]
+                relative_index_right = counter
+                chars_right += char_right
+                if indiv_char:
+                    right_key = '|'.join([str(relative_index_right), char_right])
+                    if return_type == 'dict':
+                        features[right_key] = 1
+                    else:
+                        features.append(right_key)
+
+                counter += 1
+
+            chars = chars_left + chars_right
+            for i in range(0, len(chars) - self.N + 1):
+                ngram = chars[i:i + self.N]
+                ngram_key = '|'.join([str(i - self.radius), ngram])
+                if return_type == 'dict':
+                    features[ngram_key] = 1
+                else:
+                    features.append(ngram_key)
+            all_features.append(features)
+            if return_type == 'list':
+                cut = str(cut)
+            all_labels.append(cut)
+
+        return {
+            'X': all_features,
+            'Y': all_labels
+        }
+
+
+_to_feature = Featurizer()
+
+
+def segment(text: str) -> List[str]:
+    # Tag every character with the CRF: "1" means the character begins a new
+    # syllable, "0" means it continues the current one, then merge accordingly.
+    x = _to_feature.featurize(text)["X"]
+    y_pred = tagger.tag(x)
+    list_cut = []
+    for j, k in zip(list(text), y_pred):
+        if k == "1":
+            list_cut.append(j)
+        else:
+            list_cut[-1] += j
+    return list_cut
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 4659ff08c..80a0daa28 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -399,6 +399,7 @@ def test_subword_tokenize(self):
             "āļē" in subword_tokenize("āļŠāļ§āļąāļŠāļ”āļĩāļŠāļēāļ§āđ‚āļĨāļ", engine="dict")
         )
         self.assertEqual(subword_tokenize(None, engine="ssg"), [])
+        self.assertEqual(subword_tokenize(None, engine="han_solo"), [])
         self.assertEqual(
             subword_tokenize("āđāļĄāļ§āļāļīāļ™āļ›āļĨāļē", engine="ssg"), ["āđāļĄāļ§", "āļāļīāļ™", "āļ›āļĨāļē"]
         )
@@ -408,6 +409,15 @@ def test_subword_tokenize(self):
         self.assertFalse(
             "āļē" in subword_tokenize("āļŠāļ§āļąāļŠāļ”āļĩāļ”āļēāļ§āļ­āļąāļ‡āļ„āļēāļĢ", engine="ssg")
         )
+        self.assertEqual(
+            subword_tokenize("āđāļĄāļ§āļāļīāļ™āļ›āļĨāļē", engine="han_solo"), ["āđāļĄāļ§", "āļāļīāļ™", "āļ›āļĨāļē"]
+        )
+        self.assertTrue(
+            "āļ”āļēāļ§" in subword_tokenize("āļŠāļ§āļąāļŠāļ”āļĩāļ”āļēāļ§āļ­āļąāļ‡āļ„āļēāļĢ", engine="han_solo")
+        )
+        self.assertFalse(
+            "āļē" in subword_tokenize("āļŠāļ§āļąāļŠāļ”āļĩāļ”āļēāļ§āļ­āļąāļ‡āļ„āļēāļĢ", engine="han_solo")
+        )
         self.assertFalse(
             " " in subword_tokenize("āļžāļąāļ™āļ˜āļĄāļīāļ•āļĢ āļŠāļē āļ™āļĄ", keep_whitespace=False)
         )