From 3e8501f885f1c80cd669b5447dd96812852826c5 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Thu, 12 Dec 2024 11:39:10 +0700 Subject: [PATCH] Remove clause_tokenize --- docs/api/tokenize.rst | 4 -- pythainlp/tokenize/__init__.py | 2 - pythainlp/tokenize/core.py | 37 ---------------- pythainlp/tokenize/crfcls.py | 77 ---------------------------------- tests/extra/testx_tokenize.py | 7 ---- 5 files changed, 127 deletions(-) delete mode 100644 pythainlp/tokenize/crfcls.py diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst index 1f42ab128..41952d748 100644 --- a/docs/api/tokenize.rst +++ b/docs/api/tokenize.rst @@ -8,10 +8,6 @@ The :mod:`pythainlp.tokenize` module contains a comprehensive set of functions a Modules ------- -.. autofunction:: clause_tokenize - :noindex: - - Tokenizes text into clauses. This function allows you to split text into meaningful sections, making it useful for more advanced text processing tasks. .. autofunction:: sent_tokenize :noindex: diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 6ec96955c..083282677 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -9,7 +9,6 @@ "THAI2FIT_TOKENIZER", "Tokenizer", "Trie", - "clause_tokenize", "paragraph_tokenize", "sent_tokenize", "subword_tokenize", @@ -32,7 +31,6 @@ from pythainlp.tokenize.core import ( Tokenizer, - clause_tokenize, paragraph_tokenize, sent_tokenize, subword_tokenize, diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 1c3f54bd0..9ffc873fb 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -25,43 +25,6 @@ from pythainlp.util.trie import Trie, dict_trie -def clause_tokenize(doc: List[str]) -> List[List[str]]: - """ - Clause tokenizer. (or Clause segmentation) - Tokenizes running word list into list of clauses (list of strings). - Split by CRF trained on Blackboard Treebank. 
- - :param str doc: word list to be clause tokenized - :return: list of clauses - :rtype: list[list[str]] - :Example: - :: - - from pythainlp.tokenize import clause_tokenize - - clause_tokenize( - [ - "ฉัน", - "นอน", - "และ", - "คุณ", - "เล่น", - "มือถือ", - "ส่วน", - "น้อง", - "เขียน", - "โปรแกรม", - ] - ) - # [['ฉัน', 'นอน'], - # ['และ', 'คุณ', 'เล่น', 'มือถือ'], - # ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']] - """ - from pythainlp.tokenize.crfcls import segment - - return segment(doc) - - def word_detokenize( segments: Union[List[List[str]], List[str]], output: str = "str" ) -> Union[List[str], str]: diff --git a/pythainlp/tokenize/crfcls.py b/pythainlp/tokenize/crfcls.py deleted file mode 100644 index 00b4ad041..000000000 --- a/pythainlp/tokenize/crfcls.py +++ /dev/null @@ -1,77 +0,0 @@ -# -*- coding: utf-8 -*- -# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project -# SPDX-License-Identifier: Apache-2.0 -""" -Clause segmenter -""" -from typing import List - -import pycrfsuite - -from pythainlp.corpus import path_pythainlp_corpus -from pythainlp.tag import pos_tag - - -def _doc2features(doc, i): - # features from current word - curr_word = doc[i][0] - curr_pos = doc[i][1] - features = { - "word.curr_word": curr_word, - "word.curr_isspace": curr_word.isspace(), - "word.curr_isdigit": curr_word.isdigit(), - "word.curr_postag": curr_pos, - } - - # features from previous word - if i > 0: - prev_word = doc[i - 1][0] - prev_pos = doc[i - 1][1] - features["word.prev_word"] = prev_word - features["word.prev_isspace"] = prev_word.isspace() - features["word.prev_isdigit"] = prev_word.isdigit() - features["word.prev_postag"] = prev_pos - else: - features["BOS"] = True # Beginning of Sequence - - # features from next word - if i < len(doc) - 1: - next_word = doc[i + 1][0] - next_pos = doc[i + 1][1] - features["word.next_word"] = next_word - features["word.next_isspace"] = next_word.isspace() - features["word.next_isdigit"] = next_word.isdigit() - features["word.next_postag"] = next_pos - else: - features["EOS"] = True # End of Sequence - - return features - - -def _extract_features(doc): - return [_doc2features(doc, i) for i in range(len(doc))] - - -_CORPUS_NAME = "blackboard-cls_v1.0.crfsuite" -tagger = pycrfsuite.Tagger() -tagger.open(path_pythainlp_corpus(_CORPUS_NAME)) - - -def segment(doc: List[str]) -> List[List[str]]: - word_tags = pos_tag(doc, corpus="blackboard") - features = _extract_features(word_tags) - word_markers = list(zip(doc, tagger.tag(features))) - - clauses = [] - temp = [] - len_doc = len(doc) - 1 - for i, word_marker in enumerate(word_markers): - word, marker = word_marker - if marker == "E_CLS" or i == len_doc: - temp.append(word) - clauses.append(temp) - temp = [] - else: - temp.append(word) - - return clauses diff --git a/tests/extra/testx_tokenize.py b/tests/extra/testx_tokenize.py index 1fd721c8e..607b0d7fe 100644 --- a/tests/extra/testx_tokenize.py +++ b/tests/extra/testx_tokenize.py @@ -20,7 +20,6 @@ tltk, word_tokenize, ) -from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize from ..core.test_tokenize import ( SENT_1, @@ -31,12 +30,6 @@ ) -class ClauseTokenizeTestCase(unittest.TestCase): - def test_clause_tokenize(self): - self.assertIsNotNone(sent_clause_tokenize(["ฉัน", "ทดสอบ"])) - self.assertIsInstance(sent_clause_tokenize(["ฉัน", "ทดสอบ"]), list) - - class DetokenizeTestCase(unittest.TestCase): def test_numeric_data_format(self): engines = ["attacut", "deepcut", "sefr_cut"]
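
For readers of this patch, a minimal sketch (not part of the change itself) of what the pythainlp.tokenize import surface looks like once clause_tokenize is gone. It assumes a PyThaiNLP build that already contains this commit, and it reuses word_tokenize, which the patch leaves exported, purely for illustration; the input string is a hypothetical example.

    from pythainlp.tokenize import word_tokenize

    # clause_tokenize is removed from pythainlp/tokenize/__init__.py and
    # pythainlp/tokenize/core.py by this patch, so this import now fails:
    try:
        from pythainlp.tokenize import clause_tokenize
    except ImportError:
        clause_tokenize = None  # no longer available in builds with this change

    # The remaining tokenizers (sent_tokenize, subword_tokenize,
    # paragraph_tokenize, word_tokenize, ...) are unaffected, e.g.:
    print(word_tokenize("ฉันนอนและคุณเล่นมือถือ"))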