From 3e8501f885f1c80cd669b5447dd96812852826c5 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Thu, 12 Dec 2024 11:39:10 +0700 Subject: [PATCH] Remove clause_tokenize --- docs/api/tokenize.rst | 4 -- pythainlp/tokenize/__init__.py | 2 - pythainlp/tokenize/core.py | 37 ---------------- pythainlp/tokenize/crfcls.py | 77 ---------------------------------- tests/extra/testx_tokenize.py | 7 ---- 5 files changed, 127 deletions(-) delete mode 100644 pythainlp/tokenize/crfcls.py diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst index 1f42ab128..41952d748 100644 --- a/docs/api/tokenize.rst +++ b/docs/api/tokenize.rst @@ -8,10 +8,6 @@ The :mod:`pythainlp.tokenize` module contains a comprehensive set of functions a Modules ------- -.. autofunction:: clause_tokenize - :noindex: - - Tokenizes text into clauses. This function allows you to split text into meaningful sections, making it useful for more advanced text processing tasks. .. autofunction:: sent_tokenize :noindex: diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 6ec96955c..083282677 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -9,7 +9,6 @@ "THAI2FIT_TOKENIZER", "Tokenizer", "Trie", - "clause_tokenize", "paragraph_tokenize", "sent_tokenize", "subword_tokenize", @@ -32,7 +31,6 @@ from pythainlp.tokenize.core import ( Tokenizer, - clause_tokenize, paragraph_tokenize, sent_tokenize, subword_tokenize, diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 1c3f54bd0..9ffc873fb 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -25,43 +25,6 @@ from pythainlp.util.trie import Trie, dict_trie -def clause_tokenize(doc: List[str]) -> List[List[str]]: - """ - Clause tokenizer. (or Clause segmentation) - Tokenizes running word list into list of clauses (list of strings). - Split by CRF trained on Blackboard Treebank. 
- - :param str doc: word list to be clause tokenized - :return: list of clauses - :rtype: list[list[str]] - :Example: - :: - - from pythainlp.tokenize import clause_tokenize - - clause_tokenize( - [ - "ฉัน", - "นอน", - "และ", - "คุณ", - "เล่น", - "มือถือ", - "ส่วน", - "น้อง", - "เขียน", - "โปรแกรม", - ] - ) - # [['ฉัน', 'นอน'], - # ['และ', 'คุณ', 'เล่น', 'มือถือ'], - # ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']] - """ - from pythainlp.tokenize.crfcls import segment - - return segment(doc) - - def word_detokenize( segments: Union[List[List[str]], List[str]], output: str = "str" ) -> Union[List[str], str]: diff --git a/pythainlp/tokenize/crfcls.py b/pythainlp/tokenize/crfcls.py deleted file mode 100644 index 00b4ad041..000000000 --- a/pythainlp/tokenize/crfcls.py +++ /dev/null @@ -1,77 +0,0 @@ -# -*- coding: utf-8 -*- -# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project -# SPDX-License-Identifier: Apache-2.0 -""" -Clause segmenter -""" -from typing import List - -import pycrfsuite - -from pythainlp.corpus import path_pythainlp_corpus -from pythainlp.tag import pos_tag - - -def _doc2features(doc, i): - # features from current word - curr_word = doc[i][0] - curr_pos = doc[i][1] - features = { - "word.curr_word": curr_word, - "word.curr_isspace": curr_word.isspace(), - "word.curr_isdigit": curr_word.isdigit(), - "word.curr_postag": curr_pos, - } - - # features from previous word - if i > 0: - prev_word = doc[i - 1][0] - prev_pos = doc[i - 1][1] - features["word.prev_word"] = prev_word - features["word.prev_isspace"] = prev_word.isspace() - features["word.prev_isdigit"] = prev_word.isdigit() - features["word.prev_postag"] = prev_pos - else: - features["BOS"] = True # Beginning of Sequence - - # features from next word - if i < len(doc) - 1: - next_word = doc[i + 1][0] - next_pos = doc[i + 1][1] - features["word.next_word"] = next_word - features["word.next_isspace"] = next_word.isspace() - features["word.next_isdigit"] = next_word.isdigit() - features["word.next_postag"] = next_pos - else: - features["EOS"] = True # End of Sequence - - return features - - -def _extract_features(doc): - return [_doc2features(doc, i) for i in range(len(doc))] - - -_CORPUS_NAME = "blackboard-cls_v1.0.crfsuite" -tagger = pycrfsuite.Tagger() -tagger.open(path_pythainlp_corpus(_CORPUS_NAME)) - - -def segment(doc: List[str]) -> List[List[str]]: - word_tags = pos_tag(doc, corpus="blackboard") - features = _extract_features(word_tags) - word_markers = list(zip(doc, tagger.tag(features))) - - clauses = [] - temp = [] - len_doc = len(doc) - 1 - for i, word_marker in enumerate(word_markers): - word, marker = word_marker - if marker == "E_CLS" or i == len_doc: - temp.append(word) - clauses.append(temp) - temp = [] - else: - temp.append(word) - - return clauses diff --git a/tests/extra/testx_tokenize.py b/tests/extra/testx_tokenize.py index 1fd721c8e..607b0d7fe 100644 --- a/tests/extra/testx_tokenize.py +++ b/tests/extra/testx_tokenize.py @@ -20,7 +20,6 @@ tltk, word_tokenize, ) -from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize from ..core.test_tokenize import ( SENT_1, @@ -31,12 +30,6 @@ ) -class ClauseTokenizeTestCase(unittest.TestCase): - def test_clause_tokenize(self): - self.assertIsNotNone(sent_clause_tokenize(["ฉัน", "ทดสอบ"])) - self.assertIsInstance(sent_clause_tokenize(["ฉัน", "ทดสอบ"]), list) - - class DetokenizeTestCase(unittest.TestCase): def test_numeric_data_format(self): engines = ["attacut", "deepcut", "sefr_cut"]
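
For readers of this patch, a minimal sketch (not part of the change itself) of what the pythainlp.tokenize import surface looks like once clause_tokenize is gone. It assumes a PyThaiNLP build that already contains this commit, and it reuses word_tokenize, which the patch leaves exported, purely for illustration; the input string is a hypothetical example.

    from pythainlp.tokenize import word_tokenize

    # clause_tokenize is removed from pythainlp/tokenize/__init__.py and
    # pythainlp/tokenize/core.py by this patch, so this import now fails:
    try:
        from pythainlp.tokenize import clause_tokenize
    except ImportError:
        clause_tokenize = None  # no longer available in builds with this change

    # The remaining tokenizers (sent_tokenize, subword_tokenize,
    # paragraph_tokenize, word_tokenize, ...) are unaffected, e.g.:
    print(word_tokenize("ฉันนอนและคุณเล่นมือถือ"))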