diff --git a/docker_requirements.txt b/docker_requirements.txt index 03f8ce3db..72fe9e02e 100644 --- a/docker_requirements.txt +++ b/docker_requirements.txt @@ -34,4 +34,5 @@ khanaa==0.0.6 spacy_thai==0.7.1 esupar==1.3.8 ufal.chu-liu-edmonds==1.0.2 +wtpsplit==1.0.1 fastcoref==2.1.6 diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst index ced072da4..dcec5dc07 100644 --- a/docs/api/tokenize.rst +++ b/docs/api/tokenize.rst @@ -10,6 +10,7 @@ Modules .. autofunction:: clause_tokenize .. autofunction:: sent_tokenize +.. autofunction:: paragraph_tokenize .. autofunction:: subword_tokenize .. autofunction:: word_tokenize .. autofunction:: word_detokenize diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 39d7a7151..674153cc7 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -25,6 +25,7 @@ "subword_tokenize", "word_tokenize", "word_detokenize", + "paragraph_tokenize", ] from pythainlp.corpus import thai_syllables, thai_words @@ -46,6 +47,7 @@ subword_tokenize, word_tokenize, word_detokenize, + paragraph_tokenize, ) from pythainlp.corpus import get_corpus as _get_corpus diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 2482d08ff..73b98a88a 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -344,6 +344,12 @@ def sent_tokenize( * *thaisum* - The implementation of sentence segmentator from \ Nakhun Chumpolsathien, 2020 * *tltk* - split by `TLTK `_., + * *wtp* - split by `wtpsplitaxe `_., \ + It support many size of models. You can use ``wtp`` to use mini model, \ + ``wtp-tiny`` to use ``wtp-bert-tiny`` model (default), \ + ``wtp-mini`` to use ``wtp-bert-mini`` model, \ + ``wtp-base`` to use ``wtp-canine-s-1l`` model, \ + and ``wtp-large`` to use ``wtp-canine-s-12l`` model. * *whitespace+newline* - split by whitespaces and newline. * *whitespace* - split by whitespaces. Specifiaclly, with \ :class:`regex` pattern ``r" +"`` @@ -414,6 +420,13 @@ def sent_tokenize( segment = segmentor() segments = segment.split_into_sentences(text) + elif engine.startswith("wtp"): + if "-" not in engine: + _size="mini" + else: + _size = engine.split("-")[-1] + from pythainlp.tokenize.wtsplit import tokenize as segment + segments = segment(text,size=_size,tokenize="sentence") else: raise ValueError( f"""Tokenizer \"{engine}\" not found. @@ -426,6 +439,61 @@ def sent_tokenize( return segments +def paragraph_tokenize(text: str, engine: str = "wtp-mini") -> List[List[str]]: + """ + Paragraph tokenizer. + + Tokenizes text into paragraph. + + :param str text: text to be tokenized + :param str engine: the name paragraph tokenizer + :return: list of paragraph + :rtype: List[List[str]] + **Options for engine** + * *wtp* - split by `wtpsplitaxe `_., \ + It support many size of models. You can use ``wtp`` to use mini model, \ + ``wtp-tiny`` to use ``wtp-bert-tiny`` model (default), \ + ``wtp-mini`` to use ``wtp-bert-mini`` model, \ + ``wtp-base`` to use ``wtp-canine-s-1l`` model, \ + and ``wtp-large`` to use ``wtp-canine-s-12l`` model. + + :Example: + + Split the text based on *wtp*:: + + from pythainlp.tokenize import paragraph_tokenize + + sent = ( + "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต" + +" มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด" + +" จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้" + ) + + paragraph_tokenize(sent) + # output: [ + # ['(1) '], + # [ + # 'บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต ', + # 'มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ', + # 'จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ', + # 'ณ ที่นี้' + # ]] + """ + if engine.startswith("wtp"): + if "-" not in engine: + _size="mini" + else: + _size = engine.split("-")[-1] + from pythainlp.tokenize.wtsplit import tokenize as segment + segments = segment(text,size=_size,tokenize="paragraph") + else: + raise ValueError( + f"""Tokenizer \"{engine}\" not found. + It might be a typo; if not, please consult our document.""" + ) + return segments + + def subword_tokenize( text: str, engine: str = DEFAULT_SUBWORD_TOKENIZE_ENGINE, diff --git a/pythainlp/tokenize/wtsplit.py b/pythainlp/tokenize/wtsplit.py new file mode 100644 index 000000000..20c8a8eb1 --- /dev/null +++ b/pythainlp/tokenize/wtsplit.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Where's the Point? Self-Supervised Multilingual Punctuation-Agnostic Sentence Segmentation + +GitHub: https://github.com/bminixhofer/wtpsplit +""" +from typing import List +from wtpsplit import WtP + +_MODEL = None +_MODEL_NAME = None + + +def _tokenize( + text:str, + lang_code:str="th", + model:str="wtp-bert-mini", + tokenize:str="sentence" + )-> List[str]: + global _MODEL_NAME,_MODEL + if _MODEL_NAME != model: + _MODEL = WtP(model_name_or_model=model) + _MODEL_NAME = model + if tokenize=="sentence": + return _MODEL.split(text,lang_code=lang_code) + else: # Paragraph + return _MODEL.split( + text, + lang_code=lang_code, + do_paragraph_segmentation=True + ) + + +def tokenize(text:str, size:str="mini", tokenize:str="sentence")-> List[str]: + _model_load="" + if size=="tiny": + _model_load="wtp-bert-tiny" + elif size=="base": + _model_load="wtp-canine-s-1l" + elif size=="large": + _model_load="wtp-canine-s-12l" + else: # mini + _model_load="wtp-bert-mini" + return _tokenize(text, model=_model_load,tokenize=tokenize) diff --git a/setup.py b/setup.py index b2ca9e021..10ca6b107 100644 --- a/setup.py +++ b/setup.py @@ -78,6 +78,7 @@ "sentencepiece>=0.1.91" ], "mt5": ["transformers>=4.6.0", "sentencepiece>=0.1.91"], + "wtp": ["transformers>=4.6.0", "wtpsplit>=1.0.1"], "wordnet": ["nltk>=3.3"], "generate": ["fastai<2.0"], "sefr_cut": ["sefr_cut>=1.1"], @@ -140,6 +141,7 @@ "onnxruntime>=1.10.0", "thai_nner", "wunsen>=0.0.3", + "wtpsplit>=1.0.1", "spacy_thai>=0.7.1", "spacy>=3.0", "fastcoref>=2.1.5", diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 18e4cacbc..4659ff08c 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -23,6 +23,7 @@ tltk, oskut, word_detokenize, + paragraph_tokenize, ) from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize from pythainlp.util import dict_trie @@ -306,6 +307,30 @@ def test_sent_tokenize(self): engine="thaisum", ), ) + self.assertIsNotNone( + sent_tokenize( + sent_3, + engine="wtp", + ), + ) + self.assertIsNotNone( + sent_tokenize( + sent_3, + engine="wtp-tiny", + ), + ) + # self.assertIsNotNone( + # sent_tokenize( + # sent_3, + # engine="wtp-base", + # ), + # ) + # self.assertIsNotNone( + # sent_tokenize( + # sent_3, + # engine="wtp-large", + # ), + # ) self.assertFalse( " " in sent_tokenize( @@ -317,6 +342,17 @@ def test_sent_tokenize(self): with self.assertRaises(ValueError): sent_tokenize("ฉันไป กิน", engine="XX") # engine does not exist + def test_paragraph_tokenize(self): + sent = ( + "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา" + + "จากผลงานวิจัยที่เคยทำมาในอดีต" + + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด" + + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้" + ) + self.assertIsNotNone(paragraph_tokenize(sent)) + with self.assertRaises(ValueError): + paragraph_tokenize(sent, engine="ai2+2thai") + def test_subword_tokenize(self): self.assertEqual(subword_tokenize(None), []) self.assertEqual(subword_tokenize(""), [])