From 9c3f09fd385c21af142b371a736760b4d340c92d Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 13:56:46 +0700
Subject: [PATCH 01/21] Add tltk

---
 docker_requirements.txt         |  3 ++-
 docs/notes/installation.rst     |  1 +
 pythainlp/spell/core.py         |  4 ++++
 pythainlp/spell/tltk.py         |  5 +++++
 pythainlp/tag/pos_tag.py        | 23 +++++++++++++++++++++--
 pythainlp/tag/tltk.py           |  9 +++++++++
 pythainlp/tokenize/core.py      |  9 +++++++++
 pythainlp/tokenize/tltk.py      | 18 ++++++++++++++++++
 pythainlp/transliterate/core.py | 23 +++++++++++++++++++++--
 pythainlp/transliterate/tltk.py | 17 +++++++++++++++++
 setup.py                        |  4 +++-
 tests/test_spell.py             |  8 ++++++++
 tests/test_tokenize.py          | 12 ++++++++++++
 tests/test_transliterate.py     |  5 +++++
 14 files changed, 135 insertions(+), 6 deletions(-)
 create mode 100644 pythainlp/spell/tltk.py
 create mode 100644 pythainlp/tag/tltk.py
 create mode 100644 pythainlp/tokenize/tltk.py
 create mode 100644 pythainlp/transliterate/tltk.py

diff --git a/docker_requirements.txt b/docker_requirements.txt
index 29b2cab01..7dcc9ca87 100644
--- a/docker_requirements.txt
+++ b/docker_requirements.txt
@@ -24,4 +24,5 @@ pyicu==2.6
 deepcut==0.7.0.0
 h5py==2.10.0
 tensorflow==2.4.0
-pandas==0.24
\ No newline at end of file
+pandas==0.24
+tltk==1.3.8
\ No newline at end of file
diff --git a/docs/notes/installation.rst b/docs/notes/installation.rst
index d48354d6e..9b5669ae8 100644
--- a/docs/notes/installation.rst
+++ b/docs/notes/installation.rst
@@ -27,6 +27,7 @@ where ``extras`` can be
   - ``mt5`` (to mt5 models for Thai text summarizer)
   - ``wordnet`` (to support wordnet)
   - ``spell`` (to support phunspell & symspellpy)
+  - ``tltk`` (to support tltk)
   - ``full`` (install everything)
 
 For dependency details, look at `extras` variable in `setup.py `_.
diff --git a/pythainlp/spell/core.py b/pythainlp/spell/core.py
index a749fa61c..ac515b576 100644
--- a/pythainlp/spell/core.py
+++ b/pythainlp/spell/core.py
@@ -22,6 +22,7 @@ def spell(word: str, engine: str = "pn") -> List[str]:
         * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
         * *phunspell* - A spell checker utilizing spylls a port of Hunspell.
         * *symspellpy* - symspellpy is a Python port of SymSpell v6.5.
+        * *tltk* - wrapper for `TLTK `_.,
 
     :return: list of possible correct words within 1 or 2 edit distance and
              sorted by frequency of word occurrences in the spelling dictionary
@@ -39,6 +40,9 @@ def spell(word: str, engine: str = "pn") -> List[str]:
         spell("เส้นตรบ")
         # output: ['เส้นตรง']
 
+        spell("เส้นตรบ", engine="tltk")
+        # output: ['เส้นตรง']
+
         spell("ครัช")
         # output: ['ครับ', 'ครัว', 'รัช', 'ครัม', 'ครัน', 'วรัช', 'ครัส',
         #  'ปรัช', 'บรัช', 'ครัง', 'คัช', 'คลัช', 'ครัย', 'ครัด']
diff --git a/pythainlp/spell/tltk.py b/pythainlp/spell/tltk.py
new file mode 100644
index 000000000..1aabc3f55
--- /dev/null
+++ b/pythainlp/spell/tltk.py
@@ -0,0 +1,5 @@
+from tltk.nlp import spell_candidates
+from typing import List
+
+def spell(text: str) -> List[str]:
+    return spell_candidates(text)
diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index 97f1a6d70..ce2338481 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -15,6 +15,8 @@ def pos_tag(
         * *wangchanberta* - wangchanberta model (support lst20 corpus only \
             and it supports a string only. if you input a list of word, \
             it will convert list word to a string.
+        * *tltk* - TLTK: Thai Language Toolkit (supports the TNC corpus only. \
+            If another corpus is chosen, it will fall back to the TNC corpus.)
     :param str corpus: the corpus that used to create the language model for tagger
         * *lst20* - `LST20 `_ corpus \
            by National Electronics and Computer Technology Center, Thailand
         * *lst20_ud* - LST20 text, with tags mapped to Universal POS tags
         * *orchid* - `ORCHID `_ corpus, text from Thai academic articles \
            (default)
         * *orchid_ud* - ORCHID text, with tags mapped to Universal POS tags
         * *pud* - `Parallel Universal Dependencies (PUD)\
            `_ \
            treebanks, natively use Universal POS tags
+        * *tnc* - Thai National Corpus (supports the tltk engine only)
     :return: a list of tuples (word, POS tag)
     :rtype: list[tuple[str, str]]
@@ -89,13 +92,25 @@ def pos_tag(
     if not words:
         return []
 
-    if engine == "perceptron":
+    _support_corpus = ["lst20", "lst20_ud", "orchid", "orchid_ud", "pud"]
+
+    if engine == "perceptron" and corpus in _support_corpus:
         from pythainlp.tag.perceptron import tag as tag_
     elif engine == "wangchanberta" and corpus == "lst20":
         from pythainlp.wangchanberta.postag import pos_tag as tag_
         words = ''.join(words)
-    else:  # default, use "unigram" ("old") engine
+    elif engine == "tltk":
+        from pythainlp.tag.tltk import pos_tag as tag_
+        corpus = "tnc"
+    elif engine == "unigram" and corpus in _support_corpus:  # default
         from pythainlp.tag.unigram import tag as tag_
+    else:
+        raise NotImplemented(
+            "pos_tag does not support {0} engine or {1} corpus.".format(
+                engine,
+                corpus
+            )
+        )
 
     word_tags = tag_(words, corpus=corpus)
 
@@ -114,6 +129,9 @@ def pos_tag_sents(
     :param str engine:
         * *perceptron* - perceptron tagger (default)
         * *unigram* - unigram tagger
+        * *wangchanberta* - wangchanberta model (supports the lst20 corpus only)
+        * *tltk* - TLTK: Thai Language Toolkit (supports the TNC corpus only. \
+            If another corpus is chosen, it will fall back to the TNC corpus.)
     :param str corpus: the corpus that used to create the language model for tagger
         * *lst20* - `LST20 `_ corpus \
            by National Electronics and Computer Technology Center, Thailand
         * *lst20_ud* - LST20 text, with tags mapped to Universal POS tags
         * *orchid* - `ORCHID `_ corpus, text from Thai academic articles \
            (default)
         * *orchid_ud* - ORCHID text, with tags mapped to Universal POS tags
         * *pud* - `Parallel Universal Dependencies (PUD)\
            `_ \
            treebanks, natively use Universal POS tags
+        * *tnc* - Thai National Corpus (supports the tltk engine only)
     :return: a list of lists of tuples (word, POS tag)
     :rtype: list[list[tuple[str, str]]]
diff --git a/pythainlp/tag/tltk.py b/pythainlp/tag/tltk.py
new file mode 100644
index 000000000..371075e56
--- /dev/null
+++ b/pythainlp/tag/tltk.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+from tltk.nlp import pos_tag_wordlist
+from typing import List, Tuple
+
+
+def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]:
+    if corpus != "tnc":
+        raise NotImplemented("tltk not support {0} corpus.".format(0))
+    return pos_tag_wordlist(words)
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index c5a501230..1cb2f43b3 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -86,6 +86,8 @@ def word_tokenize(
         and combining tokens that are parts of the same named-entity.
         * *sefr_cut* - wrapper for `SEFR CUT `_.,
+        * *tltk* - wrapper for
+          `TLTK `_.,
 
     :Note:
         - The parameter **custom_dict** can be provided as an argument \
@@ -182,6 +184,10 @@ def word_tokenize(
     elif engine == "sefr_cut":
         from pythainlp.tokenize.sefr_cut import segment
 
+        segments = segment(text)
+    elif engine == "tltk":
+        from pythainlp.tokenize.tltk import segment
+
         segments = segment(text)
     else:
         raise ValueError(
@@ -314,6 +320,7 @@ def subword_tokenize(
         * *wangchanberta* - SentencePiece from wangchanberta model.
         * *dict* - newmm word tokenizer with a syllable dictionary
         * *ssg* - CRF syllable segmenter for Thai
+        * *tltk* - syllable tokenizer from tltk
 
     :Example:
 
@@ -376,6 +383,8 @@ def subword_tokenize(
         )
     elif engine == "ssg":
         from pythainlp.tokenize.ssg import segment
+    elif engine == "tltk":
+        from pythainlp.tokenize.tltk import syllable_tokenize as segment
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
diff --git a/pythainlp/tokenize/tltk.py b/pythainlp/tokenize/tltk.py
new file mode 100644
index 000000000..e5f8da8d1
--- /dev/null
+++ b/pythainlp/tokenize/tltk.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+from typing import List
+from tltk.nlp import word_segment as tltk_segment
+from tltk.nlp import syl_segment
+
+
+def segment(text: str) -> List[str]:
+    if not text or not isinstance(text, str):
+        return []
+    _temp = tltk_segment(text).replace("<s/>","").replace("<u/>"," ")
+    return _temp.split('|')
+
+
+def syllable_tokenize(text: str) -> List[str]:
+    if not text or not isinstance(text, str):
+        return []
+    _temp = syl_segment(text)
+    return _temp.split('~')
diff --git a/pythainlp/transliterate/core.py b/pythainlp/transliterate/core.py
index 5460eadd7..02c59ead7 100644
--- a/pythainlp/transliterate/core.py
+++ b/pythainlp/transliterate/core.py
@@ -23,6 +23,7 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
         Transcription issued by Royal Institute of Thailand.
         * *thai2rom* - a deep learning-based Thai romanization engine
         (require PyTorch).
+        * *tltk* - TLTK: Thai Language Toolkit
 
     :Example:
     ::
         romanize("สามารถ")
         # output: 'samat'
 
         romanize("สามารถ", engine="thai2rom")
         # output: 'samat'
 
+        romanize("สามารถ", engine="tltk")
+        # output: 'samat'
+
         romanize("ภาพยนตร์", engine="royin")
         # output: 'phapn'
 
@@ -47,6 +51,8 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
     if engine == "thai2rom":
         from pythainlp.transliterate.thai2rom import romanize
+    elif engine == "tltk":
+        from pythainlp.transliterate.tltk import romanize
     else:  # use default engine "royin"
         from pythainlp.transliterate.royin import romanize
 
@@ -67,10 +73,13 @@ def transliterate(
     :rtype: str
 
     :Options for engines:
-        * *icu* - pyicu, based on International Components for Unicode (ICU)
-        * *ipa* - epitran, output is International Phonetic Alphabet (IPA)
         * *thaig2p* - (default) Thai Grapheme-to-Phoneme,
         output is IPA (require PyTorch)
+        * *icu* - pyicu, based on International Components for Unicode (ICU)
+        * *ipa* - epitran, output is International Phonetic Alphabet (IPA)
+        * *tltk_g2p* - Thai Grapheme-to-Phoneme from\
+          `TLTK `_.,
+        * *tltk_ipa* - tltk, output is International Phonetic Alphabet (IPA)
 
     :Example:
     ::
         transliterate("สามารถ")
         # output: 'saːmaːt'
 
        transliterate("สามารถ", engine="thaig2p")
        # output: 's aː ˩˩˦ . m aː t̚ ˥˩'
 
+        transliterate("สามารถ", engine="tltk_ipa")
+        # output: 'saː5.maːt3'
+
+        transliterate("สามารถ", engine="tltk_g2p")
+        # output: 'saa4~maat2'
+
         transliterate("ภาพยนตร์", engine="icu")
         # output: 'p̣hāphyntr̒'
 
@@ -103,6 +118,10 @@ def transliterate(
         from pythainlp.transliterate.pyicu import transliterate
     elif engine == "ipa":
         from pythainlp.transliterate.ipa import transliterate
+    elif engine == "tltk_g2p":
+        from pythainlp.transliterate.tltk import tltk_g2p as transliterate
+    elif engine == "tltk_ipa":
+        from pythainlp.transliterate.tltk import tltk_ipa as transliterate
     else:  # use default engine: "thaig2p"
         from pythainlp.transliterate.thaig2p import transliterate
 
diff --git a/pythainlp/transliterate/tltk.py b/pythainlp/transliterate/tltk.py
new file mode 100644
index 000000000..4e4999320
--- /dev/null
+++ b/pythainlp/transliterate/tltk.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+from tltk.nlp import g2p, th2ipa, th2roman
+
+
+def romanize(text: str) -> str:
+    _temp = th2roman(text)
+    return _temp[:_temp.rfind(" ")].replace("<s/>", "")
+
+
+def tltk_g2p(text: str) -> str:
+    _temp = g2p(text).split("<tr/>")[1].replace("|<s/>","").replace("|", " ")
+    return _temp.replace("<s/>", "")
+
+
+def tltk_ipa(text: str) -> str:
+    _temp = th2ipa(text)
+    return _temp[:_temp.rfind(" ")].replace("<s/>", "")
diff --git a/setup.py b/setup.py
index e0597f58b..e7c4fb5ab 100644
--- a/setup.py
+++ b/setup.py
@@ -73,6 +73,7 @@
         "spylls>=0.1.5",
         "symspellpy>=6.7.0"
     ],
+    "tltk": ["tltk>=1.3.8"],
     "full": [
         "PyYAML>=5.3.1",
         "attacut>=1.0.4",
@@ -94,7 +95,8 @@
         "sefr_cut>=1.1",
         "phunspell>=0.1.6",
         "spylls>=0.1.5",
-        "symspellpy>=6.7.0"
+        "symspellpy>=6.7.0",
+        "tltk>=1.3.8",
     ],
 }
diff --git a/tests/test_spell.py b/tests/test_spell.py
index 2183f5594..e59474a06 100644
--- a/tests/test_spell.py
+++ b/tests/test_spell.py
@@ -40,6 +40,14 @@ def test_spell(self):
         self.assertIsInstance(result, list)
         self.assertGreater(len(result), 0)
 
+        result = spell("เน้ร", engine="tltk")
+        self.assertIsInstance(result, list)
+        self.assertGreater(len(result), 0)
+
+        result = spell("เกสมร์", engine="tltk")
+        self.assertIsInstance(result, list)
+        self.assertGreater(len(result), 0)
+
     def test_word_correct(self):
         self.assertEqual(correct(None), "")
         self.assertEqual(correct(""), "")
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index cc90634fc..fd6cc22ae 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -319,6 +319,17 @@ def test_subword_tokenize(self):
         self.assertFalse(
             " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False)
         )
+        self.assertEqual(subword_tokenize(None, engine="tltk"), [])
+        self.assertEqual(subword_tokenize("", engine="tltk"), [])
+        self.assertIsInstance(
+            subword_tokenize("สวัสดิีดาวอังคาร", engine="tltk"), list
+        )
+        self.assertFalse(
+            "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tltk")
+        )
+        self.assertIsInstance(
+            subword_tokenize("โควิด19", engine="tltk"), list
+        )
         with self.assertRaises(ValueError):
             subword_tokenize("นกแก้ว", engine="XX")  # engine does not exist
 
@@ -360,6 +371,7 @@ def test_word_tokenize(self):
         self.assertIsNotNone(word_tokenize(self.text_1, engine="nercut"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="newmm"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="sefr_cut"))
+        self.assertIsNotNone(word_tokenize(self.text_1, engine="tltk"))
         with self.assertRaises(ValueError):
             word_tokenize("หมอนทอง", engine="XX")  # engine does not exist
 
diff --git a/tests/test_transliterate.py b/tests/test_transliterate.py
index 2d1ca7a91..4a99b1676 100644
--- a/tests/test_transliterate.py
+++ b/tests/test_transliterate.py
@@ -57,6 +57,7 @@ def test_romanize(self):
         self.assertEqual(romanize(None), "")
         self.assertEqual(romanize(""), "")
         self.assertEqual(romanize("แมว"), "maeo")
+        self.assertEqual(romanize("แมว", engine="tltk"), "maeo")
 
     def test_romanize_royin_basic(self):
         for word in _BASIC_TESTS:
@@ -136,6 +137,10 @@ def test_transliterate(self):
         self.assertEqual(transliterate("คน", engine="ipa"), "kʰon")
         self.assertIsNotNone(transliterate("คน", engine="thaig2p"))
         self.assertIsNotNone(transliterate("แมว", engine="thaig2p"))
+        self.assertIsNotNone(transliterate("คน", engine="tltk_g2p"))
+        self.assertIsNotNone(transliterate("แมว", engine="tltk_g2p"))
+        self.assertIsNotNone(transliterate("คน", engine="tltk_ipa"))
+        self.assertIsNotNone(transliterate("แมว", engine="tltk_ipa"))
         self.assertIsNotNone(trans_list("คน"))
         self.assertIsNotNone(xsampa_list("คน"))
 

From 7374cbbe727a6d6045073a1d76e699a092b2dd3f Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 14:39:19 +0700
Subject: [PATCH 02/21] Update tltk.py

---
 pythainlp/tag/tltk.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tag/tltk.py b/pythainlp/tag/tltk.py
index 371075e56..e8f68bada 100644
--- a/pythainlp/tag/tltk.py
+++ b/pythainlp/tag/tltk.py
@@ -1,9 +1,10 @@
 # -*- coding: utf-8 -*-
-from tltk.nlp import pos_tag_wordlist
+from tltk import nlp
 from typing import List, Tuple
 
+nlp.pos_load()
 
 def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]:
     if corpus != "tnc":
         raise NotImplemented("tltk not support {0} corpus.".format(0))
-    return pos_tag_wordlist(words)
+    return nlp.pos_tag_wordlist(words)

From 45bb596fd88a5b52ab5aae6c4c7621d2e0d3c348 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 14:41:35 +0700
Subject: [PATCH 03/21] Fixed PEP8

---
 pythainlp/spell/tltk.py         | 1 +
 pythainlp/tag/tltk.py           | 1 +
 pythainlp/tokenize/tltk.py      | 2 +-
 pythainlp/transliterate/tltk.py | 2 +-
 4 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/pythainlp/spell/tltk.py b/pythainlp/spell/tltk.py
index 1aabc3f55..6a739b837 100644
--- a/pythainlp/spell/tltk.py
+++ b/pythainlp/spell/tltk.py
@@ -1,5 +1,6 @@
 from tltk.nlp import spell_candidates
 from typing import List
 
+
 def spell(text: str) -> List[str]:
     return spell_candidates(text)
diff --git a/pythainlp/tag/tltk.py b/pythainlp/tag/tltk.py
index e8f68bada..c8243fc76 100644
--- a/pythainlp/tag/tltk.py
+++ b/pythainlp/tag/tltk.py
@@ -4,6 +4,7 @@
 
 nlp.pos_load()
 
+
 def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]:
     if corpus != "tnc":
         raise NotImplemented("tltk not support {0} corpus.".format(0))
diff --git a/pythainlp/tokenize/tltk.py b/pythainlp/tokenize/tltk.py
index e5f8da8d1..1ecd9c238 100644
--- a/pythainlp/tokenize/tltk.py
+++ b/pythainlp/tokenize/tltk.py
@@ -7,7 +7,7 @@
 def segment(text: str) -> List[str]:
     if not text or not isinstance(text, str):
         return []
-    _temp = tltk_segment(text).replace("<s/>","").replace("<u/>"," ")
+    _temp = tltk_segment(text).replace("<s/>", "").replace("<u/>", " ")
     return _temp.split('|')
 
 
diff --git a/pythainlp/transliterate/tltk.py b/pythainlp/transliterate/tltk.py
index 4e4999320..8795ce756 100644
--- a/pythainlp/transliterate/tltk.py
+++ b/pythainlp/transliterate/tltk.py
@@ -8,7 +8,7 @@
 
 
 def tltk_g2p(text: str) -> str:
-    _temp = g2p(text).split("<tr/>")[1].replace("|<s/>","").replace("|", " ")
+    _temp = g2p(text).split("<tr/>")[1].replace("|<s/>", "").replace("|", " ")
     return _temp.replace("<s/>", "")
 

From 42002c7f67c75f97092d96c18b6e03ac3b0ddf47 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 15:00:37 +0700
Subject: [PATCH 04/21] Update pos_tag.py

---
 pythainlp/tag/pos_tag.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index ce2338481..8cd007987 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -105,7 +105,7 @@ def pos_tag(
     elif engine == "unigram" and corpus in _support_corpus:  # default
         from pythainlp.tag.unigram import tag as tag_
     else:
-        raise NotImplemented(
+        raise ValueError(
             "pos_tag does not support {0} engine or {1} corpus.".format(
                 engine,
                 corpus

From 7c4dedf3cb80f090ecc0cca14f006179eb1965ee Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 15:34:06 +0700
Subject: [PATCH 05/21] Add pythainlp.tag.tltk.get_ner

---
 docs/api/tag.rst      |  1 +
 pythainlp/tag/tltk.py | 78 ++++++++++++++++++++++++++++++++++++++++++-
 tests/test_tag.py     | 24 +++++++++++++
 3 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/docs/api/tag.rst b/docs/api/tag.rst
index 87cf0a766..2ab526c60 100644
--- a/docs/api/tag.rst
+++ b/docs/api/tag.rst
@@ -232,6 +232,7 @@ Modules
 .. autofunction:: chunk_parse
 .. autoclass:: pythainlp.tag.named_entity.ThaiNameTagger
    :members: get_ner
+.. autofunction:: pythainlp.tag.tltk.get_ner
 
 Tagger Engines
 --------------
diff --git a/pythainlp/tag/tltk.py b/pythainlp/tag/tltk.py
index c8243fc76..aaff76206 100644
--- a/pythainlp/tag/tltk.py
+++ b/pythainlp/tag/tltk.py
@@ -1,11 +1,87 @@
 # -*- coding: utf-8 -*-
+from typing import List, Tuple, Union
 from tltk import nlp
-from typing import List, Tuple
+from pythainlp.tokenize import word_tokenize
 
 nlp.pos_load()
+nlp.ner_load()
 
 
 def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]:
     if corpus != "tnc":
         raise NotImplemented("tltk not support {0} corpus.".format(0))
     return nlp.pos_tag_wordlist(words)
+
+
+def get_ner(
+    text: str, pos: bool = True, tag: bool = False
+    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
+    """
+    Named-entity recognizer from **TLTK**
+
+    This function tags named entities in the text in the IOB format.
+
+    :param str text: text in Thai to be tagged
+    :param bool pos: To include POS tags in the results (`True`) or
+                     exclude (`False`). The default value is `True`.
+    :param bool tag: output the result as HTML-like tags.
+    :return: a list of tuples associated with tokenized word, POS tag,
+             and NER tag (if the parameter `pos` is specified as `True`),
+             or an HTML-like tag string (if the parameter `tag` is
+             specified as `True`).
+             Otherwise, return a list of tuples associated with tokenized
+             word and NER tag
+    :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]], str]
+
+    :Example:
+
+        >>> from pythainlp.tag.tltk import get_ner
+        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง")
+        [('เขา', 'PRON', 'O'),
+        ('เรียน', 'VERB', 'O'),
+        ('ที่', 'SCONJ', 'O'),
+        ('โรงเรียน', 'NOUN', 'B-L'),
+        ('นางรอง', 'VERB', 'I-L')]
+        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", pos=False)
+        [('เขา', 'O'),
+        ('เรียน', 'O'),
+        ('ที่', 'O'),
+        ('โรงเรียน', 'B-L'),
+        ('นางรอง', 'I-L')]
+        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", tag=True)
+        'เขาเรียนที่<L>โรงเรียนนางรอง</L>'
+    """
+    if not text:
+        return []
+    list_word = []
+    for i in word_tokenize(text, engine="tltk"):
+        if i == " ":
+            i = "<u/>"
+        list_word.append(i)
+    _pos = nlp.pos_tag_wordlist(list_word)
+    sent_ner = nlp.ner(_pos)
+    if sent_ner[-1][0] == '<s/>':
+        del sent_ner[-1]
+    if tag:
+        temp = ""
+        sent = ""
+        for idx, (word, pos, ner) in enumerate(sent_ner):
+            if ner.startswith("B-") and temp != "":
+                sent += "</" + temp + ">"
+                temp = ner[2:]
+                sent += "<" + temp + ">"
+            elif ner.startswith("B-"):
+                temp = ner[2:]
+                sent += "<" + temp + ">"
+            elif ner == "O" and temp != "":
+                sent += "</" + temp + ">"
+                temp = ""
+            sent += word
+
+            if idx == len(sent_ner) - 1 and temp != "":
+                sent += "</" + temp + ">"
+
+        return sent
+    if pos == False:
+        return [(word, ner) for word, pos, ner in sent_ner]
+    return sent_ner
diff --git a/tests/test_tag.py b/tests/test_tag.py
index 854c559e8..e9b838d27 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -10,6 +10,7 @@
     pos_tag,
     pos_tag_sents,
     unigram,
+    tltk,
 )
 from pythainlp.tag.locations import tag_provinces
 from pythainlp.tag.named_entity import ThaiNameTagger
@@ -102,6 +103,9 @@ def test_pos_tag(self):
         self.assertIsNotNone(
             pos_tag(tokens, engine="wangchanberta", corpus="lst20_ud")
         )
+        self.assertIsNotNone(
+            pos_tag(tokens, engine="tltk")
+        )
 
         self.assertEqual(pos_tag_sents(None), [])
         self.assertEqual(pos_tag_sents([]), [])
@@ -355,3 +359,23 @@ def test_ner(self):
         #         ("เช้า", "I-TIME"),
         #     ],
         # )
+
+    def test_tltk_ner(self):
+        self.assertEqual(tltk.get_ner(""), [])
+        self.assertIsNotNone(tltk.get_ner("แมวทำอะไรตอนห้าโมงเช้า"))
+        self.assertIsNotNone(tltk.get_ner("แมวทำอะไรตอนห้าโมงเช้า", pos=False))
+        self.assertIsNotNone(
+            tltk.get_ner(
+                """คณะวิทยาศาสตร์ประยุกต์และวิศวกรรมศาสตร์ มหาวิทยาลัยขอนแก่น
+                วิทยาเขตหนองคาย 112 หมู่ 7 บ้านหนองเดิ่น ตำบลหนองกอมเกาะ อำเภอเมือง
+                จังหวัดหนองคาย 43000"""
+            )
+        )
+        self.assertIsNotNone(
+            tltk.get_ner(
+                """คณะวิทยาศาสตร์ประยุกต์และวิศวกรรมศาสตร์ มหาวิทยาลัยขอนแก่น
+                วิทยาเขตหนองคาย 112 หมู่ 7 บ้านหนองเดิ่น ตำบลหนองกอมเกาะ อำเภอเมือง
+                จังหวัดหนองคาย 43000""",
+                tag=True,
+            )
+        )

From e757235618e630c7bd37e89078f891a5eef1d7e1 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 15:36:38 +0700
Subject: [PATCH 06/21] Update test_tag.py

---
 tests/test_tag.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/test_tag.py b/tests/test_tag.py
index e9b838d27..bbbe10d16 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -100,9 +100,6 @@ def test_pos_tag(self):
         self.assertIsNotNone(
             pos_tag(tokens, engine="wangchanberta", corpus="lst20")
         )
-        self.assertIsNotNone(
-            pos_tag(tokens, engine="wangchanberta", corpus="lst20_ud")
-        )
         self.assertIsNotNone(
             pos_tag(tokens, engine="tltk")
         )
@@ -116,6 +113,10 @@ def test_pos_tag(self):
                 [("แมว", "NCMN"), ("วิ่ง", "VACT")],
             ],
         )
+        with self.assertRaises(ValueError):
+            self.assertIsNotNone(
+                pos_tag(tokens, engine="wangchanberta", corpus="lst20_ud")
+            )
 
 # ### 
pythainlp.tag.PerceptronTagger From f13f0c3d1cef934e6f803c3b2694cc538d696beb Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 29 Jul 2021 15:38:42 +0700 Subject: [PATCH 07/21] Fixed PEP8 --- pythainlp/tag/tltk.py | 6 ++++-- tests/test_tag.py | 2 -- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pythainlp/tag/tltk.py b/pythainlp/tag/tltk.py index aaff76206..a5b5be1ce 100644 --- a/pythainlp/tag/tltk.py +++ b/pythainlp/tag/tltk.py @@ -15,7 +15,9 @@ def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]: def get_ner( text: str, pos: bool = True, tag: bool = False - ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: + ) -> Union[ + List[Tuple[str, str]], List[Tuple[str, str, str]], str + ]: """ Named-entity recognizer from **TLTK** @@ -82,6 +84,6 @@ def get_ner( sent += "" return sent - if pos == False: + if pos is False: return [(word, ner) for word, pos, ner in sent_ner] return sent_ner diff --git a/tests/test_tag.py b/tests/test_tag.py index bbbe10d16..d84e8b1f2 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -368,14 +368,12 @@ def test_tltk_ner(self): self.assertIsNotNone( tltk.get_ner( """คณะวิทยาศาสตร์ประยุกต์และวิศวกรรมศาสตร์ มหาวิทยาลัยขอนแก่น - วิทยาเขตหนองคาย 112 หมู่ 7 บ้านหนองเดิ่น ตำบลหนองกอมเกาะ อำเภอเมือง จังหวัดหนองคาย 43000""" ) ) self.assertIsNotNone( tltk.get_ner( """คณะวิทยาศาสตร์ประยุกต์และวิศวกรรมศาสตร์ มหาวิทยาลัยขอนแก่น - วิทยาเขตหนองคาย 112 หมู่ 7 บ้านหนองเดิ่น ตำบลหนองกอมเกาะ อำเภอเมือง จังหวัดหนองคาย 43000""", tag=True, ) From e46ef880b4bae14e570ff52d5c667fb44923446b Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 29 Jul 2021 15:39:30 +0700 Subject: [PATCH 08/21] Update tltk.py --- pythainlp/tag/tltk.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pythainlp/tag/tltk.py b/pythainlp/tag/tltk.py index a5b5be1ce..b3c534b70 100644 --- a/pythainlp/tag/tltk.py +++ b/pythainlp/tag/tltk.py @@ -15,9 +15,7 @@ def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]: def get_ner( text: str, pos: bool = True, tag: bool = False - ) -> Union[ - List[Tuple[str, str]], List[Tuple[str, str, str]], str - ]: + ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]: """ Named-entity recognizer from **TLTK** From 462ad5274ab0d01413379f695c3b599f0c637676 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 29 Jul 2021 15:40:24 +0700 Subject: [PATCH 09/21] Update tltk.py --- pythainlp/tag/tltk.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pythainlp/tag/tltk.py b/pythainlp/tag/tltk.py index b3c534b70..92e6a0bba 100644 --- a/pythainlp/tag/tltk.py +++ b/pythainlp/tag/tltk.py @@ -14,8 +14,10 @@ def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]: def get_ner( - text: str, pos: bool = True, tag: bool = False - ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]: + text: str, + pos: bool = True, + tag: bool = False +) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: """ Named-entity recognizer from **TLTK** From 2b229823653511911b7c1aff8fe8d175867573ca Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 29 Jul 2021 16:08:55 +0700 Subject: [PATCH 10/21] Update core.py --- pythainlp/spell/core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pythainlp/spell/core.py b/pythainlp/spell/core.py index ac515b576..65d6ca54c 100644 --- a/pythainlp/spell/core.py +++ b/pythainlp/spell/core.py @@ -62,6 +62,9 @@ def spell(word: str, 
engine: str = "pn") -> List[str]:
     elif engine == "symspellpy":
         from pythainlp.spell.symspellpy import spell as SPELL_CHECKER
         text_correct = SPELL_CHECKER(word)
+    elif engine == "tltk":
+        from pythainlp.spell.tltk import spell as SPELL_CHECKER
+        text_correct = SPELL_CHECKER(word)
     else:
         text_correct = DEFAULT_SPELL_CHECKER.spell(word)
 

From cab2b4b28fb2bbb933145ab8544511580dd76ca6 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 16:33:10 +0700
Subject: [PATCH 11/21] Add tltk sent_tokenize

---
 pythainlp/tokenize/core.py |  5 +++++
 pythainlp/tokenize/tltk.py | 22 +++++++++++++++++++---
 tests/test_tokenize.py     | 20 ++++++++++++++++++++
 3 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 1cb2f43b3..41f03f2e6 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -221,6 +221,7 @@ def sent_tokenize(
         * *whitespace+newline* - split by whitespaces and newline.
         * *whitespace* - split by whitespaces. Specifiaclly, with \
           :class:`regex` pattern  ``r" +"``
+        * *tltk* - split by `TLTK `_.,
 
     :Example:
 
         Split the text based on *whitespace*::
@@ -277,6 +278,10 @@ def sent_tokenize(
         segments = re.split(r" +", text, re.U)
     elif engine == "whitespace+newline":
         segments = text.split()
+    elif engine == "tltk":
+        from pythainlp.tokenize.tltk import sent_tokenize as segment
+
+        segments = segment(text)
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
diff --git a/pythainlp/tokenize/tltk.py b/pythainlp/tokenize/tltk.py
index 1ecd9c238..2199edfa8 100644
--- a/pythainlp/tokenize/tltk.py
+++ b/pythainlp/tokenize/tltk.py
@@ -7,12 +7,28 @@
 def segment(text: str) -> List[str]:
     if not text or not isinstance(text, str):
         return []
-    _temp = tltk_segment(text).replace("<s/>", "").replace("<u/>", " ")
-    return _temp.split('|')
+    text = text.replace(" ", "<u/>")
+    _temp = tltk_segment(text).replace("<u/>", " ").replace("<s/>", "")
+    _temp =_temp.split('|')
+    if _temp[-1] == "":
+        del _temp[-1]
+    return _temp
 
 
 def syllable_tokenize(text: str) -> List[str]:
     if not text or not isinstance(text, str):
         return []
     _temp = syl_segment(text)
-    return _temp.split('~')
+    _temp = _temp.split('~')
+    if _temp[-1] == "":
+        del _temp[-1]
+    return _temp
+
+
+def sent_tokenize(text: str) -> List[str]:
+    text = text.replace(" ", "<u/>")
+    _temp = tltk_segment(text).replace("<u/>", " ").replace("|", "")
+    _temp =_temp.split('<s/>')
+    if _temp[-1] == "":
+        del _temp[-1]
+    return _temp
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index fd6cc22ae..1b5dd050d 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -21,6 +21,7 @@
     tcc,
     word_tokenize,
     sefr_cut,
+    tltk,
 )
 from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize
 from pythainlp.util import dict_trie
@@ -260,6 +261,15 @@ def test_sent_tokenize(self):
         self.assertIsNotNone(
             sent_tokenize(sent_1, keep_whitespace=False, engine="whitespace",),
         )
+        self.assertIsNotNone(
+            sent_tokenize(sent_1, engine="tltk",),
+        )
+        self.assertIsNotNone(
+            sent_tokenize(sent_2, engine="tltk",),
+        )
+        self.assertIsNotNone(
+            sent_tokenize(sent_3, engine="tltk",),
+        )
         self.assertFalse(
             " "
             in sent_tokenize(
@@ -435,6 +445,16 @@ def test_icu(self):
             ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
         )
 
+    def test_tltk(self):
+        self.assertEqual(tltk.segment(None), [])
+        self.assertEqual(tltk.segment(""), [])
+        self.assertEqual(
+            syllable_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="tltk"),
+            ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'เป็น', 'คน', 'ไทย'],
+        )
+        self.assertEqual(tltk.syllable_tokenize(None), [])
+        self.assertEqual(tltk.syllable_tokenize(""), [])
+
     def test_longest(self):
         self.assertEqual(longest.segment(None), [])
         self.assertEqual(longest.segment(""), [])

From ead158cab2523b5bdec82215d96907c1f57e7337 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 16:34:04 +0700
Subject: [PATCH 12/21] Fixed PEP8

---
 pythainlp/tokenize/tltk.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tokenize/tltk.py b/pythainlp/tokenize/tltk.py
index 63e936b7a..2199edfa8 100644
--- a/pythainlp/tokenize/tltk.py
+++ b/pythainlp/tokenize/tltk.py
@@ -9,7 +9,7 @@ def segment(text: str) -> List[str]:
         return []
     text = text.replace(" ", "<u/>")
     _temp = tltk_segment(text).replace("<u/>", " ").replace("<s/>", "")
-    _temp =_temp.split('|')
+    _temp = _temp.split('|')
     if _temp[-1] == "":
         del _temp[-1]
     return _temp
@@ -28,7 +28,7 @@ def syllable_tokenize(text: str) -> List[str]:
 def sent_tokenize(text: str) -> List[str]:
     text = text.replace(" ", "<u/>")
     _temp = tltk_segment(text).replace("<u/>", " ").replace("|", "")
-    _temp =_temp.split('<s/>')
+    _temp = _temp.split('<s/>')
     if _temp[-1] == "":
         del _temp[-1]
     return _temp

From 86c43cdc1bddc8cf869e293b1deadd5126631410 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 16:47:12 +0700
Subject: [PATCH 13/21] Update test_spell.py

---
 tests/test_spell.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_spell.py b/tests/test_spell.py
index e59474a06..9dd3a25ba 100644
--- a/tests/test_spell.py
+++ b/tests/test_spell.py
@@ -44,7 +44,7 @@ def test_spell(self):
         self.assertIsInstance(result, list)
         self.assertGreater(len(result), 0)
 
-        result = spell("เกสมร์", engine="tltk")
+        result = spell("เดก", engine="tltk")
         self.assertIsInstance(result, list)
         self.assertGreater(len(result), 0)
 

From 26317c63c074b7d002234ed914fc1b026676f91f Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 17:01:47 +0700
Subject: [PATCH 14/21] Update test_tokenize.py

---
 tests/test_tokenize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 1b5dd050d..4083e9174 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -449,7 +449,7 @@ def test_tltk(self):
         self.assertEqual(tltk.segment(None), [])
         self.assertEqual(tltk.segment(""), [])
         self.assertEqual(
-            syllable_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="tltk"),
+            tltk.syllable_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="tltk"),
             ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'เป็น', 'คน', 'ไทย'],
         )

From 44de92824501f17ec1558b275cbc8bc08477aa83 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 17:06:01 +0700
Subject: [PATCH 15/21] Update test_tokenize.py

---
 tests/test_tokenize.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 4083e9174..2a9881401 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -449,7 +449,9 @@ def test_tltk(self):
         self.assertEqual(tltk.segment(None), [])
         self.assertEqual(tltk.segment(""), [])
         self.assertEqual(
-            tltk.syllable_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="tltk"),
+            tltk.syllable_tokenize(
+                "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="tltk"
+            ),
             ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'เป็น', 'คน', 'ไทย'],
         )

From 9e2ea86f3c121bdf682f4811e50182764e1e4113 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 17:32:54 +0700
Subject: [PATCH 16/21] Update test_tokenize.py

---
 tests/test_tokenize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 2a9881401..6930ca704 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -450,7 +450,7 @@ def test_tltk(self):
         self.assertEqual(tltk.segment(""), [])
         self.assertEqual(
             tltk.syllable_tokenize(
-                "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="tltk"
+                "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"
             ),
             ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'เป็น', 'คน', 'ไทย'],
         )

From 7c6f823e725e795214efd9909091a566cf8d3d0d Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 18:20:44 +0700
Subject: [PATCH 17/21] Update test_tokenize.py

---
 tests/test_tokenize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 6930ca704..6e4195260 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -452,7 +452,7 @@ def test_tltk(self):
             tltk.syllable_tokenize(
                 "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"
             ),
-            ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'เป็น', 'คน', 'ไทย'],
+            ['ฉัน', 'รัก', 'ภา', 'ษา', 'ไทย', 'เพราะ', 'ฉัน', 'เป็น', 'คน', 'ไทย'],
         )

From f1f8f517e6699716d88931091a29fbd1a6b2c9fb Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 18:22:30 +0700
Subject: [PATCH 18/21] Update test_tokenize.py

---
 tests/test_tokenize.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 6e4195260..6d1f54bd0 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -452,7 +452,18 @@ def test_tltk(self):
             tltk.syllable_tokenize(
                 "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"
             ),
-            ['ฉัน', 'รัก', 'ภา', 'ษา', 'ไทย', 'เพราะ', 'ฉัน', 'เป็น', 'คน', 'ไทย'],
+            [
+                'ฉัน',
+                'รัก',
+                'ภา',
+                'ษา',
+                'ไทย',
+                'เพราะ',
+                'ฉัน',
+                'เป็น',
+                'คน',
+                'ไทย'
+            ],
         )

From 2c9a9ea9693882bf026e29db9a7fd3903b717f58 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 20:01:00 +0700
Subject: [PATCH 19/21] Add post_process tltk ner

---
 pythainlp/tag/tltk.py | 10 +++++++---
 tests/test_tag.py     |  5 +++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/pythainlp/tag/tltk.py b/pythainlp/tag/tltk.py
index 92e6a0bba..f61cbff8f 100644
--- a/pythainlp/tag/tltk.py
+++ b/pythainlp/tag/tltk.py
@@ -13,6 +13,10 @@ def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]:
     return nlp.pos_tag_wordlist(words)
 
 
+def _post_process(text: str) -> str:
+    return text.replace("<u/>", " ")
+
+
 def get_ner(
     text: str,
     pos: bool = True,
@@ -61,9 +65,9 @@ def get_ner(
             i = "<u/>"
         list_word.append(i)
     _pos = nlp.pos_tag_wordlist(list_word)
-    sent_ner = nlp.ner(_pos)
-    if sent_ner[-1][0] == '<s/>':
-        del sent_ner[-1]
+    sent_ner = [
+        (_post_process(word), pos, ner) for word, pos, ner in nlp.ner(_pos)
+    ]
diff --git a/tests/test_tag.py b/tests/test_tag.py
index d84e8b1f2..c300527da 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -365,6 +365,11 @@ def test_tltk_ner(self):
         self.assertEqual(tltk.get_ner(""), [])
         self.assertIsNotNone(tltk.get_ner("แมวทำอะไรตอนห้าโมงเช้า"))
         self.assertIsNotNone(tltk.get_ner("แมวทำอะไรตอนห้าโมงเช้า", pos=False))
+        self.assertIsNotNone(
+            tltk.get_ner(
+                "พลเอกประยุกธ์ จันทร์โอชา ประกาศในฐานะหัวหน้า"
+            )
+        )
         self.assertIsNotNone(
             tltk.get_ner(
                 """คณะวิทยาศาสตร์ประยุกต์และวิศวกรรมศาสตร์ มหาวิทยาลัยขอนแก่น

From 17d3a76f7a519229d01e3f6ad88715be0c212430 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 20:10:50 +0700
Subject: [PATCH 20/21] Add test

---
 pythainlp/tag/tltk.py | 2 +-
 tests/test_tag.py     | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/pythainlp/tag/tltk.py b/pythainlp/tag/tltk.py
index f61cbff8f..bfb5bacb9 100644
--- a/pythainlp/tag/tltk.py
+++ b/pythainlp/tag/tltk.py
@@ -9,7 +9,7 @@ def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]:
     if corpus != "tnc":
-        raise NotImplemented("tltk not support {0} corpus.".format(0))
+        raise ValueError("tltk does not support {0} corpus.".format(corpus))
     return nlp.pos_tag_wordlist(words)
diff --git a/tests/test_tag.py b/tests/test_tag.py
index c300527da..d9c4c25e5 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -117,6 +117,9 @@ def test_pos_tag(self):
         self.assertIsNotNone(
             pos_tag(tokens, engine="wangchanberta", corpus="lst20_ud")
         )
+        self.assertIsNotNone(
+            tltk.pos_tag(tokens, corpus="lst20")
+        )
 
 # ### pythainlp.tag.PerceptronTagger

From f7d99eb742b6f88422e7cdc5fe89c3d4407a1005 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 29 Jul 2021 20:33:23 +0700
Subject: [PATCH 21/21] Add test

---
 tests/test_spell.py     | 6 +++++-
 tests/test_summarize.py | 2 ++
 tests/test_tag.py       | 8 ++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/tests/test_spell.py b/tests/test_spell.py
index 9dd3a25ba..bb273709a 100644
--- a/tests/test_spell.py
+++ b/tests/test_spell.py
@@ -7,7 +7,8 @@
     correct,
     spell,
     spell_sent,
-    correct_sent
+    correct_sent,
+    symspellpy,
 )
 
@@ -131,3 +132,6 @@ def test_correct_sent(self):
         self.assertIsNotNone(
             correct_sent(self.spell_sent, engine="symspellpy")
         )
+        self.assertIsNotNone(
+            symspellpy.correct_sent(self.spell_sent)
+        )
diff --git a/tests/test_summarize.py b/tests/test_summarize.py
index f5cb0161f..2c36ebced 100644
--- a/tests/test_summarize.py
+++ b/tests/test_summarize.py
@@ -24,3 +24,5 @@ def test_summarize(self):
         self.assertIsNotNone(summarize([]))
         self.assertIsNotNone(summarize(text, 1, engine="mt5-small"))
         self.assertIsNotNone(summarize(text, 1, engine="XX"))
+        with self.assertRaises(ValueError):
+            self.assertIsNotNone(summarize(text, 1, engine="mt5-cat"))
diff --git a/tests/test_tag.py b/tests/test_tag.py
index d9c4c25e5..68232e30c 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -2,6 +2,7 @@
 import unittest
 from os import path
 
+from pythainlp import tag
 from pythainlp.tag import (
     chunk_parse,
@@ -117,6 +118,7 @@ def test_pos_tag(self):
         self.assertIsNotNone(
             pos_tag(tokens, engine="wangchanberta", corpus="lst20_ud")
         )
+        with self.assertRaises(ValueError):
             self.assertIsNotNone(
                 tltk.pos_tag(tokens, corpus="lst20")
             )
@@ -373,6 +375,12 @@ def test_tltk_ner(self):
                 "พลเอกประยุกธ์ จันทร์โอชา ประกาศในฐานะหัวหน้า"
             )
         )
+        self.assertIsNotNone(
+            tltk.get_ner(
+                "พลเอกประยุกธ์ จันทร์โอชา ประกาศในฐานะหัวหน้า",
+                tag=True,
+            )
+        )
         self.assertIsNotNone(
             tltk.get_ner(
                 """คณะวิทยาศาสตร์ประยุกต์และวิศวกรรมศาสตร์ มหาวิทยาลัยขอนแก่น
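
For reference, a minimal usage sketch of the engines this patch series adds. It assumes PyThaiNLP is installed with the tltk extra (tltk>=1.3.8); the exact outputs depend on the installed tltk models and may differ from the values shown in the docstrings above.

# -*- coding: utf-8 -*-
# Illustrative only; all engine names below are the ones added by this series.
from pythainlp.spell import spell
from pythainlp.tag import pos_tag
from pythainlp.tag.tltk import get_ner
from pythainlp.tokenize import sent_tokenize, word_tokenize
from pythainlp.transliterate import romanize, transliterate

words = word_tokenize("ฉันรักภาษาไทย", engine="tltk")    # word segmentation
print(words)
print(sent_tokenize("ฉันรักภาษาไทย", engine="tltk"))     # sentence segmentation
print(pos_tag(words, engine="tltk"))                     # POS tagging (TNC tagset)
print(spell("เส้นตรบ", engine="tltk"))                   # spelling suggestions
print(romanize("สามารถ", engine="tltk"))                 # romanization
print(transliterate("สามารถ", engine="tltk_ipa"))        # IPA transcription
print(get_ner("เขาเรียนที่โรงเรียนนางรอง", tag=True))    # NER with HTML-like tags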