diff --git a/pythainlp/augment/wordnet.py b/pythainlp/augment/wordnet.py index 30a1e90fc..905ecd87c 100644 --- a/pythainlp/augment/wordnet.py +++ b/pythainlp/augment/wordnet.py @@ -6,11 +6,12 @@ "WordNetAug", "postype2wordnet", ] - +import warnings from pythainlp.corpus import wordnet from collections import OrderedDict from pythainlp.tokenize import word_tokenize from pythainlp.tag import pos_tag +from pythainlp.util.messages import deprecation_message from typing import List from nltk.corpus import wordnet as wn import itertools @@ -127,9 +128,15 @@ def postype2wordnet(pos: str, corpus: str): * *lst20* - LST20 Corpus * *orchid* - Orchid Corpus """ - if corpus not in ['lst20', 'orchid']: + if corpus not in ["lst20", "orchid"]: return None - if corpus == 'lst20': + if corpus == "lst20": + dep_msg = deprecation_message( + [("corpus", "lst20")], + "function `wordnet.postype2wordnet`", + "4.0.0", + ) + warnings.warn(dep_msg, DeprecationWarning, stacklevel=2) return lst20[pos] else: return orchid[pos] @@ -139,14 +146,12 @@ class WordNetAug: """ Text Augment using wordnet """ + def __init__(self): pass def find_synonyms( - self, - word: str, - pos: str = None, - postag_corpus: str = "lst20" + self, word: str, pos: str = None, postag_corpus: str = "lst20" ) -> List[str]: """ Find synonyms from wordnet @@ -162,13 +167,13 @@ def find_synonyms( self.list_synsets = wordnet.synsets(word) else: self.p2w_pos = postype2wordnet(pos, postag_corpus) - if self.p2w_pos != '': + if self.p2w_pos != "": self.list_synsets = wordnet.synsets(word, pos=self.p2w_pos) else: self.list_synsets = wordnet.synsets(word) for self.synset in wordnet.synsets(word): - for self.syn in self.synset.lemma_names(lang='tha'): + for self.syn in self.synset.lemma_names(lang="tha"): self.synonyms.append(self.syn) self.synonyms_without_duplicates = list( @@ -182,7 +187,7 @@ def augment( tokenize: object = word_tokenize, max_syn_sent: int = 6, postag: bool = True, - postag_corpus: str = "lst20" + postag_corpus: 
str = "lst20", ) -> List[List[str]]: """ Text Augment using wordnet @@ -210,10 +215,19 @@ def augment( ('เรา', 'ชอบ', 'ไปยัง', 'ร.ร.'), ('เรา', 'ชอบ', 'ไปยัง', 'รร.')] """ + if postag_corpus.startswith("lst20"): + dep_msg = deprecation_message( + [("postag_corpus", "lst20")], + "method `WordNetAug.augment`", + "4.0.0", + ) + warnings.warn(dep_msg, DeprecationWarning, stacklevel=2) + new_sentences = [] self.list_words = tokenize(sentence) self.list_synonym = [] self.p_all = 1 + if postag: self.list_pos = pos_tag(self.list_words, corpus=postag_corpus) for word, pos in self.list_pos: diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index fd5930e48..c861d6c46 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -5,6 +5,8 @@ import warnings from typing import List, Tuple, Union +from pythainlp.util.messages import deprecation_message + class NER: """ @@ -30,7 +32,15 @@ class NER: **Note**: for tltk engine, It's support ner model from tltk only. 
""" + def __init__(self, engine: str, corpus: str = "thainer") -> None: + if any([arg.startswith("lst20") for arg in (engine, corpus)]): + dep_msg = deprecation_message( + [("engine", "lst20_onnx"), ("corpus", "lst20")], + "`named_entity.NER`", + "4.0.0", + ) + warnings.warn(dep_msg, DeprecationWarning, stacklevel=2) self.load_engine(engine=engine, corpus=corpus) def load_engine(self, engine: str, corpus: str) -> None: @@ -38,35 +48,37 @@ def load_engine(self, engine: str, corpus: str) -> None: self.engine = None if engine == "thainer" and corpus == "thainer": from pythainlp.tag.thainer import ThaiNameTagger + self.engine = ThaiNameTagger() elif engine == "lst20_onnx": from pythainlp.tag.lst20_ner_onnx import LST20_NER_ONNX + self.engine = LST20_NER_ONNX() elif engine == "wangchanberta": from pythainlp.wangchanberta import ThaiNameTagger - if corpus=="lst20": - warnings.warn(""" + + if corpus == "lst20": + warnings.warn( + """ LST20 corpus are free for research and open source only.\n If you want to use in Commercial use, please contract NECTEC.\n https://www.facebook.com/dancearmy/posts/10157641945708284 - """) + """ + ) self.engine = ThaiNameTagger(dataset_name=corpus) elif engine == "tltk": from pythainlp.tag import tltk + self.engine = tltk else: raise ValueError( "NER class not support {0} engine or {1} corpus.".format( - engine, - corpus + engine, corpus ) ) def tag( - self, - text, - pos=True, - tag=False + self, text, pos=True, tag=False ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: """ This function tags named-entitiy from text in IOB format. @@ -103,7 +115,10 @@ def tag( """wangchanberta is not support part-of-speech tag. 
It have not part-of-speech tag in output.""" ) - if self.name_engine == "wangchanberta" or self.name_engine == "lst20_onnx": + if ( + self.name_engine == "wangchanberta" + or self.name_engine == "lst20_onnx" + ): return self.engine.get_ner(text, tag=tag) else: return self.engine.get_ner(text, tag=tag, pos=pos) @@ -119,11 +134,13 @@ class NNER: **Options for engine** * *thai_nner* - Thai NER engine """ + def __init__(self, engine: str = "thai_nner") -> None: self.load_engine(engine) def load_engine(self, engine: str = "thai_nner") -> None: from pythainlp.tag.thai_nner import Thai_NNER + self.engine = Thai_NNER() def tag(self, text) -> Tuple[List[str], List[dict]]: diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py index 7db88f750..6c502a842 100644 --- a/pythainlp/tag/perceptron.py +++ b/pythainlp/tag/perceptron.py @@ -8,6 +8,7 @@ from pythainlp.corpus import corpus_path, get_corpus_path from pythainlp.tag import PerceptronTagger, lst20, orchid +from pythainlp.util.messages import deprecation_message _ORCHID_FILENAME = "pos_orchid_perceptron.json" _ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME) @@ -38,11 +39,13 @@ def _pud_tagger(): def _lst20_tagger(): global _LST20_TAGGER - warnings.warn(""" + warnings.warn( + """ LST20 corpus are free for research and open source only.\n If you want to use in Commercial use, please contract NECTEC.\n https://www.facebook.com/dancearmy/posts/10157641945708284 - """) + """ + ) if not _LST20_TAGGER: path = get_corpus_path(_LST20_TAGGER_NAME, version="0.2.4") _LST20_TAGGER = PerceptronTagger(path=path) @@ -69,6 +72,12 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]: word_tags = _orchid_tagger().tag(words) word_tags = orchid.post_process(word_tags, to_ud) elif corpus == "lst20" or corpus == "lst20_ud": + dep_msg = deprecation_message( + [("postag_corpus", "lst20"), ("postag_corpus", "lst20_ud")], + "function `perceptron.tag`", + "4.0.0", + ) + warnings.warn(dep_msg, 
DeprecationWarning, stacklevel=2) words = lst20.pre_process(words) word_tags = _lst20_tagger().tag(words) word_tags = lst20.post_process(word_tags, to_ud) diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index 68410d741..b678595bd 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -1,11 +1,12 @@ # -*- coding: utf-8 -*- from typing import List, Tuple +import warnings + +from pythainlp.util.messages import deprecation_message def pos_tag( - words: List[str], - engine: str = "perceptron", - corpus: str = "orchid" + words: List[str], engine: str = "perceptron", corpus: str = "orchid" ) -> List[Tuple[str, str]]: """ Marks words with part-of-speech (POS) tags, such as 'NOUN' and 'VERB'. @@ -98,21 +99,29 @@ def pos_tag( _support_corpus = ["lst20", "lst20_ud", "orchid", "orchid_ud", "pud"] + if corpus.startswith("lst20"): + dep_msg = deprecation_message( + [("corpus", "lst20"), ("corpus", "lst20_ud")], + "function `pos_tag.pos_tag`", + "4.0.0", + ) + if engine == "perceptron" and corpus in _support_corpus: from pythainlp.tag.perceptron import tag as tag_ elif engine == "wangchanberta" and corpus == "lst20": from pythainlp.wangchanberta.postag import pos_tag as tag_ - words = ''.join(words) + + words = "".join(words) elif engine == "tltk": from pythainlp.tag.tltk import pos_tag as tag_ + corpus = "tnc" elif engine == "unigram" and corpus in _support_corpus: # default from pythainlp.tag.unigram import tag as tag_ else: raise ValueError( "pos_tag not support {0} engine or {1} corpus.".format( - engine, - corpus + engine, corpus ) ) @@ -169,4 +178,12 @@ def pos_tag_sents( if not sentences: return [] + if corpus.startswith("lst20"): + dep_msg = deprecation_message( + [("corpus", "lst20"), ("corpus", "lst20_ud")], + "function `pos_tag.pos_tag_sents`", + "4.0.0", + ) + warnings.warn(dep_msg, DeprecationWarning, stacklevel=2) + return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences] diff --git a/pythainlp/tag/unigram.py 
def deprecation_message(
    deprecated_items: List[Tuple[str, str]],
    module_name: str,
    last_effective_version: str,
    recommended_action: str = "",
) -> str:
    """
    Build a human-readable deprecation notice.

    The notice has the form
    ``<items> of <module_name> will be deprecated in version <version>.``
    optionally followed by a recommended action.

    How ``<items>`` is rendered depends on ``deprecated_items``:

    * one pair -> ``name='value'``
    * several pairs that share one name -> ``name=['value1', 'value2']``
    * pairs with different names -> ``name1='value1', name2='value2'``
    * no pairs -> the ``<items> of`` part is omitted and the notice
      refers to ``module_name`` itself

    :param List[Tuple[str, str]] deprecated_items: ``(argument name,
        deprecated value)`` pairs
    :param str module_name: description of the function, method or class
        that owns the deprecated arguments
    :param str last_effective_version: version string inserted into the
        notice
    :param str recommended_action: optional sentence appended after the
        notice, separated by a single space

    :return: the formatted deprecation message
    :rtype: str

    :Example:
    ::

        deprecation_message([("corpus", "lst20")], "function `f`", "4.0.0")
        # output: "corpus='lst20' of function `f` will be deprecated
        #          in version 4.0.0."
    """
    if not deprecated_items:
        # Nothing specific to list: avoid emitting a dangling " of ..."
        # prefix and talk about the module as a whole instead.
        subject = module_name
    else:
        distinct_names = {name for name, _ in deprecated_items}
        if len(distinct_names) == 1:
            # All pairs describe the same argument; show a bare value when
            # there is only one, otherwise a list of every value.
            values = [value for _, value in deprecated_items]
            shown = values[0] if len(values) == 1 else values
            described = f"{deprecated_items[0][0]}={shown!r}"
        else:
            described = ", ".join(
                f"{name}={value!r}" for name, value in deprecated_items
            )
        subject = f"{described} of {module_name}"

    message = (
        f"{subject} will be deprecated in version {last_effective_version}."
    )

    if recommended_action:
        message += " " + recommended_action

    return message
@@ -31,23 +32,29 @@ def __init__( * *lst20* - LST20 Corpus :param bool grouped_entities: grouped entities """ + if dataset_name == "lst20": + dep_msg = deprecation_message( + [("dataset_name", "lst20")], "class `ThaiNameTagger`", "4.0.0" + ) + warnings.warn(dep_msg, DeprecationWarning, stacklevel=2) self.dataset_name = dataset_name self.grouped_entities = grouped_entities self.classify_tokens = pipeline( - task='ner', + task="ner", tokenizer=_tokenizer, - model=f'airesearch/{_model_name}', - revision=f'finetuned@{self.dataset_name}-ner', + model=f"airesearch/{_model_name}", + revision=f"finetuned@{self.dataset_name}-ner", ignore_labels=[], - grouped_entities=self.grouped_entities) + grouped_entities=self.grouped_entities, + ) def _IOB(self, tag): if tag != "O": - return "B-"+tag + return "B-" + tag return "O" def _clear_tag(self, tag): - return tag.replace('B-', '').replace('I-', '') + return tag.replace("B-", "").replace("I-", "") def get_ner( self, text: str, tag: bool = False @@ -72,40 +79,41 @@ def get_ner( if self.grouped_entities and self.dataset_name == "thainer": self.sent_ner = [ ( - i['word'].replace("<_>", " ").replace('▁', ''), - self._IOB(i['entity_group']) - ) for i in self.json_ner + i["word"].replace("<_>", " ").replace("▁", ""), + self._IOB(i["entity_group"]), + ) + for i in self.json_ner ] elif self.dataset_name == "thainer": self.sent_ner = [ - ( - i['word'].replace("<_>", " ").replace('▁', ''), i['entity'] - ) for i in self.json_ner if i['word'] != '▁' + (i["word"].replace("<_>", " ").replace("▁", ""), i["entity"]) + for i in self.json_ner + if i["word"] != "▁" ] elif self.grouped_entities and self.dataset_name == "lst20": self.sent_ner = [ ( - i['word'].replace("<_>", " ").replace('▁', ''), - i['entity_group'].replace('_', '-').replace('E-', 'I-') - ) for i in self.json_ner + i["word"].replace("<_>", " ").replace("▁", ""), + i["entity_group"].replace("_", "-").replace("E-", "I-"), + ) + for i in self.json_ner ] else: self.sent_ner = [ ( - 
i['word'].replace("<_>", " ").replace('▁', ''), - i['entity'].replace('_', '-').replace('E-', 'I-') - ) for i in self.json_ner + i["word"].replace("<_>", " ").replace("▁", ""), + i["entity"].replace("_", "-").replace("E-", "I-"), + ) + for i in self.json_ner ] - if self.sent_ner[0][0] == '' and len(self.sent_ner) > 1: + if self.sent_ner[0][0] == "" and len(self.sent_ner) > 1: self.sent_ner = self.sent_ner[1:] for idx, (word, ner) in enumerate(self.sent_ner): if idx > 0 and ner.startswith("B-"): - if ( - self._clear_tag(ner) == self._clear_tag( - self.sent_ner[idx-1][1] - ) + if self._clear_tag(ner) == self._clear_tag( + self.sent_ner[idx - 1][1] ): - self.sent_ner[idx] = (word, ner.replace('B-', 'I-')) + self.sent_ner[idx] = (word, ner.replace("B-", "I-")) if tag: temp = "" sent = ""