diff --git a/pythainlp/augment/wordnet.py b/pythainlp/augment/wordnet.py
index 30a1e90fc..905ecd87c 100644
--- a/pythainlp/augment/wordnet.py
+++ b/pythainlp/augment/wordnet.py
@@ -6,11 +6,12 @@
"WordNetAug",
"postype2wordnet",
]
-
+import warnings
from pythainlp.corpus import wordnet
from collections import OrderedDict
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag
+from pythainlp.util.messages import deprecation_message
from typing import List
from nltk.corpus import wordnet as wn
import itertools
@@ -127,9 +128,15 @@ def postype2wordnet(pos: str, corpus: str):
* *lst20* - LST20 Corpus
* *orchid* - Orchid Corpus
"""
- if corpus not in ['lst20', 'orchid']:
+ if corpus not in ["lst20", "orchid"]:
return None
- if corpus == 'lst20':
+ if corpus == "lst20":
+ dep_msg = deprecation_message(
+ [("corpus", "lst20")],
+ "function `wordnet.postype2wordnet`",
+ "4.0.0",
+ )
+ warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
return lst20[pos]
else:
return orchid[pos]
@@ -139,14 +146,12 @@ class WordNetAug:
"""
Text Augment using wordnet
"""
+
def __init__(self):
pass
def find_synonyms(
- self,
- word: str,
- pos: str = None,
- postag_corpus: str = "lst20"
+ self, word: str, pos: str = None, postag_corpus: str = "lst20"
) -> List[str]:
"""
Find synonyms from wordnet
@@ -162,13 +167,13 @@ def find_synonyms(
self.list_synsets = wordnet.synsets(word)
else:
self.p2w_pos = postype2wordnet(pos, postag_corpus)
- if self.p2w_pos != '':
+ if self.p2w_pos != "":
self.list_synsets = wordnet.synsets(word, pos=self.p2w_pos)
else:
self.list_synsets = wordnet.synsets(word)
for self.synset in wordnet.synsets(word):
- for self.syn in self.synset.lemma_names(lang='tha'):
+ for self.syn in self.synset.lemma_names(lang="tha"):
self.synonyms.append(self.syn)
self.synonyms_without_duplicates = list(
@@ -182,7 +187,7 @@ def augment(
tokenize: object = word_tokenize,
max_syn_sent: int = 6,
postag: bool = True,
- postag_corpus: str = "lst20"
+ postag_corpus: str = "lst20",
) -> List[List[str]]:
"""
Text Augment using wordnet
@@ -210,10 +215,19 @@ def augment(
('เรา', 'ชอบ', 'ไปยัง', 'ร.ร.'),
('เรา', 'ชอบ', 'ไปยัง', 'รร.')]
"""
+ if postag_corpus.startswith("lst20"):
+ dep_msg = deprecation_message(
+ [("postag_corpus", "lst20")],
+ "method `WordNetAug.augment`",
+ "4.0.0",
+ )
+ warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
+
new_sentences = []
self.list_words = tokenize(sentence)
self.list_synonym = []
self.p_all = 1
+
if postag:
self.list_pos = pos_tag(self.list_words, corpus=postag_corpus)
for word, pos in self.list_pos:
diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py
index fd5930e48..c861d6c46 100644
--- a/pythainlp/tag/named_entity.py
+++ b/pythainlp/tag/named_entity.py
@@ -5,6 +5,8 @@
import warnings
from typing import List, Tuple, Union
+from pythainlp.util.messages import deprecation_message
+
class NER:
"""
@@ -30,7 +32,15 @@ class NER:
**Note**: for tltk engine, It's support ner model from tltk only.
"""
+
def __init__(self, engine: str, corpus: str = "thainer") -> None:
+        if any(arg.startswith("lst20") for arg in (engine, corpus)):
+            dep_msg = deprecation_message(
+                [("engine", "lst20_onnx"), ("corpus", "lst20")],
+                "`named_entity.NER`",
+                "4.0.0",
+            )
+            warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
self.load_engine(engine=engine, corpus=corpus)
def load_engine(self, engine: str, corpus: str) -> None:
@@ -38,35 +48,37 @@ def load_engine(self, engine: str, corpus: str) -> None:
self.engine = None
if engine == "thainer" and corpus == "thainer":
from pythainlp.tag.thainer import ThaiNameTagger
+
self.engine = ThaiNameTagger()
elif engine == "lst20_onnx":
from pythainlp.tag.lst20_ner_onnx import LST20_NER_ONNX
+
self.engine = LST20_NER_ONNX()
elif engine == "wangchanberta":
from pythainlp.wangchanberta import ThaiNameTagger
- if corpus=="lst20":
- warnings.warn("""
+
+ if corpus == "lst20":
+ warnings.warn(
+ """
LST20 corpus are free for research and open source only.\n
If you want to use in Commercial use, please contract NECTEC.\n
https://www.facebook.com/dancearmy/posts/10157641945708284
- """)
+ """
+ )
self.engine = ThaiNameTagger(dataset_name=corpus)
elif engine == "tltk":
from pythainlp.tag import tltk
+
self.engine = tltk
else:
raise ValueError(
"NER class not support {0} engine or {1} corpus.".format(
- engine,
- corpus
+ engine, corpus
)
)
def tag(
- self,
- text,
- pos=True,
- tag=False
+ self, text, pos=True, tag=False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
"""
This function tags named-entitiy from text in IOB format.
@@ -103,7 +115,10 @@ def tag(
"""wangchanberta is not support part-of-speech tag.
It have not part-of-speech tag in output."""
)
- if self.name_engine == "wangchanberta" or self.name_engine == "lst20_onnx":
+ if (
+ self.name_engine == "wangchanberta"
+ or self.name_engine == "lst20_onnx"
+ ):
return self.engine.get_ner(text, tag=tag)
else:
return self.engine.get_ner(text, tag=tag, pos=pos)
@@ -119,11 +134,13 @@ class NNER:
**Options for engine**
* *thai_nner* - Thai NER engine
"""
+
def __init__(self, engine: str = "thai_nner") -> None:
self.load_engine(engine)
def load_engine(self, engine: str = "thai_nner") -> None:
from pythainlp.tag.thai_nner import Thai_NNER
+
self.engine = Thai_NNER()
def tag(self, text) -> Tuple[List[str], List[dict]]:
diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py
index 7db88f750..6c502a842 100644
--- a/pythainlp/tag/perceptron.py
+++ b/pythainlp/tag/perceptron.py
@@ -8,6 +8,7 @@
from pythainlp.corpus import corpus_path, get_corpus_path
from pythainlp.tag import PerceptronTagger, lst20, orchid
+from pythainlp.util.messages import deprecation_message
_ORCHID_FILENAME = "pos_orchid_perceptron.json"
_ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)
@@ -38,11 +39,13 @@ def _pud_tagger():
def _lst20_tagger():
global _LST20_TAGGER
- warnings.warn("""
+ warnings.warn(
+ """
LST20 corpus are free for research and open source only.\n
If you want to use in Commercial use, please contract NECTEC.\n
https://www.facebook.com/dancearmy/posts/10157641945708284
- """)
+ """
+ )
if not _LST20_TAGGER:
path = get_corpus_path(_LST20_TAGGER_NAME, version="0.2.4")
_LST20_TAGGER = PerceptronTagger(path=path)
@@ -69,6 +72,12 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
word_tags = _orchid_tagger().tag(words)
word_tags = orchid.post_process(word_tags, to_ud)
elif corpus == "lst20" or corpus == "lst20_ud":
+        dep_msg = deprecation_message(
+            [("corpus", "lst20"), ("corpus", "lst20_ud")],
+            "function `perceptron.tag`",
+            "4.0.0",
+        )
+        warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
words = lst20.pre_process(words)
word_tags = _lst20_tagger().tag(words)
word_tags = lst20.post_process(word_tags, to_ud)
diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index 68410d741..b678595bd 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -1,11 +1,12 @@
# -*- coding: utf-8 -*-
from typing import List, Tuple
+import warnings
+
+from pythainlp.util.messages import deprecation_message
def pos_tag(
- words: List[str],
- engine: str = "perceptron",
- corpus: str = "orchid"
+ words: List[str], engine: str = "perceptron", corpus: str = "orchid"
) -> List[Tuple[str, str]]:
"""
Marks words with part-of-speech (POS) tags, such as 'NOUN' and 'VERB'.
@@ -98,21 +99,32 @@ def pos_tag(
_support_corpus = ["lst20", "lst20_ud", "orchid", "orchid_ud", "pud"]
+    # The LST20 corpora are deprecated: build the notice and emit it so
+    # that it is attributed to the caller of pos_tag (stacklevel=2).
+    if corpus.startswith("lst20"):
+        dep_msg = deprecation_message(
+            [("corpus", "lst20"), ("corpus", "lst20_ud")],
+            "function `pos_tag.pos_tag`",
+            "4.0.0",
+        )
+        warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
+
if engine == "perceptron" and corpus in _support_corpus:
from pythainlp.tag.perceptron import tag as tag_
elif engine == "wangchanberta" and corpus == "lst20":
from pythainlp.wangchanberta.postag import pos_tag as tag_
- words = ''.join(words)
+
+ words = "".join(words)
elif engine == "tltk":
from pythainlp.tag.tltk import pos_tag as tag_
+
corpus = "tnc"
elif engine == "unigram" and corpus in _support_corpus: # default
from pythainlp.tag.unigram import tag as tag_
else:
raise ValueError(
"pos_tag not support {0} engine or {1} corpus.".format(
- engine,
- corpus
+ engine, corpus
)
)
@@ -169,4 +178,12 @@ def pos_tag_sents(
if not sentences:
return []
+ if corpus.startswith("lst20"):
+ dep_msg = deprecation_message(
+ [("corpus", "lst20"), ("corpus", "lst20_ud")],
+ "function `pos_tag.pos_tag_sents`",
+ "4.0.0",
+ )
+ warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
+
return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py
index 7c112b411..87615279c 100644
--- a/pythainlp/tag/unigram.py
+++ b/pythainlp/tag/unigram.py
@@ -9,6 +9,7 @@
from pythainlp.corpus import corpus_path, get_corpus_path
from pythainlp.tag import lst20, orchid
+from pythainlp.util.messages import deprecation_message
_ORCHID_FILENAME = "pos_orchid_unigram.json"
_ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)
@@ -42,11 +43,13 @@ def _pud_tagger():
def _lst20_tagger():
global _LST20_TAGGER
- warnings.warn("""
+ warnings.warn(
+ """
LST20 corpus are free for research and open source only.\n
If you want to use in Commercial use, please contract NECTEC.\n
https://www.facebook.com/dancearmy/posts/10157641945708284
- """)
+ """
+ )
if not _LST20_TAGGER:
path = get_corpus_path(_LST20_TAGGER_NAME)
with open(path, encoding="utf-8-sig") as fh:
@@ -84,6 +87,12 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
word_tags = _find_tag(words, _orchid_tagger())
word_tags = orchid.post_process(word_tags, to_ud)
elif corpus == "lst20" or corpus == "lst20_ud":
+ dep_msg = deprecation_message(
+ [("corpus", "lst20"), ("corpus", "lst20_ud")],
+ "function `unigram.tag`",
+ "4.0.0",
+ )
+ warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
words = lst20.pre_process(words)
word_tags = _find_tag(words, _lst20_tagger())
word_tags = lst20.post_process(word_tags, to_ud)
diff --git a/pythainlp/util/messages.py b/pythainlp/util/messages.py
new file mode 100644
index 000000000..f0f7eb1ae
--- /dev/null
+++ b/pythainlp/util/messages.py
@@ -0,0 +1,53 @@
+from typing import List, Tuple
+from warnings import warn
+
+
+def deprecation_message(
+    deprecated_items: List[Tuple[str, str]],
+    module_name: str,
+    last_effective_version: str,
+    recommended_action: str = "",
+) -> str:
+    """
+    Build a human-readable deprecation notice.
+
+    The function only formats text; emitting it (for example through
+    ``warnings.warn``) is left to the caller.
+
+    :param deprecated_items: ``(argument name, deprecated value)`` pairs
+    :param str module_name: description of the affected function, method
+        or class; inserted verbatim after ``" of "``
+    :param str last_effective_version: version string placed at the end
+        of the sentence
+    :param str recommended_action: optional extra sentence appended
+        after a single space
+    :return: the assembled message
+    :rtype: str
+
+    :Example:
+    ::
+
+        deprecation_message([("corpus", "lst20")], "function `f`", "4.0.0")
+        # output: "corpus='lst20' of function `f` will be deprecated
+        # in version 4.0.0."
+    """
+    item_names = {name for name, _ in deprecated_items}
+    if len(item_names) == 1:
+        # Every pair refers to the same argument: name it once and show
+        # either the lone value or the list of all values.
+        values = [value for _, value in deprecated_items]
+        shown = values[0] if len(values) == 1 else values
+        dep_msg = f"{deprecated_items[0][0]}={shown!r}"
+    else:
+        dep_msg = ", ".join(
+            f"{name}={value!r}" for name, value in deprecated_items
+        )
+
+    dep_msg += (
+        f" of {module_name}"
+        f" will be deprecated in version {last_effective_version}."
+    )
+    if recommended_action:
+        dep_msg += " " + recommended_action
+
+    return dep_msg
diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index 757df3fe0..bc00eabff 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -1,24 +1,25 @@
# -*- coding: utf-8 -*-
from typing import List, Tuple, Union
import re
+import warnings
from transformers import (
CamembertTokenizer,
pipeline,
)
+from pythainlp.util.messages import deprecation_message
+
_model_name = "wangchanberta-base-att-spm-uncased"
_tokenizer = CamembertTokenizer.from_pretrained(
- f'airesearch/{_model_name}',
- revision='main')
+ f"airesearch/{_model_name}", revision="main"
+)
if _model_name == "wangchanberta-base-att-spm-uncased":
- _tokenizer.additional_special_tokens = ['NOTUSED', 'NOTUSED', '<_>']
+ _tokenizer.additional_special_tokens = ["NOTUSED", "NOTUSED", "<_>"]
class ThaiNameTagger:
def __init__(
- self,
- dataset_name: str = "thainer",
- grouped_entities: bool = True
+ self, dataset_name: str = "thainer", grouped_entities: bool = True
):
"""
This function tags named-entitiy from text in IOB format.
@@ -31,23 +32,29 @@ def __init__(
* *lst20* - LST20 Corpus
:param bool grouped_entities: grouped entities
"""
+ if dataset_name == "lst20":
+ dep_msg = deprecation_message(
+ [("dataset_name", "lst20")], "class `ThaiNameTagger`", "4.0.0"
+ )
+ warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
self.dataset_name = dataset_name
self.grouped_entities = grouped_entities
self.classify_tokens = pipeline(
- task='ner',
+ task="ner",
tokenizer=_tokenizer,
- model=f'airesearch/{_model_name}',
- revision=f'finetuned@{self.dataset_name}-ner',
+ model=f"airesearch/{_model_name}",
+ revision=f"finetuned@{self.dataset_name}-ner",
ignore_labels=[],
- grouped_entities=self.grouped_entities)
+ grouped_entities=self.grouped_entities,
+ )
def _IOB(self, tag):
if tag != "O":
- return "B-"+tag
+ return "B-" + tag
return "O"
def _clear_tag(self, tag):
- return tag.replace('B-', '').replace('I-', '')
+ return tag.replace("B-", "").replace("I-", "")
def get_ner(
self, text: str, tag: bool = False
@@ -72,40 +79,41 @@ def get_ner(
if self.grouped_entities and self.dataset_name == "thainer":
self.sent_ner = [
(
- i['word'].replace("<_>", " ").replace('▁', ''),
- self._IOB(i['entity_group'])
- ) for i in self.json_ner
+ i["word"].replace("<_>", " ").replace("▁", ""),
+ self._IOB(i["entity_group"]),
+ )
+ for i in self.json_ner
]
elif self.dataset_name == "thainer":
self.sent_ner = [
- (
- i['word'].replace("<_>", " ").replace('▁', ''), i['entity']
- ) for i in self.json_ner if i['word'] != '▁'
+ (i["word"].replace("<_>", " ").replace("▁", ""), i["entity"])
+ for i in self.json_ner
+ if i["word"] != "▁"
]
elif self.grouped_entities and self.dataset_name == "lst20":
self.sent_ner = [
(
- i['word'].replace("<_>", " ").replace('▁', ''),
- i['entity_group'].replace('_', '-').replace('E-', 'I-')
- ) for i in self.json_ner
+ i["word"].replace("<_>", " ").replace("▁", ""),
+ i["entity_group"].replace("_", "-").replace("E-", "I-"),
+ )
+ for i in self.json_ner
]
else:
self.sent_ner = [
(
- i['word'].replace("<_>", " ").replace('▁', ''),
- i['entity'].replace('_', '-').replace('E-', 'I-')
- ) for i in self.json_ner
+ i["word"].replace("<_>", " ").replace("▁", ""),
+ i["entity"].replace("_", "-").replace("E-", "I-"),
+ )
+ for i in self.json_ner
]
- if self.sent_ner[0][0] == '' and len(self.sent_ner) > 1:
+ if self.sent_ner[0][0] == "" and len(self.sent_ner) > 1:
self.sent_ner = self.sent_ner[1:]
for idx, (word, ner) in enumerate(self.sent_ner):
if idx > 0 and ner.startswith("B-"):
- if (
- self._clear_tag(ner) == self._clear_tag(
- self.sent_ner[idx-1][1]
- )
+ if self._clear_tag(ner) == self._clear_tag(
+ self.sent_ner[idx - 1][1]
):
- self.sent_ner[idx] = (word, ner.replace('B-', 'I-'))
+ self.sent_ner[idx] = (word, ner.replace("B-", "I-"))
if tag:
temp = ""
sent = ""