Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 24 additions & 10 deletions pythainlp/augment/wordnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
"WordNetAug",
"postype2wordnet",
]

import warnings
from pythainlp.corpus import wordnet
from collections import OrderedDict
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag
from pythainlp.util.messages import deprecation_message
from typing import List
from nltk.corpus import wordnet as wn
import itertools
Expand Down Expand Up @@ -127,9 +128,15 @@ def postype2wordnet(pos: str, corpus: str):
* *lst20* - LST20 Corpus
* *orchid* - Orchid Corpus
"""
if corpus not in ['lst20', 'orchid']:
if corpus not in ["lst20", "orchid"]:
return None
if corpus == 'lst20':
if corpus == "lst20":
dep_msg = deprecation_message(
[("corpus", "lst20")],
"function `wordnet.postype2wordnet`",
"4.0.0",
)
warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
return lst20[pos]
else:
return orchid[pos]
Expand All @@ -139,14 +146,12 @@ class WordNetAug:
"""
Text Augment using wordnet
"""

def __init__(self) -> None:
    """Create the augmenter; nothing is initialised at construction time."""
    pass

def find_synonyms(
self,
word: str,
pos: str = None,
postag_corpus: str = "lst20"
self, word: str, pos: str = None, postag_corpus: str = "lst20"
) -> List[str]:
"""
Find synonyms from wordnet
Expand All @@ -162,13 +167,13 @@ def find_synonyms(
self.list_synsets = wordnet.synsets(word)
else:
self.p2w_pos = postype2wordnet(pos, postag_corpus)
if self.p2w_pos != '':
if self.p2w_pos != "":
self.list_synsets = wordnet.synsets(word, pos=self.p2w_pos)
else:
self.list_synsets = wordnet.synsets(word)

for self.synset in wordnet.synsets(word):
for self.syn in self.synset.lemma_names(lang='tha'):
for self.syn in self.synset.lemma_names(lang="tha"):
self.synonyms.append(self.syn)

self.synonyms_without_duplicates = list(
Expand All @@ -182,7 +187,7 @@ def augment(
tokenize: object = word_tokenize,
max_syn_sent: int = 6,
postag: bool = True,
postag_corpus: str = "lst20"
postag_corpus: str = "lst20",
) -> List[List[str]]:
"""
Text Augment using wordnet
Expand Down Expand Up @@ -210,10 +215,19 @@ def augment(
('เรา', 'ชอบ', 'ไปยัง', 'ร.ร.'),
('เรา', 'ชอบ', 'ไปยัง', 'รร.')]
"""
if postag_corpus.startswith("lst20"):
dep_msg = deprecation_message(
[("postag_corpus", "lst20")],
"method `WordNetAug.augment`",
"4.0.0",
)
warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)

new_sentences = []
self.list_words = tokenize(sentence)
self.list_synonym = []
self.p_all = 1

if postag:
self.list_pos = pos_tag(self.list_words, corpus=postag_corpus)
for word, pos in self.list_pos:
Expand Down
37 changes: 27 additions & 10 deletions pythainlp/tag/named_entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import warnings
from typing import List, Tuple, Union

from pythainlp.util.messages import deprecation_message


class NER:
"""
Expand All @@ -30,43 +32,53 @@ class NER:

**Note**: The tltk engine supports only the NER model provided by tltk.
"""

def __init__(self, engine: str, corpus: str = "thainer") -> None:
    """
    Create the tagger and load the requested engine.

    A ``DeprecationWarning`` is issued when ``engine`` or ``corpus``
    starts with ``"lst20"``; afterwards the work is handed over to
    :meth:`load_engine`.

    :param str engine: name of the NER engine
    :param str corpus: name of the corpus (default: "thainer")
    """
    # Evaluate both checks up front (no short-circuit) before deciding.
    engine_is_lst20 = engine.startswith("lst20")
    corpus_is_lst20 = corpus.startswith("lst20")

    if engine_is_lst20 or corpus_is_lst20:
        warnings.warn(
            deprecation_message(
                [("engine", "lst20_onnx"), ("corpus", "lst20")],
                "`named_entity.NER`",
                "4.0.0",
            ),
            DeprecationWarning,
            stacklevel=2,  # point the warning at the caller of NER(...)
        )

    self.load_engine(engine=engine, corpus=corpus)

def load_engine(self, engine: str, corpus: str) -> None:
self.name_engine = engine
self.engine = None
if engine == "thainer" and corpus == "thainer":
from pythainlp.tag.thainer import ThaiNameTagger

self.engine = ThaiNameTagger()
elif engine == "lst20_onnx":
from pythainlp.tag.lst20_ner_onnx import LST20_NER_ONNX

self.engine = LST20_NER_ONNX()
elif engine == "wangchanberta":
from pythainlp.wangchanberta import ThaiNameTagger
if corpus=="lst20":
warnings.warn("""

if corpus == "lst20":
warnings.warn(
"""
LST20 corpus are free for research and open source only.\n
If you want to use in Commercial use, please contract NECTEC.\n
https://www.facebook.com/dancearmy/posts/10157641945708284
""")
"""
)
self.engine = ThaiNameTagger(dataset_name=corpus)
elif engine == "tltk":
from pythainlp.tag import tltk

self.engine = tltk
else:
raise ValueError(
"NER class not support {0} engine or {1} corpus.".format(
engine,
corpus
engine, corpus
)
)

def tag(
self,
text,
pos=True,
tag=False
self, text, pos=True, tag=False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
"""
This function tags named entities in text in IOB format.
Expand Down Expand Up @@ -103,7 +115,10 @@ def tag(
"""wangchanberta is not support part-of-speech tag.
It have not part-of-speech tag in output."""
)
if self.name_engine == "wangchanberta" or self.name_engine == "lst20_onnx":
if (
self.name_engine == "wangchanberta"
or self.name_engine == "lst20_onnx"
):
return self.engine.get_ner(text, tag=tag)
else:
return self.engine.get_ner(text, tag=tag, pos=pos)
Expand All @@ -119,11 +134,13 @@ class NNER:
**Options for engine**
* *thai_nner* - Thai NER engine
"""

def __init__(self, engine: str = "thai_nner") -> None:
    """
    Create the tagger and load its engine.

    :param str engine: engine name, passed on to :meth:`load_engine`
    """
    self.load_engine(engine)

def load_engine(self, engine: str = "thai_nner") -> None:
    """
    Instantiate ``Thai_NNER`` and store it on ``self.engine``.

    :param str engine: engine name; this method does not inspect it and
        always loads ``Thai_NNER``
    """
    # Imported inside the method, so the module is loaded only when
    # this method runs.
    from pythainlp.tag.thai_nner import Thai_NNER

    self.engine = Thai_NNER()

def tag(self, text) -> Tuple[List[str], List[dict]]:
Expand Down
13 changes: 11 additions & 2 deletions pythainlp/tag/perceptron.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from pythainlp.corpus import corpus_path, get_corpus_path
from pythainlp.tag import PerceptronTagger, lst20, orchid
from pythainlp.util.messages import deprecation_message

_ORCHID_FILENAME = "pos_orchid_perceptron.json"
_ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)
Expand Down Expand Up @@ -38,11 +39,13 @@ def _pud_tagger():

def _lst20_tagger():
global _LST20_TAGGER
warnings.warn("""
warnings.warn(
"""
LST20 corpus are free for research and open source only.\n
If you want to use in Commercial use, please contract NECTEC.\n
https://www.facebook.com/dancearmy/posts/10157641945708284
""")
"""
)
if not _LST20_TAGGER:
path = get_corpus_path(_LST20_TAGGER_NAME, version="0.2.4")
_LST20_TAGGER = PerceptronTagger(path=path)
Expand All @@ -69,6 +72,12 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
word_tags = _orchid_tagger().tag(words)
word_tags = orchid.post_process(word_tags, to_ud)
elif corpus == "lst20" or corpus == "lst20_ud":
dep_msg = deprecation_message(
[("postag_corpus", "lst20"), ("postag_corpus", "lst20_ud")],
"function `perceptron.tag`",
"4.0.0",
)
warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
words = lst20.pre_process(words)
word_tags = _lst20_tagger().tag(words)
word_tags = lst20.post_process(word_tags, to_ud)
Expand Down
29 changes: 23 additions & 6 deletions pythainlp/tag/pos_tag.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# -*- coding: utf-8 -*-
from typing import List, Tuple
import warnings

from pythainlp.util.messages import deprecation_message


def pos_tag(
words: List[str],
engine: str = "perceptron",
corpus: str = "orchid"
words: List[str], engine: str = "perceptron", corpus: str = "orchid"
) -> List[Tuple[str, str]]:
"""
Marks words with part-of-speech (POS) tags, such as 'NOUN' and 'VERB'.
Expand Down Expand Up @@ -98,21 +99,29 @@ def pos_tag(

_support_corpus = ["lst20", "lst20_ud", "orchid", "orchid_ud", "pud"]

if corpus.startswith("lst20"):
dep_msg = deprecation_message(
[("corpus", "lst20"), ("corpus", "lst20_ud")],
"function `pos_tag.pos_tag`",
"4.0.0",
)

if engine == "perceptron" and corpus in _support_corpus:
from pythainlp.tag.perceptron import tag as tag_
elif engine == "wangchanberta" and corpus == "lst20":
from pythainlp.wangchanberta.postag import pos_tag as tag_
words = ''.join(words)

words = "".join(words)
elif engine == "tltk":
from pythainlp.tag.tltk import pos_tag as tag_

corpus = "tnc"
elif engine == "unigram" and corpus in _support_corpus: # default
from pythainlp.tag.unigram import tag as tag_
else:
raise ValueError(
"pos_tag not support {0} engine or {1} corpus.".format(
engine,
corpus
engine, corpus
)
)

Expand Down Expand Up @@ -169,4 +178,12 @@ def pos_tag_sents(
if not sentences:
return []

if corpus.startswith("lst20"):
dep_msg = deprecation_message(
[("corpus", "lst20"), ("corpus", "lst20_ud")],
"function `pos_tag.pos_tag_sents`",
"4.0.0",
)
warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)

return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
13 changes: 11 additions & 2 deletions pythainlp/tag/unigram.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from pythainlp.corpus import corpus_path, get_corpus_path
from pythainlp.tag import lst20, orchid
from pythainlp.util.messages import deprecation_message

_ORCHID_FILENAME = "pos_orchid_unigram.json"
_ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)
Expand Down Expand Up @@ -42,11 +43,13 @@ def _pud_tagger():

def _lst20_tagger():
global _LST20_TAGGER
warnings.warn("""
warnings.warn(
"""
LST20 corpus are free for research and open source only.\n
If you want to use in Commercial use, please contract NECTEC.\n
https://www.facebook.com/dancearmy/posts/10157641945708284
""")
"""
)
if not _LST20_TAGGER:
path = get_corpus_path(_LST20_TAGGER_NAME)
with open(path, encoding="utf-8-sig") as fh:
Expand Down Expand Up @@ -84,6 +87,12 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
word_tags = _find_tag(words, _orchid_tagger())
word_tags = orchid.post_process(word_tags, to_ud)
elif corpus == "lst20" or corpus == "lst20_ud":
dep_msg = deprecation_message(
[("corpus", "lst20"), ("corpus", "lst20_ud")],
"function `unigram.tag`",
"4.0.0",
)
warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
words = lst20.pre_process(words)
word_tags = _find_tag(words, _lst20_tagger())
word_tags = lst20.post_process(word_tags, to_ud)
Expand Down
36 changes: 36 additions & 0 deletions pythainlp/util/messages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from typing import List, Tuple
from warnings import warn


def deprecation_message(
    deprecated_items: List[Tuple[str, str]],
    module_name: str,
    last_effective_version: str,
    recommended_action: str = "",
) -> str:
    """
    Build the text of a deprecation notice.

    The message has the shape
    ``"<items> of <module_name> will be deprecated in version <version>."``
    optionally followed by a space and ``recommended_action``.

    ``<items>`` is rendered as follows:

    * one ``(name, value)`` pair -> ``name='value'``
    * several pairs sharing the same name -> ``name=['v1', 'v2']``
    * pairs with different names -> ``n1='v1', n2='v2'``
    * no pairs at all -> the ``<items> of`` part is left out, so the
      message starts directly with ``module_name``

    :param deprecated_items: ``(argument name, argument value)`` pairs
    :param str module_name: description of the function/class concerned
    :param str last_effective_version: version string put into the message
    :param str recommended_action: optional sentence appended to the message
    :return: the assembled message
    :rtype: str
    """
    if not deprecated_items:
        # Nothing to enumerate: avoid emitting a message that starts
        # with a dangling " of ...".
        subject = module_name
    else:
        item_names = {name for name, _ in deprecated_items}
        if len(item_names) == 1:
            values = [value for _, value in deprecated_items]
            # A lone value is shown bare, several values as a list.
            shown = values[0] if len(values) == 1 else values
            described = f"{deprecated_items[0][0]}={shown!r}"
        else:
            described = ", ".join(
                f"{name}={value!r}" for name, value in deprecated_items
            )
        subject = f"{described} of {module_name}"

    message = (
        f"{subject} will be deprecated in version {last_effective_version}."
    )

    if recommended_action:
        message += " " + recommended_action

    return message
Loading