Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pythainlp/tag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,11 @@
"chunk_parse",
"NER",
"NNER",
"pos_tag_transformers"
]

from pythainlp.tag.locations import tag_provinces
from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents
from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents, pos_tag_transformers
from pythainlp.tag._tag_perceptron import PerceptronTagger
from pythainlp.tag.chunk import chunk_parse
from pythainlp.tag.named_entity import NER, NNER
45 changes: 45 additions & 0 deletions pythainlp/tag/pos_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from typing import List, Tuple



def pos_tag(
words: List[str], engine: str = "perceptron", corpus: str = "orchid"
) -> List[Tuple[str, str]]:
Expand Down Expand Up @@ -176,3 +177,47 @@ def pos_tag_sents(
return []

return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]


# Maps each supported engine name to the Hugging Face Hub model id that is
# used to load both the model weights and the tokenizer.
_POS_TAG_TRANSFORMERS_MODELS = {
    "wangchanberta-ud-thai-pud-upos":
        "Pavarissy/wangchanberta-ud-thai-pud-upos",
    "mdeberta-v3-ud-thai-pud-upos":
        "Pavarissy/mdeberta-v3-ud-thai-pud-upos",
    "bert-base-th-cased-blackboard": "lunarlist/pos_thai",
}


def pos_tag_transformers(
    words: str, engine: str = "bert-base-th-cased-blackboard"
):
    """
    Tag a text with a transformers token-classification model.

    :param str words: text to tag; it is passed unchanged to the
        token-classification pipeline
    :param str engine: name of the model to use, one of

        * *wangchanberta-ud-thai-pud-upos*
        * *mdeberta-v3-ud-thai-pud-upos*
        * *bert-base-th-cased-blackboard* (default)

    :return: an empty list if ``words`` is empty, otherwise the raw output
        of the pipeline with sub-word tokens grouped together
    :raises ValueError: if ``engine`` is not one of the supported names
    :raises ImportError: if the ``transformers`` package is not installed
    """
    # Nothing to tag: answer immediately, without importing transformers
    # or downloading any model.
    if not words:
        return []

    # Validate the engine before the (slow) optional import so that a typo
    # in the engine name is reported right away.
    model_id = _POS_TAG_TRANSFORMERS_MODELS.get(engine)
    if model_id is None:
        raise ValueError(
            "pos_tag_transformers not support {0} engine.".format(engine)
        )

    # transformers is an optional dependency, so it is imported lazily.
    try:
        from transformers import (
            AutoModelForTokenClassification,
            AutoTokenizer,
            TokenClassificationPipeline,
        )
    except ImportError as err:
        raise ImportError(
            "Not found transformers! Please install transformers by pip install transformers"
        ) from err

    model = AutoModelForTokenClassification.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # aggregation_strategy="simple" is the documented replacement for the
    # deprecated grouped_entities=True flag.
    pipeline = TokenClassificationPipeline(
        model=model, tokenizer=tokenizer, aggregation_strategy="simple"
    )

    return pipeline(words)
12 changes: 12 additions & 0 deletions tests/test_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
perceptron,
pos_tag,
pos_tag_sents,
pos_tag_transformers,
unigram,
tltk,
NER,
NNER,

)
from pythainlp.tag.locations import tag_provinces
from pythainlp.tag.thainer import ThaiNameTagger
Expand Down Expand Up @@ -362,3 +364,13 @@ def test_NER_class(self):
def test_NNER_class(self):
    """NNER.tag must give a non-None result for a Thai sentence."""
    tagger = NNER()
    tagged = tagger.tag("แมวทำอะไรตอนห้าโมงเช้า")
    self.assertIsNotNone(tagged)

def test_pos_tag_transformers(self):
    """Each supported engine gives a result; an unknown engine raises."""
    sentence = "แมวทำอะไรตอนห้าโมงเช้า"
    supported_engines = (
        "bert-base-th-cased-blackboard",
        "mdeberta-v3-ud-thai-pud-upos",
        "wangchanberta-ud-thai-pud-upos",
    )

    # Engines are checked in the same order as before; the first failure
    # still aborts the test.
    for engine_name in supported_engines:
        tagged = pos_tag_transformers(words=sentence, engine=engine_name)
        self.assertIsNotNone(tagged)

    with self.assertRaises(ValueError):
        pos_tag_transformers(words=sentence, engine="non-existing-engine")