diff --git a/docker_requirements.txt b/docker_requirements.txt index 4b7e3da72..9b531e197 100644 --- a/docker_requirements.txt +++ b/docker_requirements.txt @@ -28,3 +28,4 @@ pandas==0.24 tltk==1.3.8 OSKut==1.3 nlpo3==1.2.2 +thai-nner==0.3 diff --git a/docs/api/tag.rst b/docs/api/tag.rst index 8c544c0fc..e8769a225 100644 --- a/docs/api/tag.rst +++ b/docs/api/tag.rst @@ -232,6 +232,8 @@ Modules .. autofunction:: chunk_parse .. autoclass:: NER :members: +.. autoclass:: NNER + :members: .. autoclass:: pythainlp.tag.thainer.ThaiNameTagger :members: get_ner .. autofunction:: pythainlp.tag.tltk.get_ner diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index 71ca90df6..23275858d 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -13,10 +13,11 @@ "tag_provinces", "chunk_parse", "NER", + "NNER", ] from pythainlp.tag.locations import tag_provinces from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents from pythainlp.tag._tag_perceptron import PerceptronTagger from pythainlp.tag.chunk import chunk_parse -from pythainlp.tag.named_entity import NER +from pythainlp.tag.named_entity import NER, NNER diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index d745b4d3e..c05cea498 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -96,3 +96,71 @@ def tag( return self.engine.get_ner(text, tag=tag) else: return self.engine.get_ner(text, tag=tag, pos=pos) + + +class NNER: + """ + Nested Named Entity Recognition + + :param str engine: Nested Named entity recognizer engine + :param str corpus: corpus + + **Options for engine** + * *thai_nner* - Thai Nested NER engine + """ + def __init__(self, engine: str = "thai_nner") -> None: + self.load_engine(engine) + + def load_engine(self, engine: str = "thai_nner") -> None: + from pythainlp.tag.thai_nner import Thai_NNER + self.engine = Thai_NNER() + + def tag(self, text) -> Tuple[List[str], List[dict]]: + """ + This function tags nested named entities. 
+ + :param str text: text in Thai to be tagged + + :return: a tuple of the list of tokenized words and the list of NNER tag dicts. + :rtype: Tuple[List[str], List[dict]] + + :Example: + + >>> from pythainlp.tag.named_entity import NNER + >>> nner = NNER() + >>> nner.tag("แมวทำอะไรตอนห้าโมงเช้า") + ([ + '', + '', + 'แมว', + 'ทํา', + '', + 'อะไร', + 'ตอน', + '', + 'ห้า', + '', + 'โมง', + '', + 'เช้า', + '' + ], + [ + { + 'text': ['', 'ห้า'], + 'span': [7, 9], + 'entity_type': 'cardinal' + }, + { + 'text': ['', 'ห้า', '', 'โมง'], + 'span': [7, 11], + 'entity_type': 'time' + }, + { + 'text': ['', 'โมง'], + 'span': [9, 11], + 'entity_type': 'unit' + } + ]) + """ + return self.engine.tag(text) diff --git a/pythainlp/tag/thai_nner.py b/pythainlp/tag/thai_nner.py new file mode 100644 index 000000000..bc79048f0 --- /dev/null +++ b/pythainlp/tag/thai_nner.py @@ -0,0 +1,14 @@ +from typing import List, Tuple +from thai_nner import NNER +from pythainlp.corpus import get_corpus_path + + +class Thai_NNER: + def __init__( + self, + path_model=get_corpus_path('thai_nner', '1.0') + ) -> None: + self.model = NNER(path_model=path_model) + + def tag(self, text) -> Tuple[List[str], List[dict]]: + return self.model.get_tag(text) diff --git a/setup.py b/setup.py index 488ea8eaf..37c411872 100644 --- a/setup.py +++ b/setup.py @@ -80,6 +80,7 @@ "numpy>=1.16.1", "onnxruntime>=1.10.0" ], + "thai_nner": ["thai_nner"], "full": [ "PyYAML>=5.3.1", "attacut>=1.0.4", @@ -106,6 +107,7 @@ "oskut>=1.3", "nlpo3>=1.2.2", "onnxruntime>=1.10.0", + "thai_nner" ], } diff --git a/tests/test_tag.py b/tests/test_tag.py index 63e887747..90d3f0fc7 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -14,6 +14,7 @@ unigram, tltk, NER, + NNER, ) from pythainlp.tag.locations import tag_provinces from pythainlp.tag.thainer import ThaiNameTagger @@ -370,3 +371,7 @@ def test_NER_class(self): self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า", tag=True)) with self.assertRaises(ValueError): NER(engine="thainer", 
corpus="cat") + + def test_NNER_class(self): + nner = NNER() + self.assertIsNotNone(nner.tag("แมวทำอะไรตอนห้าโมงเช้า"))