From d1321b1a7edd7f46349df0d3fbe25a569601dc27 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 16 Mar 2022 12:15:51 +0700 Subject: [PATCH 01/10] Save lst20ner --- pythainlp/tag/lst20ner.py | 48 +++++++++++++++++++++++++++++++++++++++ setup.py | 5 ++++ 2 files changed, 53 insertions(+) create mode 100644 pythainlp/tag/lst20ner.py diff --git a/pythainlp/tag/lst20ner.py b/pythainlp/tag/lst20ner.py new file mode 100644 index 000000000..5ba41ab9f --- /dev/null +++ b/pythainlp/tag/lst20ner.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +from typing import List +import sentencepiece as spm +import numpy as np +from onnxruntime import ( + InferenceSession, SessionOptions, GraphOptimizationLevel +) + +class lst20ner_onnx: + def __init__(self, providers: List[str] = ['CPUExecutionProvider']) -> None: + self.options = SessionOptions() + self.options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL + self.session = InferenceSession( + "onnx/model.onnx", sess_options=self.options, providers=providers + ) + self.session.disable_fallback() + self.outputs_name = self.session.get_outputs()[0].name + self.sp = spm.SentencePieceProcessor(model_file='cat-model/sentencepiece.bpe.model') + + def build_tokenizer(self, sent): + _t = [5]+[i+4 for i in self.sp.encode(sent)]+[6] + model_inputs = {} + model_inputs["input_ids"]=np.array([_t]) + model_inputs["attention_mask"]=np.array([[1]*len(_t)]) + return model_inputs + + def postprocess(self, logits_data): + logits_t = logits_data[0] + maxes = np.max(logits_t, axis=-1, keepdims=True) + shifted_exp = np.exp(logits_t - maxes) + scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + return scores + + def totag(self, post,sent): + tag= [] + _s=self.sp.EncodeAsPieces(sent) + for i in range(len(_s)): + tag.append( + ( + _s[i], + self.id2tag[str(list(post[i+1]).index(max(list(post[i+1]))))] + ) + ) + return tag + def get_ner(self, text: str): + _s=self.build_tokenizer(text) + logits = self.session.run(output_names=[self.outputs_name], input_feed=_s)[0] + return self.totag(self.postprocess(logits),text) \ No newline at end of file diff --git a/setup.py b/setup.py index 1dc67d129..979040edb 100644 --- a/setup.py +++ b/setup.py @@ -75,6 +75,11 @@ "tltk": ["tltk>=1.3.8"], "oskut": ["oskut>=1.3"], "nlpo3": ["nlpo3>=1.2.2"], + "onnx": [ + "sentencepiece>=0.1.91", + "numpy>=1.16.1", + "onnxruntime>=1.10.0" + ], "full": [ "PyYAML>=5.3.1", "attacut>=1.0.4", From 51c4659c136dc0751e8f1b54b7cfeeb359c746df Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 16 Mar 2022 21:07:41 +0700 Subject: [PATCH 02/10] Add onnxruntime to full --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 979040edb..29bbf51b1 100644 --- a/setup.py +++ b/setup.py @@ -105,6 +105,7 @@ "tltk>=1.3.8", "oskut>=1.3", "nlpo3>=1.2.2", + "onnxruntime>=1.10.0", ], } From f96b018072a54c0233a4eb8bea847c54e804d07f Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 16 Mar 2022 22:08:32 +0700 Subject: [PATCH 03/10] Add load file onnx --- pythainlp/corpus/__init__.py | 2 ++ pythainlp/corpus/core.py | 4 ++++ pythainlp/tag/lst20ner.py | 23 +++++++++++++++-------- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 9a2d2c812..cef277a28 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -27,6 +27,7 @@ "thai_syllables", "thai_words", "path_pythainlp_corpus", + "get_path_folder_corpus", ] import os @@ -84,6 
+85,7 @@ def corpus_db_path() -> str: get_corpus_db_detail, get_corpus_default_db, get_corpus_path, + get_path_folder_corpus, remove, path_pythainlp_corpus, ) # these imports must come before other pythainlp.corpus.* imports diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py index 74f976809..9e47b599f 100644 --- a/pythainlp/corpus/core.py +++ b/pythainlp/corpus/core.py @@ -522,3 +522,7 @@ def remove(name: str) -> bool: db.close() return False + + +def get_path_folder_corpus(name, version, *path): + return os.path.join(get_corpus_path(name, version), *path) \ No newline at end of file diff --git a/pythainlp/tag/lst20ner.py b/pythainlp/tag/lst20ner.py index 5ba41ab9f..428149df0 100644 --- a/pythainlp/tag/lst20ner.py +++ b/pythainlp/tag/lst20ner.py @@ -1,27 +1,34 @@ # -*- coding: utf-8 -*- from typing import List +import json import sentencepiece as spm import numpy as np from onnxruntime import ( InferenceSession, SessionOptions, GraphOptimizationLevel ) +from pythainlp.corpus import get_path_folder_corpus class lst20ner_onnx: def __init__(self, providers: List[str] = ['CPUExecutionProvider']) -> None: + self.model_name = "onnx_lst20ner" + self.model_version = "1.0" self.options = SessionOptions() self.options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL self.session = InferenceSession( - "onnx/model.onnx", sess_options=self.options, providers=providers + get_path_folder_corpus(self.model_name, self.model_version, "lst20-ner-model.onnx"), sess_options=self.options, providers=providers ) self.session.disable_fallback() self.outputs_name = self.session.get_outputs()[0].name - self.sp = spm.SentencePieceProcessor(model_file='cat-model/sentencepiece.bpe.model') - + self.sp = spm.SentencePieceProcessor(model_file=get_path_folder_corpus(self.model_name, self.model_version, "sentencepiece.bpe.model")) + with open(get_path_folder_corpus(self.model_name, self.model_version, "config.json"), encoding='utf-8-sig') as fh: + self._json = json.load(fh) + self.id2tag = self._json['id2label'] + def build_tokenizer(self, sent): _t = [5]+[i+4 for i in self.sp.encode(sent)]+[6] model_inputs = {} - model_inputs["input_ids"]=np.array([_t]) - model_inputs["attention_mask"]=np.array([[1]*len(_t)]) + model_inputs["input_ids"]=np.array([_t], dtype=np.int64) + model_inputs["attention_mask"]=np.array([[1]*len(_t)], dtype=np.int64) return model_inputs def postprocess(self, logits_data): @@ -43,6 +50,6 @@ def totag(self, post,sent): ) return tag def get_ner(self, text: str): - _s=self.build_tokenizer(text) - logits = self.session.run(output_names=[self.outputs_name], input_feed=_s)[0] - return self.totag(self.postprocess(logits),text) \ No newline at end of file + self._s=self.build_tokenizer(text) + logits = self.session.run(output_names=[self.outputs_name], input_feed=self._s)[0] + return self.totag(self.postprocess(logits), text) \ No newline at end of file From f472941069b8b830276a7ca2b38279a618f0b3ed Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 26 Apr 2022 13:03:38 +0700 Subject: [PATCH 04/10] Update PEP8 --- pythainlp/corpus/core.py | 2 +- pythainlp/tag/lst20ner.py | 52 ++++++++++++++++++++++++++++++--------- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py index 9e47b599f..0196acdbb 100644 --- a/pythainlp/corpus/core.py +++ b/pythainlp/corpus/core.py @@ -525,4 +525,4 @@ def remove(name: str) -> bool: def get_path_folder_corpus(name, version, *path): - return 
os.path.join(get_corpus_path(name, version), *path) \ No newline at end of file + return os.path.join(get_corpus_path(name, version), *path) diff --git a/pythainlp/tag/lst20ner.py b/pythainlp/tag/lst20ner.py index 428149df0..be48d4e7b 100644 --- a/pythainlp/tag/lst20ner.py +++ b/pythainlp/tag/lst20ner.py @@ -8,6 +8,7 @@ ) from pythainlp.corpus import get_path_folder_corpus + class lst20ner_onnx: def __init__(self, providers: List[str] = ['CPUExecutionProvider']) -> None: self.model_name = "onnx_lst20ner" @@ -15,20 +16,41 @@ def __init__(self, providers: List[str] = ['CPUExecutionProvider']) -> None: self.options = SessionOptions() self.options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL self.session = InferenceSession( - get_path_folder_corpus(self.model_name, self.model_version, "lst20-ner-model.onnx"), sess_options=self.options, providers=providers + get_path_folder_corpus( + self.model_name, + self.model_version, + "lst20-ner-model.onnx" + ), + sess_options=self.options, + providers=providers ) self.session.disable_fallback() self.outputs_name = self.session.get_outputs()[0].name - self.sp = spm.SentencePieceProcessor(model_file=get_path_folder_corpus(self.model_name, self.model_version, "sentencepiece.bpe.model")) - with open(get_path_folder_corpus(self.model_name, self.model_version, "config.json"), encoding='utf-8-sig') as fh: + self.sp = spm.SentencePieceProcessor( + model_file=get_path_folder_corpus( + self.model_name, + self.model_version, + "sentencepiece.bpe.model" + ) + ) + with open( + get_path_folder_corpus( + self.model_name, + self.model_version, + "config.json" + ), + encoding='utf-8-sig' + ) as fh: self._json = json.load(fh) self.id2tag = self._json['id2label'] def build_tokenizer(self, sent): _t = [5]+[i+4 for i in self.sp.encode(sent)]+[6] model_inputs = {} - model_inputs["input_ids"]=np.array([_t], dtype=np.int64) - model_inputs["attention_mask"]=np.array([[1]*len(_t)], dtype=np.int64) + model_inputs["input_ids"] = np.array([_t], dtype=np.int64) + model_inputs["attention_mask"] = np.array( + [[1]*len(_t)], dtype=np.int64 + ) return model_inputs def postprocess(self, logits_data): @@ -37,19 +59,25 @@ def postprocess(self, logits_data): shifted_exp = np.exp(logits_t - maxes) scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) return scores - - def totag(self, post,sent): - tag= [] - _s=self.sp.EncodeAsPieces(sent) + + def totag(self, post, sent): + tag = [] + _s = self.sp.EncodeAsPieces(sent) for i in range(len(_s)): tag.append( ( _s[i], - self.id2tag[str(list(post[i+1]).index(max(list(post[i+1]))))] + self.id2tag[ + str(list(post[i+1]).index(max(list(post[i+1])))) + ] ) ) return tag + def get_ner(self, text: str): self._s=self.build_tokenizer(text) - logits = self.session.run(output_names=[self.outputs_name], input_feed=self._s)[0] - return self.totag(self.postprocess(logits), text) \ No newline at end of file + logits = self.session.run( + output_names=[self.outputs_name], + input_feed=self._s + )[0] + return self.totag(self.postprocess(logits), text) From bdfc57f52db5b0328a13c2d28d757970d3e3b08c Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 26 Apr 2022 13:14:14 +0700 Subject: [PATCH 05/10] Clean output from lst20ner_onnx --- pythainlp/tag/lst20ner.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pythainlp/tag/lst20ner.py b/pythainlp/tag/lst20ner.py index be48d4e7b..5f5d2017b 100644 --- a/pythainlp/tag/lst20ner.py +++ b/pythainlp/tag/lst20ner.py @@ -60,6 +60,16 @@ def 
postprocess(self, logits_data): scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) return scores + def clean_output(self, list_text): + new_list = [] + for i,j in list_text: + if i.startswith("▁") and i != '▁': + i = i.replace("▁", "", 1) + elif i == '▁': + i = " " + new_list.append((i, j)) + return new_list + def totag(self, post, sent): tag = [] _s = self.sp.EncodeAsPieces(sent) @@ -75,9 +85,9 @@ def totag(self, post, sent): return tag def get_ner(self, text: str): - self._s=self.build_tokenizer(text) + self._s = self.build_tokenizer(text) logits = self.session.run( output_names=[self.outputs_name], input_feed=self._s )[0] - return self.totag(self.postprocess(logits), text) + return self.clean_output(self.totag(self.postprocess(logits), text)) From e611444246c209bd8e4121f662f93b09f137f5b4 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 26 Apr 2022 13:14:54 +0700 Subject: [PATCH 06/10] Update PEP8 --- pythainlp/tag/lst20ner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/tag/lst20ner.py b/pythainlp/tag/lst20ner.py index 5f5d2017b..1f6a3ec89 100644 --- a/pythainlp/tag/lst20ner.py +++ b/pythainlp/tag/lst20ner.py @@ -62,7 +62,7 @@ def postprocess(self, logits_data): def clean_output(self, list_text): new_list = [] - for i,j in list_text: + for i, j in list_text: if i.startswith("▁") and i != '▁': i = i.replace("▁", "", 1) elif i == '▁': From fd972d6f9af4af25561ebd41f72eb4b118909a42 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 26 Apr 2022 13:24:05 +0700 Subject: [PATCH 07/10] Move ONNX to class --- pythainlp/tag/lst20ner.py | 94 +++-------------------------- pythainlp/tag/wangchanberta_onnx.py | 92 ++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 87 deletions(-) create mode 100644 pythainlp/tag/wangchanberta_onnx.py diff --git a/pythainlp/tag/lst20ner.py b/pythainlp/tag/lst20ner.py index 1f6a3ec89..d72595fe6 100644 --- a/pythainlp/tag/lst20ner.py +++ b/pythainlp/tag/lst20ner.py @@ -1,93 +1,13 @@ # -*- coding: utf-8 -*- from typing import List -import json -import sentencepiece as spm -import numpy as np -from onnxruntime import ( - InferenceSession, SessionOptions, GraphOptimizationLevel -) -from pythainlp.corpus import get_path_folder_corpus +from pythainlp.tag.wangchanberta_onnx import WngchanBerta_ONNX -class lst20ner_onnx: +class LST20_ONNX(WngchanBerta_ONNX): def __init__(self, providers: List[str] = ['CPUExecutionProvider']) -> None: - self.model_name = "onnx_lst20ner" - self.model_version = "1.0" - self.options = SessionOptions() - self.options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL - self.session = InferenceSession( - get_path_folder_corpus( - self.model_name, - self.model_version, - "lst20-ner-model.onnx" - ), - sess_options=self.options, - providers=providers + WngchanBerta_ONNX.__init__( + self, + model_name="onnx_lst20ner", + model_version="1.0", + file_onnx="lst20-ner-model.onnx", providers=providers ) - self.session.disable_fallback() - self.outputs_name = self.session.get_outputs()[0].name - self.sp = spm.SentencePieceProcessor( - model_file=get_path_folder_corpus( - self.model_name, - self.model_version, - "sentencepiece.bpe.model" - ) - ) - with open( - get_path_folder_corpus( - self.model_name, - self.model_version, - "config.json" - ), - encoding='utf-8-sig' - ) as fh: - self._json = json.load(fh) - self.id2tag = self._json['id2label'] - - def build_tokenizer(self, sent): - _t = [5]+[i+4 for i in self.sp.encode(sent)]+[6] - model_inputs = {} - 
model_inputs["input_ids"] = np.array([_t], dtype=np.int64) - model_inputs["attention_mask"] = np.array( - [[1]*len(_t)], dtype=np.int64 - ) - return model_inputs - - def postprocess(self, logits_data): - logits_t = logits_data[0] - maxes = np.max(logits_t, axis=-1, keepdims=True) - shifted_exp = np.exp(logits_t - maxes) - scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) - return scores - - def clean_output(self, list_text): - new_list = [] - for i, j in list_text: - if i.startswith("▁") and i != '▁': - i = i.replace("▁", "", 1) - elif i == '▁': - i = " " - new_list.append((i, j)) - return new_list - - def totag(self, post, sent): - tag = [] - _s = self.sp.EncodeAsPieces(sent) - for i in range(len(_s)): - tag.append( - ( - _s[i], - self.id2tag[ - str(list(post[i+1]).index(max(list(post[i+1])))) - ] - ) - ) - return tag - - def get_ner(self, text: str): - self._s = self.build_tokenizer(text) - logits = self.session.run( - output_names=[self.outputs_name], - input_feed=self._s - )[0] - return self.clean_output(self.totag(self.postprocess(logits), text)) diff --git a/pythainlp/tag/wangchanberta_onnx.py b/pythainlp/tag/wangchanberta_onnx.py new file mode 100644 index 000000000..28d1dee17 --- /dev/null +++ b/pythainlp/tag/wangchanberta_onnx.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +from typing import List +import json +import sentencepiece as spm +import numpy as np +from onnxruntime import ( + InferenceSession, SessionOptions, GraphOptimizationLevel +) +from pythainlp.corpus import get_path_folder_corpus + +class WngchanBerta_ONNX: + def __init__(self, model_name: str, model_version:str, file_onnx: str, providers: List[str] = ['CPUExecutionProvider']) -> None: + self.model_name = model_name + self.model_version = model_version + self.options = SessionOptions() + self.options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL + self.session = InferenceSession( + get_path_folder_corpus( + self.model_name, + self.model_version, + file_onnx + ), + sess_options=self.options, + providers=providers + ) + self.session.disable_fallback() + self.outputs_name = self.session.get_outputs()[0].name + self.sp = spm.SentencePieceProcessor( + model_file=get_path_folder_corpus( + self.model_name, + self.model_version, + "sentencepiece.bpe.model" + ) + ) + with open( + get_path_folder_corpus( + self.model_name, + self.model_version, + "config.json" + ), + encoding='utf-8-sig' + ) as fh: + self._json = json.load(fh) + self.id2tag = self._json['id2label'] + + def build_tokenizer(self, sent): + _t = [5]+[i+4 for i in self.sp.encode(sent)]+[6] + model_inputs = {} + model_inputs["input_ids"] = np.array([_t], dtype=np.int64) + model_inputs["attention_mask"] = np.array( + [[1]*len(_t)], dtype=np.int64 + ) + return model_inputs + + def postprocess(self, logits_data): + logits_t = logits_data[0] + maxes = np.max(logits_t, axis=-1, keepdims=True) + shifted_exp = np.exp(logits_t - maxes) + scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + return scores + + def clean_output(self, list_text): + new_list = [] + for i, j in list_text: + if i.startswith("▁") and i != '▁': + i = i.replace("▁", "", 1) + elif i == '▁': + i = " " + new_list.append((i, j)) + return new_list + + def totag(self, post, sent): + tag = [] + _s = self.sp.EncodeAsPieces(sent) + for i in range(len(_s)): + tag.append( + ( + _s[i], + self.id2tag[ + str(list(post[i+1]).index(max(list(post[i+1])))) + ] + ) + ) + return tag + + def get_ner(self, text: str): + self._s = self.build_tokenizer(text) + logits = 
self.session.run( + output_names=[self.outputs_name], + input_feed=self._s + )[0] + return self.clean_output(self.totag(self.postprocess(logits), text)) From dc11a9ba6e71c6b30f81c641d54f399532ecf8bd Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 26 Apr 2022 13:25:41 +0700 Subject: [PATCH 08/10] Fixed PEP8 --- pythainlp/tag/lst20ner.py | 3 ++- pythainlp/tag/wangchanberta_onnx.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pythainlp/tag/lst20ner.py b/pythainlp/tag/lst20ner.py index d72595fe6..bda84bd27 100644 --- a/pythainlp/tag/lst20ner.py +++ b/pythainlp/tag/lst20ner.py @@ -9,5 +9,6 @@ def __init__(self, providers: List[str] = ['CPUExecutionProvider']) -> None: self, model_name="onnx_lst20ner", model_version="1.0", - file_onnx="lst20-ner-model.onnx", providers=providers + file_onnx="lst20-ner-model.onnx", + providers=providers ) diff --git a/pythainlp/tag/wangchanberta_onnx.py b/pythainlp/tag/wangchanberta_onnx.py index 28d1dee17..da8eeff14 100644 --- a/pythainlp/tag/wangchanberta_onnx.py +++ b/pythainlp/tag/wangchanberta_onnx.py @@ -8,8 +8,9 @@ ) from pythainlp.corpus import get_path_folder_corpus + class WngchanBerta_ONNX: - def __init__(self, model_name: str, model_version:str, file_onnx: str, providers: List[str] = ['CPUExecutionProvider']) -> None: + def __init__(self, model_name: str, model_version: str, file_onnx: str, providers: List[str] = ['CPUExecutionProvider']) -> None: self.model_name = model_name self.model_version = model_version self.options = SessionOptions() From 6efee29b27c8c8f0a6f873b3832a77c4e86a255b Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 26 Apr 2022 14:21:36 +0700 Subject: [PATCH 09/10] Add lst20_onnx to NER class --- pythainlp/tag/lst20_ner_onnx.py | 32 +++++++++++++++++++++++ pythainlp/tag/lst20ner.py | 14 ----------- pythainlp/tag/named_entity.py | 6 ++++- pythainlp/tag/wangchanberta_onnx.py | 39 +++++++++++++++++++++-------- tests/test_tag.py | 3 +++ 5 files changed, 69 insertions(+), 25 deletions(-) create mode 100644 pythainlp/tag/lst20_ner_onnx.py delete mode 100644 pythainlp/tag/lst20ner.py diff --git a/pythainlp/tag/lst20_ner_onnx.py b/pythainlp/tag/lst20_ner_onnx.py new file mode 100644 index 000000000..27472e21e --- /dev/null +++ b/pythainlp/tag/lst20_ner_onnx.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +from typing import List +from pythainlp.tag.wangchanberta_onnx import WngchanBerta_ONNX + + +class LST20_NER_ONNX(WngchanBerta_ONNX): + def __init__(self, providers: List[str] = ['CPUExecutionProvider']) -> None: + WngchanBerta_ONNX.__init__( + self, + model_name="onnx_lst20ner", + model_version="1.0", + file_onnx="lst20-ner-model.onnx", + providers=providers + ) + + def clean_output(self, list_text): + new_list = [] + if list_text[0][0] == "▁": + list_text = list_text[1:] + for i, j in list_text: + if i.startswith("▁") and i != '▁': + i = i.replace("▁", "", 1) + elif i == '▁': + i = " " + new_list.append((i, j)) + return list_text + + def _config(self, list_ner): + _n = [] + for i,j in list_ner: + _n.append((i,j.replace('E_', 'I_').replace('_', '-'))) + return _n diff --git a/pythainlp/tag/lst20ner.py b/pythainlp/tag/lst20ner.py deleted file mode 100644 index bda84bd27..000000000 --- a/pythainlp/tag/lst20ner.py +++ /dev/null @@ -1,14 +0,0 @@ -# -*- coding: utf-8 -*- -from typing import List -from pythainlp.tag.wangchanberta_onnx import WngchanBerta_ONNX - - -class LST20_ONNX(WngchanBerta_ONNX): - def __init__(self, providers: List[str] = ['CPUExecutionProvider']) -> None: - 
WngchanBerta_ONNX.__init__( - self, - model_name="onnx_lst20ner", - model_version="1.0", - file_onnx="lst20-ner-model.onnx", - providers=providers - ) diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index 8a0f1b842..d745b4d3e 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -16,6 +16,7 @@ class NER: **Options for engine** * *thainer* - Thai NER engine * *wangchanberta* - wangchanberta model + * *lst20_onnx* - LST20 NER model by wangchanberta with ONNX runtime * *tltk* - wrapper for `TLTK `_. **Options for corpus** @@ -33,6 +34,9 @@ def load_engine(self, engine: str, corpus: str) -> None: if engine == "thainer" and corpus == "thainer": from pythainlp.tag.thainer import ThaiNameTagger self.engine = ThaiNameTagger() + elif engine == "lst20_onnx": + from pythainlp.tag.lst20_ner_onnx import LST20_NER_ONNX + self.engine = LST20_NER_ONNX() elif engine == "wangchanberta": from pythainlp.wangchanberta import ThaiNameTagger self.engine = ThaiNameTagger(dataset_name=corpus) @@ -88,7 +92,7 @@ def tag( """wangchanberta is not support part-of-speech tag. It have not part-of-speech tag in output.""" ) - if self.name_engine == "wangchanberta": + if self.name_engine == "wangchanberta" or self.name_engine == "lst20_onnx": return self.engine.get_ner(text, tag=tag) else: return self.engine.get_ner(text, tag=tag, pos=pos) diff --git a/pythainlp/tag/wangchanberta_onnx.py b/pythainlp/tag/wangchanberta_onnx.py index da8eeff14..ae619918d 100644 --- a/pythainlp/tag/wangchanberta_onnx.py +++ b/pythainlp/tag/wangchanberta_onnx.py @@ -61,14 +61,7 @@ def postprocess(self, logits_data): return scores def clean_output(self, list_text): - new_list = [] - for i, j in list_text: - if i.startswith("▁") and i != '▁': - i = i.replace("▁", "", 1) - elif i == '▁': - i = " " - new_list.append((i, j)) - return new_list + return list_text def totag(self, post, sent): tag = [] @@ -84,10 +77,36 @@ def totag(self, post, sent): ) return tag - def get_ner(self, text: str): + def _config(self, list_ner): + return list_ner + + def get_ner(self, text: str, tag: bool = False): self._s = self.build_tokenizer(text) logits = self.session.run( output_names=[self.outputs_name], input_feed=self._s )[0] - return self.clean_output(self.totag(self.postprocess(logits), text)) + _tag = self.clean_output(self.totag(self.postprocess(logits), text)) + if tag: + _tag = self._config(_tag) + temp = "" + sent = "" + for idx, (word, ner) in enumerate(_tag): + if ner.startswith("B-") and temp != "": + sent += "" + temp = ner[2:] + sent += "<" + temp + ">" + elif ner.startswith("B-"): + temp = ner[2:] + sent += "<" + temp + ">" + elif ner == "O" and temp != "": + sent += "" + temp = "" + sent += word + + if idx == len(_tag) - 1 and temp != "": + sent += "" + + return sent + else: + return _tag diff --git a/tests/test_tag.py b/tests/test_tag.py index 536f65eae..63e887747 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -361,6 +361,9 @@ def test_NER_class(self): self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า")) self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า", pos=False)) self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า", tag=True)) + ner = NER(engine="lst20_onnx") + self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า")) + self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า", tag=True)) ner = NER(engine="tltk") self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า")) self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า", pos=False)) From 60fcd9643c33c74baf5b10a5e7fd9795ecd72d0a 
Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 26 Apr 2022 14:25:29 +0700 Subject: [PATCH 10/10] Fixed wrong code --- pythainlp/tag/lst20_ner_onnx.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pythainlp/tag/lst20_ner_onnx.py b/pythainlp/tag/lst20_ner_onnx.py index 27472e21e..eaa7f2ab7 100644 --- a/pythainlp/tag/lst20_ner_onnx.py +++ b/pythainlp/tag/lst20_ner_onnx.py @@ -23,10 +23,10 @@ def clean_output(self, list_text): elif i == '▁': i = " " new_list.append((i, j)) - return list_text + return new_list def _config(self, list_ner): _n = [] - for i,j in list_ner: - _n.append((i,j.replace('E_', 'I_').replace('_', '-'))) + for i, j in list_ner: + _n.append((i, j.replace('E_', 'I_').replace('_', '-'))) return _n
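
A minimal usage sketch for the API added by this patch series. The engine name, class names, and test sentence are taken from the patches above; the sketch assumes the "onnx_lst20ner" model files (the ONNX model, sentencepiece.bpe.model, and config.json) are available through PyThaiNLP's corpus loader and that the optional "onnx" extras (sentencepiece, numpy, onnxruntime) declared in setup.py are installed.

# High-level wrapper, as exercised in tests/test_tag.py
from pythainlp.tag import NER

ner = NER(engine="lst20_onnx")
# Returns (subword, tag) pairs, with SentencePiece "▁" markers cleaned up
print(ner.tag("แมวทำอะไรตอนห้าโมงเช้า"))
# tag=True instead returns the sentence with entities wrapped in inline markup
print(ner.tag("แมวทำอะไรตอนห้าโมงเช้า", tag=True))

# Lower-level class, useful e.g. to pass a different onnxruntime provider
from pythainlp.tag.lst20_ner_onnx import LST20_NER_ONNX

tagger = LST20_NER_ONNX(providers=["CPUExecutionProvider"])
print(tagger.get_ner("แมวทำอะไรตอนห้าโมงเช้า"))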