     CamembertTokenizer,
     pipeline,
 )
+import warnings
+from pythainlp.tokenize import word_tokenize
 
 _model_name = "wangchanberta-base-att-spm-uncased"
 _tokenizer = CamembertTokenizer.from_pretrained(
@@ -48,7 +50,7 @@ def _clear_tag(self, tag):
         return tag.replace("B-", "").replace("I-", "")
 
     def get_ner(
-        self, text: str, tag: bool = False
+        self, text: str, pos: bool = False, tag: bool = False
     ) -> Union[List[Tuple[str, str]], str]:
         """
         This function tags named entities from text in IOB format.
@@ -64,6 +66,8 @@ def get_ner(
                  word and NER tag
         :rtype: Union[List[Tuple[str, str]], str]
         """
+        if pos:
+            warnings.warn(
+                "This model does not support POS tagging, so no POS tags are returned."
+            )
         text = re.sub(" ", "<_>", text)
         self.json_ner = self.classify_tokens(text)
         self.output = ""
@@ -121,6 +125,86 @@ def get_ner(
             return self.sent_ner
 
 
+class NamedEntityRecognition:
+    def __init__(self, model: str = "pythainlp/thainer-corpus-v2-base-model") -> None:
+        """
+        This class tags named entities in text in IOB format.
+
+        Powered by wangchanberta from VISTEC-depa\
+            AI Research Institute of Thailand
+
+        :param str model: a token-classification model fine-tuned from \
+            the wangchanberta pretrained model.
+        """
+        from transformers import AutoModelForTokenClassification, AutoTokenizer
+
+        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.model = AutoModelForTokenClassification.from_pretrained(model)
+
+    def _fix_span_error(self, words, ner):
+        # Decode each sub-token id back to text, drop special tokens,
+        # and pair the remaining tokens with their predicted tags.
+        _new_tag = []
+        for i, j in zip(words, ner):
+            i = self.tokenizer.decode(i)
+            if i.isspace() and j.startswith("B-"):
+                j = "O"
+            if i == "" or i == "<s>" or i == "</s>":
+                continue
+            if i == "<_>":
+                i = " "
+            _new_tag.append((i, j))
+        return _new_tag
+
+    def get_ner(
+        self, text: str, pos: bool = False, tag: bool = False
+    ) -> Union[List[Tuple[str, str]], str]:
+        """
+        This function tags named entities from text in IOB format.
+
+        Powered by wangchanberta from VISTEC-depa\
+            AI Research Institute of Thailand
+
+        :param str text: text in Thai to be tagged
+        :param bool pos: unsupported; this model does not output POS tags
+        :param bool tag: output like html tag.
+        :return: a list of tuple associated with tokenized word group, NER tag, \
+                 and output like html tag (if the parameter `tag` is \
+                 specified as `True`). \
+                 Otherwise, return a list of tuple associated with tokenized \
+                 word and NER tag
+        :rtype: Union[List[Tuple[str, str]], str]
+        """
+        import torch
+
+        if pos:
+            warnings.warn(
+                "This model does not support POS tagging, so no POS tags are returned."
+            )
+        words_token = word_tokenize(text.replace(" ", "<_>"))
+        inputs = self.tokenizer(
+            words_token, is_split_into_words=True, return_tensors="pt"
+        )
+        ids = inputs["input_ids"]
+        mask = inputs["attention_mask"]
+        # forward pass
+        outputs = self.model(ids, attention_mask=mask)
+        logits = outputs[0]
+        predictions = torch.argmax(logits, dim=2)
+        # map predicted class ids back to label strings
+        predicted_token_class = [
+            self.model.config.id2label[t.item()] for t in predictions[0]
+        ]
+        ner_tag = self._fix_span_error(inputs["input_ids"][0], predicted_token_class)
+        if tag:
+            # wrap each entity span in <TAG>...</TAG> markers
+            temp = ""
+            sent = ""
+            for idx, (word, ner) in enumerate(ner_tag):
+                if ner.startswith("B-") and temp != "":
+                    sent += "</" + temp + ">"
+                    temp = ner[2:]
+                    sent += "<" + temp + ">"
+                elif ner.startswith("B-"):
+                    temp = ner[2:]
+                    sent += "<" + temp + ">"
+                elif ner == "O" and temp != "":
+                    sent += "</" + temp + ">"
+                    temp = ""
+                sent += word
+
+                if idx == len(ner_tag) - 1 and temp != "":
+                    sent += "</" + temp + ">"
+
+            return sent
+        return ner_tag
+
+
 def segment(text: str) -> List[str]:
     """
     Subword tokenize. SentencePiece from wangchanberta model.
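A minimal usage sketch for the added NamedEntityRecognition class, assuming it is exposed from pythainlp.wangchanberta and that transformers, torch, and a Thai word tokenizer are installed; the sample sentence is only illustrative:

    from pythainlp.wangchanberta import NamedEntityRecognition

    # defaults to the pythainlp/thainer-corpus-v2-base-model checkpoint
    ner = NamedEntityRecognition()

    text = "วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น."

    # list of (token, IOB tag) tuples
    print(ner.get_ner(text))

    # HTML-like tagged string instead of tuples
    print(ner.get_ner(text, tag=True))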