@@ -25,6 +25,7 @@ def segment(
2525 "DATE" ,
2626 "TIME" ,
2727 ],
28+ tagger = _thainer
2829) -> List [str ]:
2930 """
3031 Dictionary-based maximal matching word segmentation, constrained with
@@ -33,18 +34,17 @@ def segment(
3334
3435 :param str text: text to be tokenized to words
3536 :param list taglist: a list of named-entity tags to be used
37+ :param class tagger: NER tagger engine
3638 :return: list of words, tokenized from the text
3739 """
38- if not text or not isinstance (text , str ):
40+ if not isinstance (text , str ):
3941 return []
4042
41- global _thainer
42- tagged_words = _thainer .tag (text , pos = False )
43+ tagged_words = tagger .tag (text , pos = False )
4344
4445 words = []
4546 combining_word = ""
46- combining_word = ""
47- for curr_word , curr_tag in tagged_words :
47+ for idx , (curr_word , curr_tag ) in enumerate (tagged_words ):
4848 if curr_tag != "O" :
4949 tag = curr_tag [2 :]
5050 else :
@@ -68,5 +68,15 @@ def segment(
6868 else :
6969 combining_word = ""
7070 words .append (curr_word )
71+ if idx + 1 == len (tagged_words ):
72+ if (
73+ curr_tag .startswith ("B-" ) or curr_tag == "O"
74+ ) and combining_word != "" :
75+ words .append (combining_word )
76+ combining_word = ""
77+ words .append (curr_word )
78+ else : # if tag is O
79+ combining_word += curr_word
80+ words .append (combining_word )
7181
7282 return words
0 commit comments