diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index e83d0bb72..183b05790 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -__version__ = "3.0.7" +__version__ = "3.0.8" thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars diff --git a/pythainlp/tokenize/nercut.py b/pythainlp/tokenize/nercut.py index 27e4f023e..11f6e85b6 100644 --- a/pythainlp/tokenize/nercut.py +++ b/pythainlp/tokenize/nercut.py @@ -65,18 +65,15 @@ def segment( words.append(combining_word) combining_word = "" words.append(curr_word) - else: + else: # if tag is O combining_word = "" words.append(curr_word) if idx + 1 == len(tagged_words): - if ( - curr_tag.startswith("B-") or curr_tag == "O" - ) and combining_word != "": + if curr_tag.startswith("B-") and combining_word != "": words.append(combining_word) - combining_word = "" - words.append(curr_word) - else: # if tag is O - combining_word += curr_word + elif curr_tag.startswith("I-") and combining_word != "": words.append(combining_word) + else: + pass return words diff --git a/setup.cfg b/setup.cfg index 8d7a9fbf1..bea023df1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.7 +current_version = 3.0.8 commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))? 
diff --git a/setup.py b/setup.py index d65f4bfaa..f001f7682 100644 --- a/setup.py +++ b/setup.py @@ -105,7 +105,7 @@ setup( name="pythainlp", - version="3.0.7", + version="3.0.8", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 186cd5c4b..2ac282156 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -610,9 +610,14 @@ def test_nercut(self): self.assertEqual(nercut.segment(None), []) self.assertEqual(nercut.segment(""), []) self.assertIsNotNone(nercut.segment("ทดสอบ")) - self.assertIsNotNone(nercut.segment("ทุ๊กกโคนน")) - self.assertIsNotNone(nercut.segment("อือหือ")) - self.assertIsNotNone(nercut.segment("อย่าลืมอัพการ์ดนะจ๊ะ")) + self.assertEqual(nercut.segment("ทันแน่ๆ"), ['ทัน', 'แน่ๆ']) + self.assertEqual(nercut.segment("%1ครั้ง"), ['%', '1', 'ครั้ง']) + self.assertEqual(nercut.segment("ทุ๊กกโคนน"), ['ทุ๊กกโคนน']) + self.assertEqual(nercut.segment("อือหือ"), ['อือหือ']) + self.assertEqual( + nercut.segment("อย่าลืมอัพการ์ดนะจ๊ะ"), + ['อย่าลืมอัพการ์ดนะจ๊ะ'] + ) self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut")) def test_ssg(self):