Skip to content

Commit 884daa8

Browse files
authored
Merge pull request #668 from PyThaiNLP/3.0
Update dev base from 3.0 base
2 parents b9a0330 + e3b33b5 commit 884daa8

File tree

5 files changed

+21
-9
lines changed

5 files changed

+21
-9
lines changed

pythainlp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# -*- coding: utf-8 -*-
2-
__version__ = "3.0.6dev0"
2+
__version__ = "3.0.6"
33

44
thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars
55

pythainlp/tokenize/nercut.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def segment(
2525
"DATE",
2626
"TIME",
2727
],
28+
tagger=_thainer
2829
) -> List[str]:
2930
"""
3031
Dictionary-based maximal matching word segmentation, constrained with
@@ -33,18 +34,17 @@ def segment(
3334
3435
:param str text: text to be tokenized to words
3536
:param list taglist: a list of named-entity tags to be used
37+
:param class tagger: NER tagger engine
3638
:return: list of words, tokenized from the text
3739
"""
38-
if not text or not isinstance(text, str):
40+
if not isinstance(text, str):
3941
return []
4042

41-
global _thainer
42-
tagged_words = _thainer.tag(text, pos=False)
43+
tagged_words = tagger.tag(text, pos=False)
4344

4445
words = []
4546
combining_word = ""
46-
combining_word = ""
47-
for curr_word, curr_tag in tagged_words:
47+
for idx, (curr_word, curr_tag) in enumerate(tagged_words):
4848
if curr_tag != "O":
4949
tag = curr_tag[2:]
5050
else:
@@ -68,5 +68,15 @@ def segment(
6868
else:
6969
combining_word = ""
7070
words.append(curr_word)
71+
if idx + 1 == len(tagged_words):
72+
if (
73+
curr_tag.startswith("B-") or curr_tag == "O"
74+
) and combining_word != "":
75+
words.append(combining_word)
76+
combining_word = ""
77+
words.append(curr_word)
78+
else: # if tag is O
79+
combining_word += curr_word
80+
words.append(combining_word)
7181

7282
return words

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 3.0.5
2+
current_version = 3.0.6
33
commit = True
44
tag = True
55
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@
113113

114114
setup(
115115
name="pythainlp",
116-
version="3.0.6dev0",
116+
version="3.0.6",
117117
description="Thai Natural Language Processing library",
118118
long_description=readme,
119119
long_description_content_type="text/markdown",

tests/test_tokenize.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -610,7 +610,9 @@ def test_nercut(self):
610610
self.assertEqual(nercut.segment(None), [])
611611
self.assertEqual(nercut.segment(""), [])
612612
self.assertIsNotNone(nercut.segment("ทดสอบ"))
613-
self.assertIsNotNone(nercut.segment("ทดสอบ"))
613+
self.assertIsNotNone(nercut.segment("ทุ๊กกโคนน"))
614+
self.assertIsNotNone(nercut.segment("อือหือ"))
615+
self.assertIsNotNone(nercut.segment("อย่าลืมอัพการ์ดนะจ๊ะ"))
614616
self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut"))
615617

616618
def test_ssg(self):

0 commit comments

Comments
 (0)