Skip to content

Commit e5ec352

Browse files
authored
Merge pull request #599 from PyThaiNLP/add-tltk
Add tltk
2 parents 2e9ac04 + f7d99eb commit e5ec352

File tree

17 files changed

+326
-9
lines changed

17 files changed

+326
-9
lines changed

docker_requirements.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,6 @@ fairseq==0.10.2
2323
pyicu==2.6
2424
deepcut==0.7.0.0
2525
h5py==2.10.0
26-
tensorflow==2.4.2
27-
pandas==0.24
26+
tensorflow==2.4.0
27+
pandas==0.24
28+
tltk==1.3.8

docs/api/tag.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@ Modules
232232
.. autofunction:: chunk_parse
233233
.. autoclass:: pythainlp.tag.named_entity.ThaiNameTagger
234234
:members: get_ner
235+
.. autofunction:: pythainlp.tag.tltk.get_ner
235236

236237
Tagger Engines
237238
--------------

docs/notes/installation.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ where ``extras`` can be
2727
- ``mt5`` (to mt5 models for Thai text summarizer)
2828
- ``wordnet`` (to support wordnet)
2929
- ``spell`` (to support phunspell & symspellpy)
30+
- ``tltk`` (to support tltk)
3031
- ``full`` (install everything)
3132

3233
For dependency details, look at `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.

pythainlp/spell/core.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def spell(word: str, engine: str = "pn") -> List[str]:
2222
* *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
2323
* *phunspell* - A spell checker utilizing spylls a port of Hunspell.
2424
* *symspellpy* - symspellpy is a Python port of SymSpell v6.5.
25+
* *tltk* - wrapper for `TLTK <https://pypi.org/project/tltk/>`_.,
2526
2627
:return: list of possible correct words within 1 or 2 edit distance and
2728
sorted by frequency of word occurrences in the spelling dictionary
@@ -39,6 +40,9 @@ def spell(word: str, engine: str = "pn") -> List[str]:
3940
spell("เส้นตรบ")
4041
# output: ['เส้นตรง']
4142
43+
spell("เส้นตรบ", engine="tltk")
44+
# output: ['เส้นตรง']
45+
4246
spell("ครัช")
4347
# output: ['ครับ', 'ครัว', 'รัช', 'ครัม', 'ครัน', 'วรัช', 'ครัส',
4448
# 'ปรัช', 'บรัช', 'ครัง', 'คัช', 'คลัช', 'ครัย', 'ครัด']
@@ -58,6 +62,9 @@ def spell(word: str, engine: str = "pn") -> List[str]:
5862
elif engine == "symspellpy":
5963
from pythainlp.spell.symspellpy import spell as SPELL_CHECKER
6064
text_correct = SPELL_CHECKER(word)
65+
elif engine == "tltk":
66+
from pythainlp.spell.tltk import spell as SPELL_CHECKER
67+
text_correct = SPELL_CHECKER(word)
6168
else:
6269
text_correct = DEFAULT_SPELL_CHECKER.spell(word)
6370

pythainlp/spell/tltk.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from tltk.nlp import spell_candidates
2+
from typing import List
3+
4+
5+
def spell(text: str) -> List[str]:
    """
    Return a list of spelling-correction candidates for *text*
    from TLTK's ``spell_candidates``.

    :param str text: Thai word to check
    :return: list of candidate corrections (empty list for empty
        or non-string input)
    :rtype: List[str]
    """
    # Guard empty/non-string input instead of handing junk to tltk,
    # consistent with the other tltk wrappers (pythainlp/tokenize/tltk.py).
    if not text or not isinstance(text, str):
        return []
    return spell_candidates(text)

pythainlp/tag/pos_tag.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ def pos_tag(
1515
* *wangchanberta* - wangchanberta model (support lst20 corpus only \
1616
and it supports a string only. if you input a list of word, \
1717
it will convert list word to a string.
18+
* *tltk* - TLTK: Thai Language Toolkit (support TNC corpus only.\
19+
if you choose another corpus, it will be changed to the TNC corpus.)
1820
:param str corpus:
1921
the corpus that used to create the language model for tagger
2022
* *lst20* - `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \
@@ -28,6 +30,7 @@ def pos_tag(
2830
* *pud* - `Parallel Universal Dependencies (PUD)\
2931
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
3032
treebanks, natively use Universal POS tags
33+
* *tnc* - Thai National Corpus (support tltk engine only)
3134
:return: a list of tuples (word, POS tag)
3235
:rtype: list[tuple[str, str]]
3336
@@ -89,13 +92,25 @@ def pos_tag(
8992
if not words:
9093
return []
9194

92-
if engine == "perceptron":
95+
_support_corpus = ["lst20", "lst20_ud", "orchid", "orchid_ud", "pud"]
96+
97+
if engine == "perceptron" and corpus in _support_corpus:
9398
from pythainlp.tag.perceptron import tag as tag_
9499
elif engine == "wangchanberta" and corpus == "lst20":
95100
from pythainlp.wangchanberta.postag import pos_tag as tag_
96101
words = ''.join(words)
97-
else: # default, use "unigram" ("old") engine
102+
elif engine == "tltk":
103+
from pythainlp.tag.tltk import pos_tag as tag_
104+
corpus = "tnc"
105+
elif engine == "unigram" and corpus in _support_corpus: # default
98106
from pythainlp.tag.unigram import tag as tag_
107+
else:
108+
raise ValueError(
109+
"pos_tag not support {0} engine or {1} corpus.".format(
110+
engine,
111+
corpus
112+
)
113+
)
99114

100115
word_tags = tag_(words, corpus=corpus)
101116

@@ -114,6 +129,9 @@ def pos_tag_sents(
114129
:param str engine:
115130
* *perceptron* - perceptron tagger (default)
116131
* *unigram* - unigram tagger
132+
* *wangchanberta* - wangchanberta model (support lst20 corpus only)
133+
* *tltk* - TLTK: Thai Language Toolkit (support TNC corpus only.\
134+
if you choose another corpus, it will be changed to the TNC corpus.)
117135
:param str corpus:
118136
the corpus that used to create the language model for tagger
119137
* *lst20* - `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \
@@ -127,6 +145,7 @@ def pos_tag_sents(
127145
* *pud* - `Parallel Universal Dependencies (PUD)\
128146
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
129147
treebanks, natively use Universal POS tags
148+
* *tnc* - Thai National Corpus (support tltk engine only)
130149
:return: a list of lists of tuples (word, POS tag)
131150
:rtype: list[list[tuple[str, str]]]
132151

pythainlp/tag/tltk.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# -*- coding: utf-8 -*-
2+
from typing import List, Tuple, Union
3+
from tltk import nlp
4+
from pythainlp.tokenize import word_tokenize
5+
6+
nlp.pos_load()
7+
nlp.ner_load()
8+
9+
10+
def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]:
    """
    Part-of-speech tagger from TLTK (TNC corpus only).

    :param list words: list of tokenized words
    :param str corpus: corpus name; only ``tnc`` is supported
    :return: list of (word, POS tag) tuples
    :rtype: List[Tuple[str, str]]
    :raises ValueError: if *corpus* is not ``tnc``
    """
    if corpus != "tnc":
        # Bug fix: the original passed the literal 0 to .format(),
        # so the error message never named the rejected corpus.
        raise ValueError("tltk not support {0} corpus.".format(corpus))
    # Return early on empty input, mirroring pythainlp.tag.pos_tag().
    if not words:
        return []
    return nlp.pos_tag_wordlist(words)
14+
15+
16+
def _post_process(text: str) -> str:
17+
return text.replace("<s/>", " ")
18+
19+
20+
def get_ner(
    text: str,
    pos: bool = True,
    tag: bool = False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
    """
    Named-entity recognizer from **TLTK**

    This function tags named entities in text in IOB format.

    :param str text: text in Thai to be tagged
    :param bool pos: To include POS tags in the results (`True`) or
                     exclude (`False`). The default value is `True`
    :param bool tag: output like html tag.
    :return: a list of tuple associated with tokenized word, NER tag,
             POS tag (if the parameter `pos` is specified as `True`),
             and output like html tag (if the parameter `tag` is
             specified as `True`).
             Otherwise, return a list of tuple associated with tokenized
             word and NER tag
    :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str

    :Example:

        >>> from pythainlp.tag.tltk import get_ner
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง")
        [('เขา', 'PRON', 'O'),
        ('เรียน', 'VERB', 'O'),
        ('ที่', 'SCONJ', 'O'),
        ('โรงเรียน', 'NOUN', 'B-L'),
        ('นางรอง', 'VERB', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", pos=False)
        [('เขา', 'O'),
        ('เรียน', 'O'),
        ('ที่', 'O'),
        ('โรงเรียน', 'B-L'),
        ('นางรอง', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", tag=True)
        'เขาเรียนที่<L>โรงเรียนนางรอง</L>'
    """
    if not text:
        return []
    # TLTK represents a space as the marker "<s/>", so substitute it
    # in the token stream before POS-tagging.
    list_word = []
    for i in word_tokenize(text, engine="tltk"):
        if i == " ":
            i = "<s/>"
        list_word.append(i)
    _pos = nlp.pos_tag_wordlist(list_word)
    # NOTE(review): the unpacking below rebinds `pos`, shadowing the
    # boolean parameter. Harmless today only because the `pos is False`
    # check is reached solely when tag=False (the loop never ran) —
    # wait, the comprehension target has its own scope; the `for` loop
    # under `if tag:` is the one that rebinds `pos`, and that branch
    # returns before `pos` is consulted. Fragile — confirm before edits.
    sent_ner = [
        (_post_process(word), pos, ner) for word, pos, ner in nlp.ner(_pos)
    ]
    if tag:
        # Build an html-like string: open <X> at each B-X, close </X>
        # when the entity ends (next B-, an O tag, or end of sentence).
        # `temp` holds the currently-open entity type ("" = none open).
        temp = ""
        sent = ""
        for idx, (word, pos, ner) in enumerate(sent_ner):
            if ner.startswith("B-") and temp != "":
                # New entity begins while another is open: close, reopen.
                sent += "</" + temp + ">"
                temp = ner[2:]
                sent += "<" + temp + ">"
            elif ner.startswith("B-"):
                temp = ner[2:]
                sent += "<" + temp + ">"
            elif ner == "O" and temp != "":
                # Left the entity: close its tag.
                sent += "</" + temp + ">"
                temp = ""
            # I- tags fall through: the entity stays open.
            sent += word

            # Close a still-open entity at the end of the sentence.
            if idx == len(sent_ner) - 1 and temp != "":
                sent += "</" + temp + ">"

        return sent
    if pos is False:
        return [(word, ner) for word, pos, ner in sent_ner]
    return sent_ner

pythainlp/tokenize/core.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ def word_tokenize(
8686
and combining tokens that are parts of the same named-entity.
8787
* *sefr_cut* - wrapper for
8888
`SEFR CUT <https://github.com/mrpeerat/SEFR_CUT>`_.,
89+
* *tltk* - wrapper for
90+
`TLTK <https://pypi.org/project/tltk/>`_.,
8991
9092
:Note:
9193
- The parameter **custom_dict** can be provided as an argument \
@@ -182,6 +184,10 @@ def word_tokenize(
182184
elif engine == "sefr_cut":
183185
from pythainlp.tokenize.sefr_cut import segment
184186

187+
segments = segment(text)
188+
elif engine == "tltk":
189+
from pythainlp.tokenize.tltk import segment
190+
185191
segments = segment(text)
186192
else:
187193
raise ValueError(
@@ -215,6 +221,7 @@ def sent_tokenize(
215221
* *whitespace+newline* - split by whitespaces and newline.
216222
* *whitespace* - split by whitespaces. Specifiaclly, with \
217223
:class:`regex` pattern ``r" +"``
224+
* *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_.,
218225
:Example:
219226
220227
Split the text based on *whitespace*::
@@ -271,6 +278,10 @@ def sent_tokenize(
271278
segments = re.split(r" +", text, re.U)
272279
elif engine == "whitespace+newline":
273280
segments = text.split()
281+
elif engine == "tltk":
282+
from pythainlp.tokenize.tltk import sent_tokenize as segment
283+
284+
segments = segment(text)
274285
else:
275286
raise ValueError(
276287
f"""Tokenizer \"{engine}\" not found.
@@ -314,6 +325,7 @@ def subword_tokenize(
314325
* *wangchanberta* - SentencePiece from wangchanberta model.
315326
* *dict* - newmm word tokenizer with a syllable dictionary
316327
* *ssg* - CRF syllable segmenter for Thai
328+
* *tltk* - syllable tokenizer from tltk
317329
318330
:Example:
319331
@@ -376,6 +388,8 @@ def subword_tokenize(
376388
)
377389
elif engine == "ssg":
378390
from pythainlp.tokenize.ssg import segment
391+
elif engine == "tltk":
392+
from pythainlp.tokenize.tltk import syllable_tokenize as segment
379393
else:
380394
raise ValueError(
381395
f"""Tokenizer \"{engine}\" not found.

pythainlp/tokenize/tltk.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# -*- coding: utf-8 -*-
2+
from typing import List
3+
from tltk.nlp import word_segment as tltk_segment
4+
from tltk.nlp import syl_segment
5+
6+
7+
def segment(text: str) -> List[str]:
    """
    Tokenize Thai text into words with TLTK's ``word_segment``.

    :param str text: input text; empty or non-string input yields []
    :return: list of word tokens
    :rtype: List[str]
    """
    if not isinstance(text, str) or not text:
        return []
    # Protect real spaces with "<u/>" so tltk's own markers can be
    # stripped afterwards without losing them.
    marked = text.replace(" ", "<u/>")
    flat = tltk_segment(marked).replace("<u/>", " ").replace("<s/>", "")
    tokens = flat.split("|")
    # tltk terminates output with "|", leaving a trailing empty piece.
    if tokens[-1] == "":
        tokens.pop()
    return tokens
16+
17+
18+
def syllable_tokenize(text: str) -> List[str]:
    """
    Tokenize Thai text into syllables with TLTK's ``syl_segment``.

    :param str text: input text; empty or non-string input yields []
    :return: list of syllables
    :rtype: List[str]
    """
    if not isinstance(text, str) or not text:
        return []
    syllables = syl_segment(text).split("~")
    # Drop tltk's trailing sentence marker if present.
    if syllables[-1] == "<s/>":
        syllables.pop()
    return syllables
26+
27+
28+
def sent_tokenize(text: str) -> List[str]:
    """
    Split Thai text into sentences with TLTK's ``word_segment``.

    :param str text: input text; empty or non-string input yields []
    :return: list of sentences
    :rtype: List[str]
    """
    # Bug fix: guard empty/non-string input, consistent with segment()
    # and syllable_tokenize() in this module; previously None raised
    # AttributeError on .replace().
    if not text or not isinstance(text, str):
        return []
    # Protect real spaces with "<u/>" so tltk's markers can be removed
    # without losing them; sentences are delimited by "<s/>".
    text = text.replace(" ", "<u/>")
    _temp = tltk_segment(text).replace("<u/>", " ").replace("|", "")
    _temp = _temp.split("<s/>")
    # tltk terminates output with "<s/>", leaving a trailing empty piece.
    if _temp[-1] == "":
        del _temp[-1]
    return _temp

pythainlp/transliterate/core.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
2323
Transcription issued by Royal Institute of Thailand.
2424
* *thai2rom* - a deep learning-based Thai romanization engine
2525
(require PyTorch).
26+
* *tltk* - TLTK: Thai Language Toolkit
2627
2728
:Example:
2829
::
@@ -35,6 +36,9 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
3536
romanize("สามารถ", engine="thai2rom")
3637
# output: 'samat'
3738
39+
romanize("สามารถ", engine="tltk")
40+
# output: 'samat'
41+
3842
romanize("ภาพยนตร์", engine="royin")
3943
# output: 'phapn'
4044
@@ -47,6 +51,8 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
4751

4852
if engine == "thai2rom":
4953
from pythainlp.transliterate.thai2rom import romanize
54+
elif engine == "tltk":
55+
from pythainlp.transliterate.tltk import romanize
5056
else: # use default engine "royin"
5157
from pythainlp.transliterate.royin import romanize
5258

@@ -67,10 +73,13 @@ def transliterate(
6773
:rtype: str
6874
6975
:Options for engines:
70-
* *icu* - pyicu, based on International Components for Unicode (ICU)
71-
* *ipa* - epitran, output is International Phonetic Alphabet (IPA)
7276
* *thaig2p* - (default) Thai Grapheme-to-Phoneme,
7377
output is IPA (require PyTorch)
78+
* *icu* - pyicu, based on International Components for Unicode (ICU)
79+
* *ipa* - epitran, output is International Phonetic Alphabet (IPA)
80+
* *tltk_g2p* - Thai Grapheme-to-Phoneme from\
81+
`TLTK <https://pypi.org/project/tltk/>`_.,
82+
* *tltk_ipa* - tltk, output is International Phonetic Alphabet (IPA)
7483
7584
:Example:
7685
::
@@ -86,6 +95,12 @@ def transliterate(
8695
transliterate("สามารถ", engine="thaig2p")
8796
# output: 's aː ˩˩˦ . m aː t̚ ˥˩'
8897
98+
transliterate("สามารถ", engine="tltk_ipa")
99+
# output: 'saː5.maːt3'
100+
101+
transliterate("สามารถ", engine="tltk_g2p")
102+
# output: 'saa4~maat2'
103+
89104
transliterate("ภาพยนตร์", engine="icu")
90105
# output: 'p̣hāphyntr̒'
91106
@@ -103,6 +118,10 @@ def transliterate(
103118
from pythainlp.transliterate.pyicu import transliterate
104119
elif engine == "ipa":
105120
from pythainlp.transliterate.ipa import transliterate
121+
elif engine == "tltk_g2p":
122+
from pythainlp.transliterate.tltk import tltk_g2p as transliterate
123+
elif engine == "tltk_ipa":
124+
from pythainlp.transliterate.tltk import tltk_ipa as transliterate
106125
else: # use default engine: "thaig2p"
107126
from pythainlp.transliterate.thaig2p import transliterate
108127

0 commit comments

Comments
 (0)