Skip to content

Commit 484c034

Browse files
authored
Merge branch 'dev' into add-pythainlp-chat
2 parents 5522ac2 + e4f9c9b commit 484c034

19 files changed

+995
-15
lines changed

docker_requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ sentencepiece==0.1.91
1212
ssg==0.0.8
1313
torch==1.13.1
1414
fastai==1.0.61
15-
transformers==4.22.1
15+
transformers==4.30.0
1616
phunspell==0.1.6
1717
spylls==0.1.5
1818
symspellpy==6.7.6
@@ -37,3 +37,4 @@ ufal.chu-liu-edmonds==1.0.2
3737
wtpsplit==1.0.1
3838
fastcoref==2.1.6
3939
panphon==0.20.0
40+
sentence-transformers==2.2.2

docs/api/corpus.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@ Modules
1616
.. autofunction:: download
1717
.. autofunction:: remove
1818
.. autofunction:: provinces
19+
.. autofunction:: thai_dict
1920
.. autofunction:: thai_stopwords
2021
.. autofunction:: thai_words
22+
.. autofunction:: thai_wsd_dict
2123
.. autofunction:: thai_orst_words
2224
.. autofunction:: thai_syllables
2325
.. autofunction:: thai_negations

docs/api/util.rst

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,26 +11,29 @@ Modules
1111
.. autofunction:: bahttext
1212
.. autofunction:: convert_years
1313
.. autofunction:: collate
14+
.. autofunction:: count_thai_chars
15+
.. autofunction:: countthai
1416
.. autofunction:: dict_trie
1517
.. autofunction:: digit_to_text
1618
.. autofunction:: display_thai_char
1719
.. autofunction:: emoji_to_thai
1820
.. autofunction:: eng_to_thai
1921
.. autofunction:: find_keyword
20-
.. autofunction:: countthai
21-
.. autofunction:: count_thai_chars
22+
.. autofunction:: ipa_to_rtgs
2223
.. autofunction:: is_native_thai
2324
.. autofunction:: isthai
2425
.. autofunction:: isthaichar
26+
.. autofunction:: maiyamok
27+
.. autofunction:: nectec_to_ipa
2528
.. autofunction:: normalize
2629
.. autofunction:: now_reign_year
2730
.. autofunction:: num_to_thaiword
28-
.. autofunction:: maiyamok
2931
.. autofunction:: rank
3032
.. autofunction:: reign_year_to_ad
3133
.. autofunction:: remove_dangling
3234
.. autofunction:: remove_dup_spaces
3335
.. autofunction:: remove_repeat_vowels
36+
.. autofunction:: remove_tone_ipa
3437
.. autofunction:: remove_tonemark
3538
.. autofunction:: remove_zw
3639
.. autofunction:: reorder_vowels
@@ -40,20 +43,19 @@ Modules
4043
.. autofunction:: text_to_arabic_digit
4144
.. autofunction:: text_to_num
4245
.. autofunction:: text_to_thai_digit
46+
.. autofunction:: thai_digit_to_arabic_digit
4347
.. autofunction:: thai_strftime
4448
.. autofunction:: thai_strptime
4549
.. autofunction:: thai_to_eng
4650
.. autofunction:: thai_word_tone_detector
47-
.. autofunction:: thai_digit_to_arabic_digit
4851
.. autofunction:: thaiword_to_date
4952
.. autofunction:: thaiword_to_num
5053
.. autofunction:: thaiword_to_time
5154
.. autofunction:: time_to_thaiword
5255
.. autofunction:: tis620_to_utf8
5356
.. autofunction:: tone_detector
5457
.. autofunction:: words_to_num
55-
.. autofunction:: nectec_to_ipa
56-
.. autofunction:: ipa_to_rtgs
57-
.. autofunction:: remove_tone_ipa
58+
.. autofunction:: pythainlp.util.spell_words.spell_syllable
59+
.. autofunction:: pythainlp.util.spell_words.spell_word
5860
.. autoclass:: Trie
5961
:members:

docs/api/wsd.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
.. currentmodule:: pythainlp.wsd
2+
3+
pythainlp.wsd
4+
=============
5+
6+
The :mod:`pythainlp.wsd` module contains a function for getting the sense of a word, for Thai Word Sense Disambiguation (WSD).
7+
8+
9+
Modules
10+
-------
11+
12+
.. autofunction:: get_sense

docs/notes/installation.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ where ``extras`` can be
3737
- ``dependency_parsing`` (to support dependency parsing with all engines)
3838
- ``coreference_resolution`` (to support coreference resolution with all engines)
3939
- ``wangchanglm`` (to support wangchanglm model)
40+
- ``wsd`` (to support pythainlp.wsd)
4041
- ``full`` (install everything)
4142

4243
For dependency details, look at `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.

notebooks/test_wsd.ipynb

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"id": "70e6b5ba-063d-4e53-a312-2380b49bc3a9",
7+
"metadata": {
8+
"tags": []
9+
},
10+
"outputs": [],
11+
"source": [
12+
"from pythainlp.wsd import get_sense"
13+
]
14+
},
15+
{
16+
"cell_type": "code",
17+
"execution_count": 9,
18+
"id": "2ef43b65-5df9-42e3-a712-0e60ca64ea16",
19+
"metadata": {
20+
"tags": []
21+
},
22+
"outputs": [
23+
{
24+
"name": "stdout",
25+
"output_type": "stream",
26+
"text": [
27+
"[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684)]\n"
28+
]
29+
}
30+
],
31+
"source": [
32+
"print(get_sense(\"เขากำลังอบขนมคุกกี้\",\"คุกกี้\"))"
33+
]
34+
},
35+
{
36+
"cell_type": "code",
37+
"execution_count": 10,
38+
"id": "6aafefdf-4658-4e35-b69f-7d2b54e34d33",
39+
"metadata": {
40+
"tags": []
41+
},
42+
"outputs": [
43+
{
44+
"name": "stdout",
45+
"output_type": "stream",
46+
"text": [
47+
"[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232)]\n"
48+
]
49+
}
50+
],
51+
"source": [
52+
"print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คุกกี้\"))"
53+
]
54+
},
55+
{
56+
"cell_type": "code",
57+
"execution_count": 4,
58+
"id": "b0ee35fc-f26e-4bce-b6fa-0e1efc863ae4",
59+
"metadata": {},
60+
"outputs": [
61+
{
62+
"name": "stdout",
63+
"output_type": "stream",
64+
"text": [
65+
"None\n"
66+
]
67+
}
68+
],
69+
"source": [
70+
"print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คน\"))"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": 5,
76+
"id": "32fa3fe9-0e1a-4176-b8f3-18d666eb3162",
77+
"metadata": {
78+
"tags": []
79+
},
80+
"outputs": [],
81+
"source": [
82+
"from pythainlp.corpus import get_corpus_path, thai_wsd_dict"
83+
]
84+
},
85+
{
86+
"cell_type": "code",
87+
"execution_count": 6,
88+
"id": "0f88ff4c-06db-4cba-8086-4bb2160bead0",
89+
"metadata": {
90+
"tags": []
91+
},
92+
"outputs": [],
93+
"source": [
94+
"_w=thai_wsd_dict()"
95+
]
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": 7,
100+
"id": "83642893-d9a6-4271-a1b7-5e57638a74d4",
101+
"metadata": {
102+
"tags": []
103+
},
104+
"outputs": [
105+
{
106+
"data": {
107+
"text/plain": [
108+
"dict_keys(['word', 'meaning'])"
109+
]
110+
},
111+
"execution_count": 7,
112+
"metadata": {},
113+
"output_type": "execute_result"
114+
}
115+
],
116+
"source": [
117+
"_w.keys()"
118+
]
119+
},
120+
{
121+
"cell_type": "code",
122+
"execution_count": 8,
123+
"id": "bb67c468-ce65-4581-adc6-832d70cfabab",
124+
"metadata": {
125+
"tags": []
126+
},
127+
"outputs": [
128+
{
129+
"data": {
130+
"text/plain": [
131+
"('เดิน', ['ยกเท้าก้าวไป', 'เคลื่อนไปด้วยกำลังต่าง ๆ'])"
132+
]
133+
},
134+
"execution_count": 8,
135+
"metadata": {},
136+
"output_type": "execute_result"
137+
}
138+
],
139+
"source": [
140+
"_w[\"word\"][0],_w[\"meaning\"][0]"
141+
]
142+
},
143+
{
144+
"cell_type": "code",
145+
"execution_count": null,
146+
"id": "27fbe522-019f-4157-a9a8-50ae62b50727",
147+
"metadata": {},
148+
"outputs": [],
149+
"source": []
150+
}
151+
],
152+
"metadata": {
153+
"kernelspec": {
154+
"display_name": "Python 3 (ipykernel)",
155+
"language": "python",
156+
"name": "python3"
157+
},
158+
"language_info": {
159+
"codemirror_mode": {
160+
"name": "ipython",
161+
"version": 3
162+
},
163+
"file_extension": ".py",
164+
"mimetype": "text/x-python",
165+
"name": "python",
166+
"nbconvert_exporter": "python",
167+
"pygments_lexer": "ipython3",
168+
"version": "3.8.10"
169+
}
170+
},
171+
"nbformat": 4,
172+
"nbformat_minor": 5
173+
}

pythainlp/corpus/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,15 @@
3232
"get_corpus_path",
3333
"provinces",
3434
"remove",
35+
"thai_dict",
3536
"thai_family_names",
3637
"thai_female_names",
3738
"thai_male_names",
3839
"thai_negations",
3940
"thai_stopwords",
4041
"thai_syllables",
4142
"thai_words",
43+
"thai_wsd_dict",
4244
"thai_orst_words",
4345
"path_pythainlp_corpus",
4446
"get_path_folder_corpus",
@@ -112,4 +114,6 @@ def corpus_db_path() -> str:
112114
thai_syllables,
113115
thai_words,
114116
thai_orst_words,
117+
thai_dict,
118+
thai_wsd_dict
115119
)

pythainlp/corpus/common.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,13 @@
2626
"thai_stopwords",
2727
"thai_syllables",
2828
"thai_words",
29+
"thai_dict",
30+
"thai_wsd_dict",
2931
]
3032

3133
from typing import FrozenSet, List, Union
3234

33-
from pythainlp.corpus import get_corpus
35+
from pythainlp.corpus import get_corpus, get_corpus_path
3436

3537
_THAI_COUNTRIES = set()
3638
_THAI_COUNTRIES_FILENAME = "countries_th.txt"
@@ -60,6 +62,9 @@
6062

6163
_THAI_ORST_WORDS = set()
6264

65+
_THAI_DICT = {}
66+
_THAI_WSD_DICT = {}
67+
6368

6469
def countries() -> FrozenSet[str]:
6570
"""
@@ -256,3 +261,51 @@ def thai_male_names() -> FrozenSet[str]:
256261
_THAI_MALE_NAMES = get_corpus(_THAI_MALE_NAMES_FILENAME)
257262

258263
return _THAI_MALE_NAMES
264+
265+
266+
def thai_dict() -> dict:
    """
    Return Thai dictionary with definition from wiktionary.
    \n(See: `thai_dict\
    <https://pythainlp.github.io/pythainlp-corpus/thai_dict.html>`_)

    The corpus file is read once as CSV; the result is cached in the
    module-level ``_THAI_DICT`` and the same object is returned on
    subsequent calls.

    :return: dict with two parallel lists: ``"word"`` (the value of the
             ``word`` column of each row) and ``"meaning"`` (the value of
             the ``meaning`` column of the same row, as an unparsed string)
    :rtype: dict
    """
    global _THAI_DICT
    if not _THAI_DICT:
        import csv

        words = []
        meanings = []
        # newline="" is what the csv module expects, so that it can handle
        # line endings (including those inside quoted fields) by itself.
        with open(
            get_corpus_path("thai_dict"), newline="", encoding="utf-8"
        ) as csvfile:
            reader = csv.DictReader(csvfile, delimiter=",")
            for row in reader:
                words.append(row["word"])
                meanings.append(row["meaning"])

        # Publish the cache only after the whole file was read, so a failed
        # load does not leave a non-empty but useless cache behind.
        _THAI_DICT = {"word": words, "meaning": meanings}

    return _THAI_DICT
286+
287+
288+
def thai_wsd_dict() -> dict:
    """
    Return Thai Word Sense Disambiguation dictionary
    with definition from wiktionary.
    \n(See: `thai_dict\
    <https://pythainlp.github.io/pythainlp-corpus/thai_dict.html>`_)

    Built from :func:`thai_dict`. Each ``meaning`` string is parsed as a
    Python dict literal, the lists found in its values are merged into one
    list of distinct definitions, and only words with more than one
    distinct definition are kept. The result is cached in the module-level
    ``_THAI_WSD_DICT``.

    :return: dict with two parallel lists: ``"word"`` (the ambiguous words)
             and ``"meaning"`` (for each word, the list of its distinct
             definitions in order of first appearance)
    :rtype: dict
    """
    global _THAI_WSD_DICT
    if not _THAI_WSD_DICT:
        from ast import literal_eval

        source = thai_dict()
        words = []
        meanings = []
        for word, raw_meaning in zip(source["word"], source["meaning"]):
            # literal_eval only accepts Python literals, so corpus content
            # can never execute code the way eval() would allow.
            senses = []
            for definitions in literal_eval(raw_meaning).values():
                senses.extend(definitions)
            # De-duplicate while keeping first-seen order; list(set(...))
            # would give an order that changes between interpreter runs.
            senses = list(dict.fromkeys(senses))
            if len(senses) > 1:
                words.append(word)
                meanings.append(senses)

        # Publish the cache only once it has been fully built.
        _THAI_WSD_DICT = {"word": words, "meaning": meanings}

    return _THAI_WSD_DICT

0 commit comments

Comments
 (0)