Skip to content

Commit 4d95830

Browse files
authored
Merge pull request #826 from PyThaiNLP/add-abbreviation_to_full_text
Add pythainlp.util.abbreviation_to_full_text
2 parents 71719f9 + a0bf648 commit 4d95830

File tree

6 files changed

+68
-0
lines changed

6 files changed

+68
-0
lines changed

docs/api/util.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ The :class:`pythainlp.util` contains utility functions, like text conversion and
77
Modules
88
-------
99

10+
.. autofunction:: abbreviation_to_full_text
1011
.. autofunction:: arabic_digit_to_thai_digit
1112
.. autofunction:: bahttext
1213
.. autofunction:: convert_years

docs/notes/installation.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ where ``extras`` can be
3939
- ``wangchanglm`` (to support wangchanglm model)
4040
- ``wsd`` (to support pythainlp.wsd)
4141
- ``el`` (to support pythainlp.el)
42+
- ``abbreviation`` (to support pythainlp.util.abbreviation_to_full_text)
4243
- ``full`` (install everything)
4344

4445
For dependency details, look at `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.

pythainlp/util/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
__all__ = [
2020
"Trie",
21+
"abbreviation_to_full_text",
2122
"arabic_digit_to_thai_digit",
2223
"bahttext",
2324
"convert_years",
@@ -125,3 +126,4 @@
125126
from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa
126127
from pythainlp.util.encoding import tis620_to_utf8
127128
import pythainlp.util.spell_words as spell_words
129+
from pythainlp.util.abbreviation import abbreviation_to_full_text

pythainlp/util/abbreviation.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (C) 2016-2023 PyThaiNLP Project
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""
16+
Thai abbreviation tools
17+
"""
18+
from typing import List, Tuple, Union
19+
20+
21+
def abbreviation_to_full_text(
    text: str, top_k: int = 2
) -> List[Tuple[str, Union[float, None]]]:
    """
    Convert Thai text that contains abbreviations to full text.

    This function uses KhamYo to handle abbreviations.
    See more `KhamYo <https://github.com/wannaphong/KhamYo>`_.

    :param str text: Thai text
    :param int top_k: Top K (passed through to KhamYo unchanged)
    :return: Thai full text with abbreviations expanded, paired with
        cos scores (original text - modified text).
    :rtype: List[Tuple[str, Union[float, None]]]
    :raises ImportError: if the optional ``khamyo`` package
        is not installed.

    :Example:
    ::

        from pythainlp.util import abbreviation_to_full_text

        text = "รร.ของเราน่าอยู่"

        abbreviation_to_full_text(text)
        # output: [
        # ('โรงเรียนของเราน่าอยู่', tensor(0.3734)),
        # ('โรงแรมของเราน่าอยู่', tensor(0.2438))
        # ]
    """
    # khamyo is an optional extra, so it is imported lazily: importing
    # pythainlp.util must keep working when khamyo is not installed.
    try:
        from khamyo import replace as _replace
    except ImportError as err:
        # Single-line message (no stray newlines/indentation) that tells the
        # user exactly how to install the missing dependency.
        raise ImportError(
            "This function needs khamyo. "
            "You can install it with `pip install khamyo` or "
            "`pip install pythainlp[abbreviation]`."
        ) from err
    return _replace(text, top_k=top_k)

setup.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,9 @@
128128
"el":{
129129
"multiel>=0.5"
130130
},
131+
"abbreviation":{
132+
"khamyo>=0.2.0"
133+
},
131134
"full": [
132135
"PyYAML>=5.3.1",
133136
"attacut>=1.0.4",
@@ -162,6 +165,7 @@
162165
"ufal.chu-liu-edmonds>=1.0.2",
163166
"panphon>=0.20.0",
164167
"sentence-transformers>=2.2.2",
168+
"khamyo>=0.2.0",
165169
],
166170
}
167171

tests/test_util.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from pythainlp.corpus.common import _THAI_WORDS_FILENAME
1212
from pythainlp.util import (
1313
Trie,
14+
abbreviation_to_full_text,
1415
arabic_digit_to_thai_digit,
1516
bahttext,
1617
collate,
@@ -851,3 +852,6 @@ def test_spell_word(self):
851852
self.assertEqual(spell_word("เสื้อ"),['สอ', 'เอือ', 'ไม้โท', 'เสื้อ'])
852853
self.assertEqual(spell_word("คน"),['คอ', 'นอ', 'คน'])
853854
self.assertEqual(spell_word("คนดี"),['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี'])
855+
856+
def test_abbreviation_to_full_text(self):
    """Check that abbreviation_to_full_text returns a list."""
    # The expected type is the second argument of assertIsInstance; it must
    # not be passed to abbreviation_to_full_text, where it would be taken
    # as ``top_k``.
    self.assertIsInstance(
        abbreviation_to_full_text("รร.ของเราน่าอยู่"), list
    )

0 commit comments

Comments
 (0)