Skip to content

Commit 7d77cc9

Browse files
authored
Merge pull request #875 from PyThaiNLP/add-thai_to_idn
Add pythainlp.util.to_idn
2 parents d213a8a + 4ef431a commit 7d77cc9

File tree

4 files changed

+32
-3
lines changed

4 files changed

+32
-3
lines changed

docs/api/util.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,11 @@ Modules
218218

219219
The `thai_to_eng` function is a text conversion tool for translating Thai text into its English transliterated form. This is beneficial for rendering Thai words and phrases in an English context.
220220

221+
.. autofunction:: to_idna
222+
:noindex:
223+
224+
The `to_idna` function encodes text with IDNA, converting Thai text into the ASCII-compatible form used in Internationalized Domain Names (IDN), for example to represent a Thai domain name.
225+
221226
.. autofunction:: thai_word_tone_detector
222227
:noindex:
223228

pythainlp/util/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
"thai_strptime",
5252
"thai_strftime",
5353
"thai_to_eng",
54+
"to_idna",
5455
"thai_word_tone_detector",
5556
"thaiword_to_date",
5657
"thaiword_to_num",
@@ -117,7 +118,7 @@
117118
syllable_open_close_detector,
118119
)
119120
from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa
120-
from pythainlp.util.encoding import tis620_to_utf8
121+
from pythainlp.util.encoding import to_idna, tis620_to_utf8
121122
from pythainlp.util import spell_words
122123
from pythainlp.util.abbreviation import abbreviation_to_full_text
123124
from pythainlp.util.pronounce import rhyme

pythainlp/util/encoding.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ def tis620_to_utf8(text: str)->str:
55
"""
66
Convert TIS-620 to UTF-8
77
8-
:param str text: Text that uses TIS-620 encoding
9-
:return: Text that uses UTF-8 encoding
8+
:param str text: TIS-620 encoded text
9+
:return: UTF-8 encoded text
1010
:rtype: str
1111
1212
:Example:
@@ -18,3 +18,22 @@ def tis620_to_utf8(text: str)->str:
1818
# output: 'กระทรวงอุตสาหกรรม'
1919
"""
2020
return text.encode("cp1252", "ignore").decode("tis-620")
21+
22+
23+
def to_idna(text: str) -> str:
24+
"""
25+
Encode text with IDNA, as used in Internationalized Domain Names (IDN).
26+
27+
:param str text: Thai text
28+
:return: IDNA-encoded text
29+
:rtype: str
30+
31+
:Example:
32+
::
33+
34+
from pythainlp.util import to_idna
35+
36+
to_idna("คนละครึ่ง.com")
37+
# output: 'xn--42caj4e6bk1f5b1j.com'
38+
"""
39+
return text.encode("idna").decode("utf-8")

tests/test_util.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
thaiword_to_time,
4646
time_to_thaiword,
4747
thai_to_eng,
48+
to_idna,
4849
thaiword_to_num,
4950
thai_keyboard_dist,
5051
text_to_num,
@@ -780,6 +781,9 @@ def test_syllable_open_close_detector(self):
780781
self.assertEqual(syllable_open_close_detector("มาก"), "close")
781782
self.assertEqual(syllable_open_close_detector("คะ"), "open")
782783

784+
def test_to_idna(self):
785+
self.assertEqual(to_idna("คนละครึ่ง.com"), "xn--42caj4e6bk1f5b1j.com")
786+
783787
def test_thai_word_tone_detector(self):
784788
self.assertIsNotNone(thai_word_tone_detector("คนดี"))
785789
self.assertEqual(

0 commit comments

Comments
 (0)