From d680c3b0484cf3b0df6de2b4aee0a0ece93a8322 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 3 Dec 2023 19:01:23 +0700 Subject: [PATCH 1/5] Add pythainlp.util.thai_to_idn --- docs/api/util.rst | 5 +++++ pythainlp/util/__init__.py | 3 ++- pythainlp/util/encoding.py | 19 +++++++++++++++++++ tests/test_util.py | 4 ++++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/docs/api/util.rst b/docs/api/util.rst index 41b635d93..401169feb 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -218,6 +218,11 @@ Modules The `thai_to_eng` function is a text conversion tool for translating Thai text into its English transliterated form. This is beneficial for rendering Thai words and phrases in an English context. +.. autofunction:: thai_to_idn + :noindex: + + The `thai_to_idn` function is a text conversion tool for translating Thai text into its International Domain Name (IDN) for Thai domain name. + .. autofunction:: thai_word_tone_detector :noindex: diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 55302507b..5ac977ade 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -62,6 +62,7 @@ "thai_strptime", "thai_strftime", "thai_to_eng", + "thai_to_idn", "thai_word_tone_detector", "thaiword_to_date", "thaiword_to_num", @@ -128,7 +129,7 @@ syllable_open_close_detector, ) from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa -from pythainlp.util.encoding import tis620_to_utf8 +from pythainlp.util.encoding import thai_to_idn, tis620_to_utf8 from pythainlp.util import spell_words from pythainlp.util.abbreviation import abbreviation_to_full_text from pythainlp.util.pronounce import rhyme diff --git a/pythainlp/util/encoding.py b/pythainlp/util/encoding.py index 91f18f411..3853dbaac 100644 --- a/pythainlp/util/encoding.py +++ b/pythainlp/util/encoding.py @@ -29,3 +29,22 @@ def tis620_to_utf8(text: str)->str: # output: 'กระทรวงอุตสาหกรรม' """ return text.encode("cp1252", 
"ignore").decode("tis-620") + + +def thai_to_idn(text: str)->str: + """ + Convert Thai text to International Domain Name (IDN) for Thai domain name. + + :param str text: Thai text + :return: Text that uses IDNA encoding + :rtype: str + + :Example: + :: + + from pythainlp.util import thai_to_idn + + thai_to_idn("คนละครึ่ง.com") + # output: 'xn--42caj4e6bk1f5b1j.com' + """ + return text.encode("idna").decode("utf-8") diff --git a/tests/test_util.py b/tests/test_util.py index e45319c99..95cf0c87f 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -45,6 +45,7 @@ thaiword_to_time, time_to_thaiword, thai_to_eng, + thai_to_idn, thaiword_to_num, thai_keyboard_dist, text_to_num, @@ -780,6 +781,9 @@ def test_syllable_open_close_detector(self): self.assertEqual(syllable_open_close_detector("มาก"), "close") self.assertEqual(syllable_open_close_detector("คะ"), "open") + def test_thai_to_idn(self): + self.assertEqual(thai_to_idn("คนละครึ่ง.com"), "xn--42caj4e6bk1f5b1j.com") + def test_thai_word_tone_detector(self): self.assertIsNotNone(thai_word_tone_detector("คนดี")) self.assertEqual( From 031805499780b00ba086f4ffc7bf66fb2373edff Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 3 Dec 2023 19:03:22 +0700 Subject: [PATCH 2/5] Update encoding.py --- pythainlp/util/encoding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pythainlp/util/encoding.py b/pythainlp/util/encoding.py index 3853dbaac..aae067157 100644 --- a/pythainlp/util/encoding.py +++ b/pythainlp/util/encoding.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-def tis620_to_utf8(text: str)->str: +def tis620_to_utf8(text: str) -> str: """ Convert TIS-620 to UTF-8 @@ -31,7 +31,7 @@ def tis620_to_utf8(text: str)->str: return text.encode("cp1252", "ignore").decode("tis-620") -def thai_to_idn(text: str)->str: +def thai_to_idn(text: str) -> str: """ Convert Thai text to International Domain Name (IDN) for Thai domain name. From 75082cc1c716294c86ced0cdfc9a1afb06914bad Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 3 Dec 2023 22:50:55 +0000 Subject: [PATCH 3/5] Update encoding.py --- pythainlp/util/encoding.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/pythainlp/util/encoding.py b/pythainlp/util/encoding.py index aae067157..30a4b8b0f 100644 --- a/pythainlp/util/encoding.py +++ b/pythainlp/util/encoding.py @@ -1,23 +1,12 @@ # -*- coding_utf-8 -*- -# Copyright (C) 2016-2023 PyThaiNLP Project -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 def tis620_to_utf8(text: str) -> str: """ Convert TIS-620 to UTF-8 - :param str text: Text that uses TIS-620 encoding - :return: Text that uses UTF-8 encoding + :param str text: TIS-620 encoded text + :return: UTF-8 encoded text :rtype: str :Example: @@ -33,10 +22,10 @@ def tis620_to_utf8(text: str) -> str: def thai_to_idn(text: str) -> str: """ - Convert Thai text to International Domain Name (IDN) for Thai domain name. 
+ Encode text with Punycode, as used in Internationalized Domain Name (IDN). :param str text: Thai text - :return: Text that uses IDNA encoding + :return: Text in IDNA encoding :rtype: str :Example: From 899d086370e9328f864581e933512707f67465ae Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 3 Dec 2023 22:56:25 +0000 Subject: [PATCH 4/5] Update encoding.py --- pythainlp/util/encoding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pythainlp/util/encoding.py b/pythainlp/util/encoding.py index 30a4b8b0f..17e30f380 100644 --- a/pythainlp/util/encoding.py +++ b/pythainlp/util/encoding.py @@ -22,10 +22,10 @@ def tis620_to_utf8(text: str) -> str: def thai_to_idn(text: str) -> str: """ - Encode text with Punycode, as used in Internationalized Domain Name (IDN). + Encode text with IDNA, as used in Internationalized Domain Name (IDN). :param str text: Thai text - :return: Text in IDNA encoding + :return: IDNA-encoded text :rtype: str :Example: From bf53e0936526cf1078361b3bc9f39006c8cc07fb Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 4 Dec 2023 15:39:16 +0700 Subject: [PATCH 5/5] Change thai_to_idn to to_idna --- docs/api/util.rst | 4 ++-- pythainlp/util/__init__.py | 4 ++-- pythainlp/util/encoding.py | 6 +++--- tests/test_util.py | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/api/util.rst b/docs/api/util.rst index 401169feb..bb7efbfd3 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -218,10 +218,10 @@ Modules The `thai_to_eng` function is a text conversion tool for translating Thai text into its English transliterated form. This is beneficial for rendering Thai words and phrases in an English context. -.. autofunction:: thai_to_idn +.. autofunction:: to_idna :noindex: - The `thai_to_idn` function is a text conversion tool for translating Thai text into its International Domain Name (IDN) for Thai domain name. 
+ The `to_idna` function is a text conversion tool for translating Thai text into its Internationalized Domain Name (IDN) form, as used for Thai domain names. .. autofunction:: thai_word_tone_detector :noindex: diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 5ac977ade..2553f04fc 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -62,7 +62,7 @@ "thai_strptime", "thai_strftime", "thai_to_eng", - "thai_to_idn", + "to_idna", "thai_word_tone_detector", "thaiword_to_date", "thaiword_to_num", @@ -129,7 +129,7 @@ syllable_open_close_detector, ) from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa -from pythainlp.util.encoding import thai_to_idn, tis620_to_utf8 +from pythainlp.util.encoding import to_idna, tis620_to_utf8 from pythainlp.util import spell_words from pythainlp.util.abbreviation import abbreviation_to_full_text from pythainlp.util.pronounce import rhyme diff --git a/pythainlp/util/encoding.py b/pythainlp/util/encoding.py index 17e30f380..8826e4368 100644 --- a/pythainlp/util/encoding.py +++ b/pythainlp/util/encoding.py @@ -20,7 +20,7 @@ def tis620_to_utf8(text: str) -> str: return text.encode("cp1252", "ignore").decode("tis-620") -def thai_to_idn(text: str) -> str: +def to_idna(text: str) -> str: """ Encode text with IDNA, as used in Internationalized Domain Name (IDN). 
@@ -31,9 +31,9 @@ def thai_to_idn(text: str) -> str: :Example: :: - from pythainlp.util import thai_to_idn + from pythainlp.util import to_idna - thai_to_idn("คนละครึ่ง.com") + to_idna("คนละครึ่ง.com") # output: 'xn--42caj4e6bk1f5b1j.com' """ return text.encode("idna").decode("utf-8") diff --git a/tests/test_util.py b/tests/test_util.py index 95cf0c87f..ee02a278c 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -45,7 +45,7 @@ thaiword_to_time, time_to_thaiword, thai_to_eng, - thai_to_idn, + to_idna, thaiword_to_num, thai_keyboard_dist, text_to_num, @@ -781,8 +781,8 @@ def test_syllable_open_close_detector(self): self.assertEqual(syllable_open_close_detector("มาก"), "close") self.assertEqual(syllable_open_close_detector("คะ"), "open") - def test_thai_to_idn(self): - self.assertEqual(thai_to_idn("คนละครึ่ง.com"), "xn--42caj4e6bk1f5b1j.com") + def test_to_idna(self): + self.assertEqual(to_idna("คนละครึ่ง.com"), "xn--42caj4e6bk1f5b1j.com") def test_thai_word_tone_detector(self): self.assertIsNotNone(thai_word_tone_detector("คนดี"))