From 410b4a8b0708ca5fea838529de41a3825bb378cf Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 27 Jun 2023 20:35:37 +0700 Subject: [PATCH 1/2] Add pythainlp.util.tis620_to_utf8 --- docs/api/util.rst | 1 + pythainlp/util/__init__.py | 2 ++ pythainlp/util/encoding.py | 29 +++++++++++++++++++++++++++++ tests/test_util.py | 4 ++++ 4 files changed, 36 insertions(+) create mode 100644 pythainlp/util/encoding.py diff --git a/docs/api/util.rst b/docs/api/util.rst index 3854788ba..491c7579e 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -49,6 +49,7 @@ Modules .. autofunction:: thaiword_to_num .. autofunction:: thaiword_to_time .. autofunction:: time_to_thaiword +.. autofunction:: tis620_to_utf8 .. autofunction:: tone_detector .. autofunction:: words_to_num .. autofunction:: nectec_to_ipa diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index aa8ef370d..c468251ac 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -65,6 +65,7 @@ "nectec_to_ipa", "ipa_to_rtgs", "remove_tone_ipa", + "tis620_to_utf8", ] from pythainlp.util.collate import collate @@ -121,3 +122,4 @@ syllable_open_close_detector, ) from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa +from pythainlp.util.encoding import tis620_to_utf8 diff --git a/pythainlp/util/encoding.py b/pythainlp/util/encoding.py new file mode 100644 index 000000000..ce1133bff --- /dev/null +++ b/pythainlp/util/encoding.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def tis620_to_utf8(text: str) -> str:
    """
    Repair Thai text whose TIS-620 bytes were mis-decoded as CP1252.

    The input is a ``str`` of "mojibake": Latin-looking characters that
    appear when TIS-620 encoded Thai bytes are read as Windows-1252.
    The function maps each character back to its CP1252 byte and then
    decodes those bytes as TIS-620. The result is an ordinary Python
    ``str``; despite the function name, no UTF-8 bytes are produced here.

    The conversion is best-effort and never raises for ``str`` input:

    * characters that have no CP1252 byte (including text that is
      already proper Thai) are dropped;
    * bytes that TIS-620 leaves unassigned (for example ``0xDB``,
      produced by "Û") are dropped as well.

    :param str text: Mojibake text, i.e. TIS-620 data decoded as CP1252
    :return: The recovered Thai text
    :rtype: str

    :Example:
    ::

        from pythainlp.util import tis620_to_utf8

        tis620_to_utf8("¡ÃÐ·ÃÇ§ÍØµÊÒË¡ÃÃÁ")
        # output: 'กระทรวงอุตสาหกรรม'
    """
    # Step 1: recover the original byte values. "ignore" keeps this
    # best-effort for characters CP1252 cannot represent.
    raw_bytes = text.encode("cp1252", "ignore")
    # Step 2: reinterpret those bytes as TIS-620. The same "ignore" policy
    # is used so that unassigned TIS-620 bytes do not raise
    # UnicodeDecodeError while un-encodable characters are silently skipped.
    return raw_bytes.decode("tis-620", "ignore")