diff --git a/docs/api/util.rst b/docs/api/util.rst
index 1979e44ef..9a1554707 100644
--- a/docs/api/util.rst
+++ b/docs/api/util.rst
@@ -258,6 +258,15 @@ Modules
 
     The `words_to_num` function is a numeral conversion utility that translates Thai word numerals into numerical form. It is important for numerical data extraction and computation.
 
+.. autofunction:: spelling
+    :noindex:
+
+    The `spelling` function is a text processing tool for spelling Thai words.
+
+.. autofunction:: thai_consonant_to_spelling
+
+.. autofunction:: tone_to_spelling
+
 .. autofunction:: pythainlp.util.spell_words.spell_syllable
     :noindex:
 
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index f101ffe36..7613257a6 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -44,6 +44,7 @@
     "reorder_vowels",
     "rhyme",
     "sound_syllable",
+    "spelling",
     "spell_words",
     "syllable_length",
     "syllable_open_close_detector",
@@ -51,6 +52,7 @@
     "text_to_num",
     "text_to_thai_digit",
     "th_zodiac",
+    "thai_consonant_to_spelling",
     "thai_digit_to_arabic_digit",
     "thai_keyboard_dist",
     "thai_strptime",
@@ -65,6 +67,7 @@
     "to_idna",
     "to_lunar_date",
     "tone_detector",
+    "tone_to_spelling",
     "words_to_num",
 ]
 
@@ -134,4 +137,9 @@
     syllable_open_close_detector,
     tone_detector,
 )
-from pythainlp.util.pronounce import rhyme
+from pythainlp.util.pronounce import (
+    rhyme,
+    spelling,
+    thai_consonant_to_spelling,
+    tone_to_spelling,
+)
diff --git a/pythainlp/util/pronounce.py b/pythainlp/util/pronounce.py
index 75909f2b9..0998c81f4 100644
--- a/pythainlp/util/pronounce.py
+++ b/pythainlp/util/pronounce.py
@@ -3,10 +3,14 @@
 # SPDX-FileType: SOURCE
 # SPDX-License-Identifier: Apache-2.0
 from typing import List
+import re
 
 from pythainlp.corpus import thai_words
 from pythainlp.khavee import KhaveeVerifier
 from pythainlp.tokenize import syllable_tokenize
+from pythainlp.tokenize import Tokenizer
+from pythainlp import thai_consonants, thai_tonemarks
+from pythainlp.util import remove_tonemark
 
 kv = KhaveeVerifier()
 all_thai_words_dict = None
@@ -30,7 +34,7 @@ def rhyme(word: str) -> List[str]:
     """
     global all_thai_words_dict
     list_sumpus = []
-    if all_thai_words_dict == None:
+    if all_thai_words_dict is None:
         all_thai_words_dict = [
             i for i in list(thai_words()) if len(syllable_tokenize(i)) == 1
         ]
@@ -38,3 +42,151 @@ def rhyme(word: str) -> List[str]:
         if kv.is_sumpus(word, i) and i != word:
             list_sumpus.append(i)
     return sorted(list_sumpus)
+
+
+# Written forms of the Thai vowels, with "อ" standing in for the consonant.
+# spelling() combines them with the consonants as its tokenizer dictionary.
+thai_vowel = ''.join((
+    "อะ,อา,อิ,อี,อึ,อื,อุ,อู,เอะ,เอ,แอะ,แอ,เอียะ,เอีย,เอือะ,เอือ,อัวะ,อัว,โอะ,",
+    "โอ,เอาะ,ออ,เออะ,เออ,อำ,ใอ,ไอ,เอา,ฤ,ฤๅ,ฦ,ฦๅ"
+)).split(",")
+# (regex, replacement) pairs that split a "consonant + vowel" cluster into
+# the consonant followed by the vowel written on "อ". Sorted longest regex
+# first because spelling() applies only the first regex that matches.
+thai_vowel_all = [
+    ("([ก-ฮ])ะ", "\\1อะ"),
+    ("([ก-ฮ])า", "\\1อา"),
+    ("อิ".replace("อ", "([ก-ฮ])"), "อิ".replace("อ", "\\1อ")),
+    ("อี".replace("อ", "([ก-ฮ])"), "อี".replace("อ", "\\1อ")),
+    ("อึ".replace("อ", "([ก-ฮ])", 1), "อึ".replace("อ", "\\1อ", 1)),
+    ("อื".replace("อ", "([ก-ฮ])", 1), "อื".replace("อ", "\\1อ", 1)),
+    ("อุ".replace("อ", "([ก-ฮ])", 1), "อุ".replace("อ", "\\1อ", 1)),
+    ("อู".replace("อ", "([ก-ฮ])", 1), "อู".replace("อ", "\\1อ", 1)),
+    ("เอะ".replace("อ", "([ก-ฮ])", 1), "\\1เอะ"),
+    ("เอ".replace("อ", "([ก-ฮ])", 1), "\\1เอ"),
+    ("แอะ".replace("อ", "([ก-ฮ])", 1), "\\1แอะ"),
+    ("แอ".replace("อ", "([ก-ฮ])", 1), "\\1แอ"),
+    ("เอียะ".replace("อ", "([ก-ฮ])", 1), "\\1เอียะ"),
+    ("เอีย".replace("อ", "([ก-ฮ])", 1), "\\1เอีย"),
+    ("เอือะ".replace("อ", "([ก-ฮ])", 1), "\\1เอือะ"),
+    ("เอือ".replace("อ", "([ก-ฮ])", 1), "\\1เอือ"),
+    ("อัวะ".replace("อ", "([ก-ฮ])", 1), "\\1อัวะ"),
+    ("อัว".replace("อ", "([ก-ฮ])", 1), "\\1อัว"),
+    ("โอะ".replace("อ", "([ก-ฮ])", 1), "\\1โอะ"),
+    ("โอ".replace("อ", "([ก-ฮ])", 1), "\\1โอ"),
+    ("เอาะ".replace("อ", "([ก-ฮ])", 1), "\\1เอาะ"),
+    ("ออ".replace("อ", "([ก-ฮ])", 1), "\\1ออ"),
+    ("เออะ".replace("อ", "([ก-ฮ])", 1), "\\1เออะ"),
+    ("เออ".replace("อ", "([ก-ฮ])", 1), "\\1เออ"),
+    ("อำ".replace("อ", "([ก-ฮ])", 1), "\\1อำ"),
+    ("ใอ".replace("อ", "([ก-ฮ])", 1), "\\1ใอ"),
+    ("ไอ".replace("อ", "([ก-ฮ])", 1), "\\1ไอ"),
+    ("เอา".replace("อ", "([ก-ฮ])", 1), "\\1เอา"),
+    ("อั".replace("อ", "([ก-ฮ])", 1), "\\1อะ"),
+]
+thai_vowel_all.sort(key=lambda t: len(t[0]), reverse=True)
+
+
+def thai_consonant_to_spelling(c: str) -> str:
+    """
+    Thai consonants to spelling
+
+    :param str c: A Thai consonant
+    :return: the consonant followed by "อ", or the input unchanged
+        if it is not a single Thai consonant
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.util import thai_consonant_to_spelling
+
+        print(thai_consonant_to_spelling("ก"))
+        # output: กอ
+    """
+    if len(c) == 1 and c in thai_consonants:
+        return c + "อ"
+    return c
+
+
+def tone_to_spelling(t: str) -> str:
+    """
+    Thai tonemarks to spelling
+
+    :param str t: A Thai tone mark
+    :return: the name of the tone mark, or the input unchanged
+        if it is not one of the four tone marks
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.util import tone_to_spelling
+
+        print(tone_to_spelling("่"))  # ไม้เอก
+        # output: ไม้เอก
+    """
+    if t == "่":
+        return "ไม้เอก"
+    elif t == "้":
+        return "ไม้โท"
+    elif t == "๊":
+        return "ไม้ตรี"
+    elif t == "๋":
+        return "ไม้จัตวา"
+    return t
+
+
+def spelling(word: str) -> List[str]:
+    """
+    Thai word to spelling
+
+    This function supports Thai root words only.
+
+    :param str word: A Thai word
+    :return: the spelling steps of the word; an empty list if the
+        input is empty or not a string
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.util import spelling
+
+        print(spelling("เรียน"))
+        # output: ['รอ', 'เอีย', 'นอ', 'เรียน']
+
+        print(spelling("เฝ้า"))
+        # output: ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']
+    """
+    if not word or not isinstance(word, str):
+        return []
+    thai_vowel_tokenizer = Tokenizer(
+        custom_dict=thai_vowel + list(thai_consonants),
+        engine="longest"
+    )
+    # Work on the word without tone marks and without "็" so that the
+    # vowel patterns can match.
+    word_pre = remove_tonemark(word).replace("็", "")
+    tone = [tone_to_spelling(i) for i in word if i in thai_tonemarks]
+    word_output = word_pre
+    for pattern, replacement in thai_vowel_all:
+        if re.search(pattern, word_pre):
+            if "็" in word and pattern == "เ([ก-ฮ])":
+                # "เ" combined with "็" is spelled with the vowel "เอะ".
+                word_output = re.sub(pattern, "\\1เอะ", word_pre)
+            else:
+                word_output = re.sub(pattern, replacement, word_pre)
+            break
+    list_word_output = thai_vowel_tokenizer.word_tokenize(word_output)
+    output = [
+        s for s in map(thai_consonant_to_spelling, list_word_output)
+        if '์' not in s
+    ]
+    if word_pre == word:
+        return output + [word]
+    if tone:
+        return output + [word_pre, tone[0], word]
+    if "็" in word:
+        return output + [word]
+    return output + [word_pre, word]
diff --git a/tests/core/test_util.py b/tests/core/test_util.py
index 5b8b12a91..73fcce8b9 100644
--- a/tests/core/test_util.py
+++ b/tests/core/test_util.py
@@ -66,6 +66,7 @@
     to_lunar_date,
     tone_detector,
     words_to_num,
+    spelling,
 )
 from pythainlp.util.morse import morse_decode, morse_encode
 
@@ -844,6 +845,18 @@ def test_th_zodiac(self):
     # def test_abbreviation_to_full_text(self):
     #     self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list))
 
+    def test_spelling(self):
+        self.assertEqual(spelling([]), [])
+        self.assertEqual(spelling("เรียน"), ['รอ', 'เอีย', 'นอ', 'เรียน'])
+        self.assertEqual(
+            spelling("เฝ้า"), ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']
+        )
+        self.assertEqual(spelling("คน"), ['คอ', 'นอ', 'คน'])
+        self.assertEqual(spelling("กัน"), ['กอ', 'อะ', 'นอ', 'กัน'])
+        self.assertEqual(
+            spelling("กั้น"), ['กอ', 'อะ', 'นอ', 'กัน', 'ไม้โท', 'กั้น']
+        )
+
     def test_longest_common_subsequence(self):
         self.assertEqual(longest_common_subsequence("ABCBDAB", "BDCAB"), "BDAB")
         self.assertEqual(longest_common_subsequence("AGGTAB", "GXTXAYB"), "GTAB")