Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api/util.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Modules
.. autofunction:: remove_tonemark
.. autofunction:: remove_zw
.. autofunction:: reorder_vowels
.. autofunction:: sound_syllable
.. autofunction:: text_to_arabic_digit
.. autofunction:: text_to_num
.. autofunction:: text_to_thai_digit
Expand Down
2 changes: 2 additions & 0 deletions pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
"time_to_thaiword",
"text_to_num",
"words_to_num",
"sound_syllable",
]

from pythainlp.util.collate import collate
Expand Down Expand Up @@ -89,3 +90,4 @@
from pythainlp.util.time import thai_time, thaiword_to_time, time_to_thaiword
from pythainlp.util.trie import Trie, dict_trie
from pythainlp.util.wordtonum import thaiword_to_num, text_to_num, words_to_num
from pythainlp.util.syllable import sound_syllable
113 changes: 113 additions & 0 deletions pythainlp/util/syllable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# -*- coding: utf-8 -*-
"""
Syllable tools
"""
import re
from pythainlp import thai_consonants

# Maps each Thai spelling (final-consonant) class name to the consonants
# that belong to it.
spelling_class = {
    "กง": list("ง"),
    "กม": list("ม"),
    "เกย": list("ย"),
    "เกอว": list("ว"),
    "กน": list("นญณรลฬ"),
    "กก": list("กขคฆ"),
    "กด": list("ดจชซฎฏฐฑฒตถทธศษส"),
    "กบ": list("บปภพฟ"),
}

# Every Thai consonant except "อ".
thai_consonants_all = list(thai_consonants)
thai_consonants_all.remove("อ")

# Flat list of every consonant that appears in some spelling class.
_temp = [
    consonant
    for members in spelling_class.values()
    for consonant in members
]
# Consonants that belong to no spelling class.
not_spelling_class = [
    consonant for consonant in thai_consonants_all if consonant not in _temp
]

# vowel's short sound
short = "ะัิึุ"
re_short = re.compile("เ(.*)ะ|แ(.*)ะ|เ(.*)อะ|โ(.*)ะ|เ(.*)าะ", re.U)
pattern = re.compile("เ(.*)า", re.U)  # เ-า is live syllable

# Final consonants that make a live syllable.
# Built with a comprehension so no loop variable leaks into the module
# namespace.
_check_1 = [
    consonant
    for class_name in ("กง", "กน", "กม", "เกย", "เกอว")
    for consonant in spelling_class[class_name]
]
# Final consonants that make a dead syllable.
_check_2 = spelling_class["กก"] + spelling_class["กบ"] + spelling_class["กด"]


def sound_syllable(syllable: str) -> str:
    """
    Sound syllable classification

    Classify a Thai syllable as a live syllable or a dead syllable,
    based on its vowel characters and its last consonant.

    Syllables shorter than two characters are always reported as dead.
    A syllable that contains no consonant other than "อ" is classified
    from its vowels alone instead of raising an error.

    :param str syllable: Thai syllable
    :return: syllable's type (live or dead)
    :rtype: str

    :Example:
    ::

        from pythainlp.util import sound_syllable

        print(sound_syllable("มา"))
        # output: live

        print(sound_syllable("เลข"))
        # output: dead
    """
    # Check the length first so that an empty string (or a lone vowel mark)
    # never reaches the consonant lookup below.
    if len(syllable) < 2:
        return "dead"

    chars = set(syllable)
    consonants = [c for c in syllable if c in thai_consonants_all]
    # The last consonant found is treated as the spelling (final) consonant.
    # None means the syllable has no consonant besides "อ"; None is in
    # neither _check_1 nor _check_2, so only the vowel rules apply.
    spelling_consonant = consonants[-1] if consonants else None

    has_long_vowel = bool(chars & set("าีืแูโ"))
    has_leading_e = "เ" in chars
    has_special_vowel = bool(chars & set("ำใไ"))
    is_sara_ao = pattern.search(syllable) is not None  # เ-า
    matches_short_form = re_short.search(syllable) is not None
    has_short_vowel = matches_short_form or bool(chars & set(short))

    # Dead final consonant with no long or special vowel in the syllable.
    if (
        spelling_consonant in _check_2
        and not has_long_vowel
        and not has_leading_e
        and not has_special_vowel
        and not is_sara_ao
    ):
        return "dead"

    if has_long_vowel:
        if not matches_short_form and (
            spelling_consonant in _check_1
            # The syllable does not end with its last consonant.
            or spelling_consonant != syllable[-1]
        ):
            return "live"
        if spelling_consonant in _check_2:
            return "dead"
        if has_short_vowel:
            return "dead"
        return "live"

    if has_special_vowel:
        return "live"  # if these vowel's long sound are live syllable

    if is_sara_ao:  # if it is เ-า
        return "live"

    if spelling_consonant in _check_1:
        # A short vowel with only one consonant means that consonant is
        # the initial one, not a final one.
        if has_short_vowel and len(consonants) < 2:
            return "dead"
        return "live"

    # Everything else, including any remaining short-vowel syllable.
    return "dead"
42 changes: 42 additions & 0 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
thai_keyboard_dist,
text_to_num,
words_to_num,
sound_syllable,
)


Expand Down Expand Up @@ -661,3 +662,44 @@ def test_emoji_to_thai(self):
emoji_to_thai("🇹🇭 นี่คือธงประเทศไทย"),
":ธง_ไทย: นี่คือธงประเทศไทย",
)

def test_sound_syllable(self):
    """Check sound_syllable() against known live and dead syllables."""
    cases = [
        ("มา", "live"),
        ("ดู", "live"),
        ("ปู", "live"),
        ("เวลา", "live"),
        ("ปี", "live"),
        ("จำ", "live"),
        ("น้ำ", "live"),
        ("ใช่", "live"),
        ("เผ่า", "live"),
        ("เสา", "live"),
        ("ไป", "live"),
        ("จริง", "live"),
        ("กิน", "live"),
        ("กำ", "live"),
        ("สาว", "live"),
        ("ฉุย", "live"),
        ("ธุ", "dead"),
        ("ระ", "dead"),
        ("กะ", "dead"),
        ("ทิ", "dead"),
        ("เกะ", "dead"),
        ("บท", "dead"),
        ("บาท", "dead"),
        ("ลาภ", "dead"),
        ("เมฆ", "dead"),
        ("เลข", "dead"),
        ("ธูป", "dead"),
        ("บ", "dead"),
        ("บ่", "dead"),
        ("ก็", "dead"),
        ("เพราะ", "dead"),
        ("เกาะ", "dead"),
        ("แคะ", "dead"),
    ]
    for syllable, expected in cases:
        # subTest reports which syllable failed and keeps checking the
        # remaining cases instead of stopping at the first mismatch.
        with self.subTest(syllable=syllable):
            self.assertEqual(sound_syllable(syllable), expected)