Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api/util.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Modules
.. autofunction:: eng_to_thai
.. autofunction:: find_keyword
.. autofunction:: countthai
.. autofunction:: count_thai_chars
.. autofunction:: is_native_thai
.. autofunction:: isthai
.. autofunction:: isthaichar
Expand Down
2 changes: 2 additions & 0 deletions pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"bahttext",
"collate",
"countthai",
"count_thai_chars",
"dict_trie",
"digit_to_text",
"display_thai_char",
Expand Down Expand Up @@ -83,6 +84,7 @@
from pythainlp.util.strftime import thai_strftime
from pythainlp.util.thai import (
countthai,
count_thai_chars,
display_thai_char,
isthai,
isthaichar,
Expand Down
84 changes: 83 additions & 1 deletion pythainlp/util/thai.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,18 @@
import string
from typing import Tuple

from pythainlp import thai_above_vowels, thai_tonemarks
from pythainlp import (
thai_lead_vowels,
thai_follow_vowels,
thai_above_vowels,
thai_below_vowels,
thai_consonants,
thai_vowels,
thai_tonemarks,
thai_signs,
thai_digits,
thai_punctuations,
)
from pythainlp.transliterate import pronunciate
from pythainlp.util.syllable import tone_detector

Expand Down Expand Up @@ -182,3 +193,74 @@ def thai_word_tone_detector(word: str) -> Tuple[str, str]:
"""
_pronunciate = pronunciate(word).split("-")
return [(i, tone_detector(i.replace("หฺ", "ห"))) for i in _pronunciate]


def count_thai_chars(text: str) -> dict:
    """
    Tally the characters of a text by Thai character class.

    Each character of ``text`` is tested against the character groups
    imported from :mod:`pythainlp`. The returned dict always contains the
    keys ``vowels``, ``lead_vowels``, ``follow_vowels``, ``above_vowels``,
    ``below_vowels``, ``consonants``, ``tonemarks``, ``signs``,
    ``thai_digits``, ``punctuations`` and ``non_thai``.

    The ``vowels`` total is tested independently of the other groups, so
    a single character can raise both ``vowels`` and one further counter.
    A character that matches none of the specific groups is counted as
    ``non_thai``.

    :param str text: Text
    :return: Dict mapping each character class to its number of occurrences
    :rtype: dict

    :Example:
    ::

        from pythainlp.util import count_thai_chars

        count_thai_chars("ทดสอบภาษาไทย")
        # output: {
        # 'vowels': 3,
        # 'lead_vowels': 1,
        # 'follow_vowels': 2,
        # 'above_vowels': 0,
        # 'below_vowels': 0,
        # 'consonants': 9,
        # 'tonemarks': 0,
        # 'signs': 0,
        # 'thai_digits': 0,
        # 'punctuations': 0,
        # 'non_thai': 0
        # }
    """

    def _specific_class(ch: str) -> str:
        # Return the key of the first specific group containing ``ch``.
        # The order of the checks decides which group wins if the groups
        # overlap, so it must stay as it is.
        if ch in thai_lead_vowels:
            return "lead_vowels"
        if ch in thai_follow_vowels:
            return "follow_vowels"
        if ch in thai_above_vowels:
            return "above_vowels"
        if ch in thai_below_vowels:
            return "below_vowels"
        if ch in thai_consonants:
            return "consonants"
        if ch in thai_tonemarks:
            return "tonemarks"
        if ch in thai_signs:
            return "signs"
        if ch in thai_digits:
            return "thai_digits"
        if ch in thai_punctuations:
            return "punctuations"
        return "non_thai"

    counts = dict.fromkeys(
        (
            "vowels",
            "lead_vowels",
            "follow_vowels",
            "above_vowels",
            "below_vowels",
            "consonants",
            "tonemarks",
            "signs",
            "thai_digits",
            "punctuations",
            "non_thai",
        ),
        0,
    )

    for ch in text:
        # The overall vowel total is kept separately from the specific
        # class of the character.
        # NOTE(review): a character in thai_vowels that belongs to none of
        # the positional vowel groups is still classified below and so is
        # also counted under a later group or "non_thai" - confirm that
        # this is intended.
        if ch in thai_vowels:
            counts["vowels"] += 1
        counts[_specific_class(ch)] += 1

    return counts
35 changes: 35 additions & 0 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
bahttext,
collate,
countthai,
count_thai_chars,
dict_trie,
display_thai_char,
digit_to_text,
Expand Down Expand Up @@ -595,6 +596,40 @@ def test_countthai(self):
self.assertEqual(countthai("(กกต.)", ".()"), 100.0)
self.assertEqual(countthai("(กกต.)", None), 50.0)

def test_count_thai_chars(self):
    """Check the per-class character counts returned by count_thai_chars."""
    # assertEqual is used instead of the deprecated alias assertEquals,
    # which was removed from unittest in Python 3.12.

    # Plain Thai word: only consonants and vowels are expected.
    self.assertEqual(
        count_thai_chars("ทดสอบภาษาไทย"),
        {
            'vowels': 3,
            'lead_vowels': 1,
            'follow_vowels': 2,
            'above_vowels': 0,
            'below_vowels': 0,
            'consonants': 9,
            'tonemarks': 0,
            'signs': 0,
            'thai_digits': 0,
            'punctuations': 0,
            'non_thai': 0,
        }
    )
    # Mixed sentence: also exercises tone marks, signs, a Thai digit,
    # Thai punctuation and spaces (counted as non_thai).
    self.assertEqual(
        count_thai_chars("มี ๕ บาทไหม๏ เกมส์หรือเกมกันแน่ที่กรุเทพฯ ใช้"),
        {
            'vowels': 12,
            'lead_vowels': 6,
            'follow_vowels': 1,
            'above_vowels': 4,
            'below_vowels': 1,
            'consonants': 22,
            'tonemarks': 3,
            'signs': 2,
            'thai_digits': 1,
            'punctuations': 1,
            'non_thai': 4,
        }
    )

def test_isthaichar(self):
self.assertEqual(isthaichar("ก"), True)
self.assertEqual(isthaichar("a"), False)
Expand Down