diff --git a/docs/api/util.rst b/docs/api/util.rst
index c140ce8b3..ee4933054 100644
--- a/docs/api/util.rst
+++ b/docs/api/util.rst
@@ -17,6 +17,7 @@ Modules
 .. autofunction:: eng_to_thai
 .. autofunction:: find_keyword
 .. autofunction:: countthai
+.. autofunction:: count_thai_chars
 .. autofunction:: is_native_thai
 .. autofunction:: isthai
 .. autofunction:: isthaichar
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index b814edd68..b74fb31ca 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -9,6 +9,7 @@
     "bahttext",
     "collate",
     "countthai",
+    "count_thai_chars",
     "dict_trie",
     "digit_to_text",
     "display_thai_char",
@@ -83,6 +84,7 @@
 from pythainlp.util.strftime import thai_strftime
 from pythainlp.util.thai import (
     countthai,
+    count_thai_chars,
     display_thai_char,
     isthai,
     isthaichar,
diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py
index 2db338b70..ea770f0b3 100644
--- a/pythainlp/util/thai.py
+++ b/pythainlp/util/thai.py
@@ -5,7 +5,18 @@
 import string
 from typing import Tuple
 
-from pythainlp import thai_above_vowels, thai_tonemarks
+from pythainlp import (
+    thai_lead_vowels,
+    thai_follow_vowels,
+    thai_above_vowels,
+    thai_below_vowels,
+    thai_consonants,
+    thai_vowels,
+    thai_tonemarks,
+    thai_signs,
+    thai_digits,
+    thai_punctuations,
+)
 from pythainlp.transliterate import pronunciate
 from pythainlp.util.syllable import tone_detector
 
@@ -182,3 +193,77 @@ def thai_word_tone_detector(word: str) -> Tuple[str, str]:
     """
     _pronunciate = pronunciate(word).split("-")
     return [(i, tone_detector(i.replace("หฺ", "ห"))) for i in _pronunciate]
+
+
+def count_thai_chars(text: str) -> dict:
+    """
+    Count Thai characters by type
+
+    This function will give you numbers of Thai characters by type\
+    (consonants, vowels, lead_vowels, follow_vowels, above_vowels,\
+    below_vowels, tonemarks, signs, thai_digits, punctuations, non_thai)
+
+    :param str text: Text
+    :return: Dict with numbers of Thai characters by type
+    :rtype: dict
+
+    :Example:
+    ::
+
+        from pythainlp.util import count_thai_chars
+
+        count_thai_chars("ทดสอบภาษาไทย")
+        # output: {
+        # 'vowels': 3,
+        # 'lead_vowels': 1,
+        # 'follow_vowels': 2,
+        # 'above_vowels': 0,
+        # 'below_vowels': 0,
+        # 'consonants': 9,
+        # 'tonemarks': 0,
+        # 'signs': 0,
+        # 'thai_digits': 0,
+        # 'punctuations': 0,
+        # 'non_thai': 0
+        # }
+    """
+    _dict = {
+        "vowels": 0,
+        "lead_vowels": 0,
+        "follow_vowels": 0,
+        "above_vowels": 0,
+        "below_vowels": 0,
+        "consonants": 0,
+        "tonemarks": 0,
+        "signs": 0,
+        "thai_digits": 0,
+        "punctuations": 0,
+        "non_thai": 0,
+    }
+    for c in text:
+        # "vowels" is an overall tally kept alongside the positional
+        # vowel counters, so it is checked separately from the
+        # if/elif chain that assigns each character one category.
+        if c in thai_vowels:
+            _dict["vowels"] += 1
+        if c in thai_lead_vowels:
+            _dict["lead_vowels"] += 1
+        elif c in thai_follow_vowels:
+            _dict["follow_vowels"] += 1
+        elif c in thai_above_vowels:
+            _dict["above_vowels"] += 1
+        elif c in thai_below_vowels:
+            _dict["below_vowels"] += 1
+        elif c in thai_consonants:
+            _dict["consonants"] += 1
+        elif c in thai_tonemarks:
+            _dict["tonemarks"] += 1
+        elif c in thai_signs:
+            _dict["signs"] += 1
+        elif c in thai_digits:
+            _dict["thai_digits"] += 1
+        elif c in thai_punctuations:
+            _dict["punctuations"] += 1
+        else:
+            _dict["non_thai"] += 1
+    return _dict
diff --git a/tests/test_util.py b/tests/test_util.py
index 442fb12ad..342ba0cca 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -15,6 +15,7 @@
     bahttext,
     collate,
     countthai,
+    count_thai_chars,
     dict_trie,
     display_thai_char,
     digit_to_text,
@@ -595,6 +596,40 @@ def test_countthai(self):
         self.assertEqual(countthai("(กกต.)", ".()"), 100.0)
         self.assertEqual(countthai("(กกต.)", None), 50.0)
 
+    def test_count_thai_chars(self):
+        self.assertEqual(
+            count_thai_chars("ทดสอบภาษาไทย"),
+            {
+                'vowels': 3,
+                'lead_vowels': 1,
+                'follow_vowels': 2,
+                'above_vowels': 0,
+                'below_vowels': 0,
+                'consonants': 9,
+                'tonemarks': 0,
+                'signs': 0,
+                'thai_digits': 0,
+                'punctuations': 0,
+                'non_thai': 0,
+            }
+        )
+        self.assertEqual(
+            count_thai_chars("มี ๕ บาทไหม๏ เกมส์หรือเกมกันแน่ที่กรุเทพฯ ใช้"),
+            {
+                'vowels': 12,
+                'lead_vowels': 6,
+                'follow_vowels': 1,
+                'above_vowels': 4,
+                'below_vowels': 1,
+                'consonants': 22,
+                'tonemarks': 3,
+                'signs': 2,
+                'thai_digits': 1,
+                'punctuations': 1,
+                'non_thai': 4,
+            }
+        )
+
     def test_isthaichar(self):
         self.assertEqual(isthaichar("ก"), True)
         self.assertEqual(isthaichar("a"), False)