From ae9d6f95fbb17d2b5101e8f0394b46a917feff18 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 27 Oct 2022 21:55:58 +0700 Subject: [PATCH 1/3] Add pythainlp.util.count_thai_chars pythainlp.util.count_thai_chars is a function that returns the number of Thai characters by type. --- docs/api/util.rst | 1 + pythainlp/util/__init__.py | 2 + pythainlp/util/thai.py | 84 +++++++++++++++++++++++++++++++++++++- tests/test_util.py | 35 ++++++++++++++++ 4 files changed, 121 insertions(+), 1 deletion(-) diff --git a/docs/api/util.rst b/docs/api/util.rst index c140ce8b3..ee4933054 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -17,6 +17,7 @@ Modules .. autofunction:: eng_to_thai .. autofunction:: find_keyword .. autofunction:: countthai +.. autofunction:: count_thai_chars .. autofunction:: is_native_thai .. autofunction:: isthai .. autofunction:: isthaichar diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index b814edd68..b74fb31ca 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -9,6 +9,7 @@ "bahttext", "collate", "countthai", + "count_thai_chars", "dict_trie", "digit_to_text", "display_thai_char", @@ -83,6 +84,7 @@ from pythainlp.util.strftime import thai_strftime from pythainlp.util.thai import ( countthai, + count_thai_chars, display_thai_char, isthai, isthaichar, diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py index 2db338b70..aa56f360e 100644 --- a/pythainlp/util/thai.py +++ b/pythainlp/util/thai.py @@ -5,7 +5,18 @@ import string from typing import Tuple -from pythainlp import thai_above_vowels, thai_tonemarks +from pythainlp import ( + thai_lead_vowels, + thai_follow_vowels, + thai_above_vowels, + thai_below_vowels, + thai_consonants, + thai_vowels, + thai_tonemarks, + thai_signs, + thai_digits, + thai_punctuations, +) from pythainlp.transliterate import pronunciate from pythainlp.util.syllable import tone_detector @@ -182,3 +193,74 @@ def thai_word_tone_detector(word: str) -> 
Tuple[str, str]: """ _pronunciate = pronunciate(word).split("-") return [(i, tone_detector(i.replace("หฺ", "ห"))) for i in _pronunciate] + + +def count_thai_chars(text: str) -> dict: + """ + Count Thai characters by type + + This function will give you numbers of Thai characters by type\ + (consonants, vowels, lead_vowels, follow_vowels, above_vowels,\ + below_vowels, tonemarks, signs, thai_digits, punctuations, non_thai) + + :param str text: Text + :return: Dict with numbers of Thai characters by type + :rtype: dict + + :Example: + :: + + from pythainlp.util import count_thai_chars + + count_thai_chars("ทดสอบภาษาไทย") + # output: { + # 'vowels': 3, + # 'lead_vowels': 1, + # 'follow_vowels': 2, + # 'above_vowels': 0, + # 'below_vowels': 0, + # 'consonants': 9, + # 'tonemarks': 0, + # 'signs': 0, + # 'thai_digits': 0, + # 'punctuations': 0, + # 'non_thai': 0 + # } + """ + _dict = { + "vowels": 0, + "lead_vowels":0, + "follow_vowels":0, + "above_vowels":0, + "below_vowels":0, + "consonants":0, + "tonemarks":0, + "signs":0, + "thai_digits":0, + "punctuations":0, + "non_thai":0 + } + for c in text: + if c in thai_vowels: + _dict["vowels"]+=1 + if c in thai_lead_vowels: + _dict["lead_vowels"]+=1 + elif c in thai_follow_vowels: + _dict["follow_vowels"]+=1 + elif c in thai_above_vowels: + _dict["above_vowels"]+=1 + elif c in thai_below_vowels: + _dict["below_vowels"]+=1 + elif c in thai_consonants: + _dict["consonants"]+=1 + elif c in thai_tonemarks: + _dict["tonemarks"]+=1 + elif c in thai_signs: + _dict["signs"]+=1 + elif c in thai_digits: + _dict["thai_digits"]+=1 + elif c in thai_punctuations: + _dict["punctuations"]+=1 + else: + _dict["non_thai"]+=1 + return _dict diff --git a/tests/test_util.py b/tests/test_util.py index 442fb12ad..8e4b2f2a4 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -15,6 +15,7 @@ bahttext, collate, countthai, + count_thai_chars, dict_trie, display_thai_char, digit_to_text, @@ -595,6 +596,40 @@ def test_countthai(self): 
self.assertEqual(countthai("(กกต.)", ".()"), 100.0) self.assertEqual(countthai("(กกต.)", None), 50.0) + def test_count_thai_chars(self): + self.assertEquals( + count_thai_chars("ทดสอบภาษาไทย"), + { + 'vowels': 3, + 'lead_vowels': 1, + 'follow_vowels': 2, + 'above_vowels': 0, + 'below_vowels': 0, + 'consonants': 9, + 'tonemarks': 0, + 'signs': 0, + 'thai_digits': 0, + 'punctuations': 0, + 'non_thai': 0, + } + ) + self.assertEquals( + count_thai_chars("มี ๕ บาทไหม๏ เกมส์หรือเกมกันแน่ที่กรุเทพฯ ใช้"), + { + 'vowels': 13, + 'lead_vowels': 6, + 'follow_vowels': 1, + 'above_vowels': 5, + 'below_vowels': 1, + 'consonants': 22, + 'tonemarks': 3, + 'signs': 2, + 'thai_digits': 1, + 'punctuations': 1, + 'non_thai': 4, + } + ) + def test_isthaichar(self): self.assertEqual(isthaichar("ก"), True) self.assertEqual(isthaichar("a"), False) From c8b8f0f6e5e5e27c1a406afb37ab217736e8593c Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 27 Oct 2022 22:02:07 +0700 Subject: [PATCH 2/3] Fixed PEP8 --- pythainlp/util/thai.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py index aa56f360e..ea770f0b3 100644 --- a/pythainlp/util/thai.py +++ b/pythainlp/util/thai.py @@ -229,38 +229,38 @@ def count_thai_chars(text: str) -> dict: """ _dict = { "vowels": 0, - "lead_vowels":0, - "follow_vowels":0, - "above_vowels":0, - "below_vowels":0, - "consonants":0, - "tonemarks":0, - "signs":0, - "thai_digits":0, - "punctuations":0, - "non_thai":0 + "lead_vowels": 0, + "follow_vowels": 0, + "above_vowels": 0, + "below_vowels": 0, + "consonants": 0, + "tonemarks": 0, + "signs": 0, + "thai_digits": 0, + "punctuations": 0, + "non_thai": 0, } for c in text: if c in thai_vowels: - _dict["vowels"]+=1 + _dict["vowels"] += 1 if c in thai_lead_vowels: - _dict["lead_vowels"]+=1 + _dict["lead_vowels"] += 1 elif c in thai_follow_vowels: - _dict["follow_vowels"]+=1 + _dict["follow_vowels"] 
+= 1 elif c in thai_above_vowels: - _dict["above_vowels"]+=1 + _dict["above_vowels"] += 1 elif c in thai_below_vowels: - _dict["below_vowels"]+=1 + _dict["below_vowels"] += 1 elif c in thai_consonants: - _dict["consonants"]+=1 + _dict["consonants"] += 1 elif c in thai_tonemarks: - _dict["tonemarks"]+=1 + _dict["tonemarks"] += 1 elif c in thai_signs: - _dict["signs"]+=1 + _dict["signs"] += 1 elif c in thai_digits: - _dict["thai_digits"]+=1 + _dict["thai_digits"] += 1 elif c in thai_punctuations: - _dict["punctuations"]+=1 + _dict["punctuations"] += 1 else: - _dict["non_thai"]+=1 + _dict["non_thai"] += 1 return _dict From 360605b1bedb90049ac7a69f36738c39c23f7f98 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 27 Oct 2022 22:31:56 +0700 Subject: [PATCH 3/3] Update test_util.py --- tests/test_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_util.py b/tests/test_util.py index 8e4b2f2a4..342ba0cca 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -616,10 +616,10 @@ def test_count_thai_chars(self): self.assertEquals( count_thai_chars("มี ๕ บาทไหม๏ เกมส์หรือเกมกันแน่ที่กรุเทพฯ ใช้"), { - 'vowels': 13, + 'vowels': 12, 'lead_vowels': 6, 'follow_vowels': 1, - 'above_vowels': 5, + 'above_vowels': 4, 'below_vowels': 1, 'consonants': 22, 'tonemarks': 3,