From 0a3c0aa58d6247e7d4372d6fde8af3cb16401975 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 19 Oct 2023 13:57:24 +0100 Subject: [PATCH 1/3] Remove duplicate key --- pythainlp/transliterate/iso_11940.py | 13 ++++++++++--- setup.cfg | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pythainlp/transliterate/iso_11940.py b/pythainlp/transliterate/iso_11940.py index 2fbb135ce..3d1466445 100644 --- a/pythainlp/transliterate/iso_11940.py +++ b/pythainlp/transliterate/iso_11940.py @@ -107,10 +107,17 @@ _punctuation_and_digits = { "ๆ": "«", - "ฯ": "ǂ", + "ฯ": "ǂ", # paiyan noi: U+01C2 ǂ Alveolar Click; ICU uses ‡ (double dagger) "๏": "§", - "ฯ": "ǀ", - "๚": "ǁ", +# ฯ can has two meanings in ISO 11940. +# If it is for abbrevation, it is paiyan noi. +# If it is for sentence termination, it is angkhan diao. +# Without semantic analysis, they cannot be distinguished from each other. +# In this simple implementation, we decided to always treat ฯ as paiyan noi. +# We commented out angkhan diao line below to remove it from the dictionary +# to avoid duplice keys. 
+# "ฯ": "ǀ", # angkhan diao: U+01C0 ǀ Dental Click; ICU uses | (vertical bar) + "๚": "ǁ", # angkhan khu: U+01C1 ǁ Lateral Click; ICU uses || (two vertical bars) "๛": "»", "๐": "0", "๑": "1", diff --git a/setup.cfg b/setup.cfg index 2bf56e01a..c203b8a91 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,7 +26,7 @@ search = __version__ = "{current_version}" replace = __version__ = "{new_version}" [metadata] -description-file = README.md +description_file = README.md [coverage:run] source = pythainlp From 1bf6eb3c343f0e51efb8024607fe5294a4b650f0 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 19 Oct 2023 14:12:48 +0100 Subject: [PATCH 2/3] Revise docstring and variable names Co-authored-by: BLKSerene --- pythainlp/transliterate/iso_11940.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pythainlp/transliterate/iso_11940.py b/pythainlp/transliterate/iso_11940.py index 3d1466445..3486f74a7 100644 --- a/pythainlp/transliterate/iso_11940.py +++ b/pythainlp/transliterate/iso_11940.py @@ -137,19 +137,19 @@ **_tone_marks, **_punctuation_and_digits, } -_list_k = _all_dict.keys() +_keys_set = _all_dict.keys() def transliterate(word: str) -> str: """ Use ISO 11940 for transliteration :param str text: Thai text to be transliterated. - :return: A string of IPA indicating how the text should be pronounced. + :return: A string indicating how the text should be pronounced, according to ISO 11940. 
""" - _new = "" + _str = "" for i in word: - if i in _list_k: - _new += _all_dict[i] + if i in _keys_set: + _str += _all_dict[i] else: - _new += i - return _new + _str += i + return _str From dbd2e6656d222a62b3e28b94033ad9ff726532ec Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 19 Oct 2023 14:32:39 +0100 Subject: [PATCH 3/3] Fix code indent Co-authored-by: BLKSerene --- pythainlp/transliterate/iso_11940.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pythainlp/transliterate/iso_11940.py b/pythainlp/transliterate/iso_11940.py index 4cd0adae3..82603225e 100644 --- a/pythainlp/transliterate/iso_11940.py +++ b/pythainlp/transliterate/iso_11940.py @@ -106,17 +106,17 @@ } _punctuation_and_digits = { + # ฯ can have two meanings in ISO 11940. + # If it is for abbreviation, it is paiyan noi. + # If it is for sentence termination, it is angkhan diao. + # Without semantic analysis, they cannot be distinguished from each other. + # In this simple implementation, we decided to always treat ฯ as paiyan noi. + # We commented out the angkhan diao line to remove it from the dictionary + # and avoid having duplicate keys. "ๆ": "«", "ฯ": "ǂ", # paiyan noi: U+01C2 ǂ Alveolar Click; ICU uses ‡ (double dagger) "๏": "§", -# ฯ can has two meanings in ISO 11940. -# If it is for abbrevation, it is paiyan noi. -# If it is for sentence termination, it is angkhan diao. -# Without semantic analysis, they cannot be distinguished from each other. -# In this simple implementation, we decided to always treat ฯ as paiyan noi. -# We commented out angkhan diao line below to remove it from the dictionary -# to avoid duplice keys. -# "ฯ": "ǀ", # angkhan diao: U+01C0 ǀ Dental Click; ICU uses | (vertical bar) + # "ฯ": "ǀ", # angkhan diao: U+01C0 ǀ Dental Click; ICU uses | (vertical bar) "๚": "ǁ", # angkhan khu: U+01C1 ǁ Lateral Click; ICU uses || (two vertical bars) "๛": "»", "๐": "0",