From 2ce76ad4855bbc7fc8ac33e6847297dd78347bd5 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Wed, 7 May 2025 12:02:18 +0700 Subject: [PATCH 1/3] Update romanize docs and keep space --- pythainlp/transliterate/core.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/pythainlp/transliterate/core.py b/pythainlp/transliterate/core.py index d5b656ff2..ac725452b 100644 --- a/pythainlp/transliterate/core.py +++ b/pythainlp/transliterate/core.py @@ -9,22 +9,24 @@ def romanize( - text: str, + word: str, engine: str = DEFAULT_ROMANIZE_ENGINE, fallback_engine: str = DEFAULT_ROMANIZE_ENGINE, ) -> str: """ - This function renders Thai words in the Latin alphabet or "romanization", + This function renders a Thai word in the Latin alphabet or "romanization", using the Royal Thai General System of Transcription (RTGS) [#rtgs_transcription]_. RTGS is the official system published by the Royal Institute of Thailand. (Thai: ถอดเสียงภาษาไทยเป็นอักษรละติน) - :param str text: Thai text to be romanized + :param str word: A Thai word to be romanized. \ + The input should not include whitespace because \ + the function supports subwords by splitting on whitespace. :param str engine: One of 'royin' (default), 'thai2rom', 'thai2rom_onnx, 'tltk', and 'lookup'. See more in options for engine section. :param str fallback_engine: If engine equals 'lookup', use `fallback_engine` for words that are not in the transliteration dict. No effect on other engines. Default to 'royin'. - :return: A string of Thai words rendered in the Latin alphabet. + :return: A string of a Thai word rendered in the Latin alphabet. 
() :rtype: str :Options for engines: @@ -53,6 +55,9 @@ def romanize( romanize("ภาพยนตร์", engine="royin") # output: 'phapn' + romanize("รส ดี", engine="royin") # subwords + # output: 'rot di' + romanize("ภาพยนตร์", engine="thai2rom") # output: 'phapphayon' @@ -76,20 +81,20 @@ def select_romanize_engine(engine: str): return romanize - if not text or not isinstance(text, str): + if not word or not isinstance(word, str): return "" if engine == "lookup": from pythainlp.transliterate.lookup import romanize fallback = select_romanize_engine(fallback_engine) - return romanize(text, fallback_func=fallback) + return romanize(word, fallback_func=fallback) else: rom_engine = select_romanize_engine(engine) trans_word = [] - for word in text.split(' '): - trans_word.append(rom_engine(word)) - new_word = ''.join(trans_word) + for subword in word.split(' '): + trans_word.append(rom_engine(subword)) + new_word = ' '.join(trans_word) return new_word From 415d00e3bbf7cd4f94f688c2ce0dade13e69351b Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Wed, 7 May 2025 12:04:15 +0700 Subject: [PATCH 2/3] Update romanize docs --- pythainlp/transliterate/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/transliterate/core.py b/pythainlp/transliterate/core.py index ac725452b..992d8b2db 100644 --- a/pythainlp/transliterate/core.py +++ b/pythainlp/transliterate/core.py @@ -26,7 +26,7 @@ def romanize( :param str fallback_engine: If engine equals 'lookup', use `fallback_engine` for words that are not in the transliteration dict. No effect on other engines. Default to 'royin'. - :return: A string of a Thai word rendered in the Latin alphabet. () + :return: A string of a Thai word rendered in the Latin alphabet. 
:rtype: str :Options for engines: From 4cb0c3231c780d5feaf7d1af5bbac7318101be18 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Thu, 8 May 2025 13:01:19 +0700 Subject: [PATCH 3/3] Fixed romanize function --- pythainlp/transliterate/core.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pythainlp/transliterate/core.py b/pythainlp/transliterate/core.py index 992d8b2db..315343661 100644 --- a/pythainlp/transliterate/core.py +++ b/pythainlp/transliterate/core.py @@ -9,7 +9,7 @@ def romanize( - word: str, + text: str, engine: str = DEFAULT_ROMANIZE_ENGINE, fallback_engine: str = DEFAULT_ROMANIZE_ENGINE, ) -> str: @@ -19,7 +19,7 @@ def romanize( [#rtgs_transcription]_. RTGS is the official system published by the Royal Institute of Thailand. (Thai: ถอดเสียงภาษาไทยเป็นอักษรละติน) - :param str word: A Thai word to be romanized. \ + :param str text: A Thai word to be romanized. \ The input should not include whitespace because \ the function supports subwords by splitting on whitespace. :param str engine: One of 'royin' (default), 'thai2rom', 'thai2rom_onnx, 'tltk', and 'lookup'. See more in options for engine section. @@ -81,18 +81,18 @@ def select_romanize_engine(engine: str): return romanize - if not word or not isinstance(word, str): + if not text or not isinstance(text, str): return "" if engine == "lookup": from pythainlp.transliterate.lookup import romanize fallback = select_romanize_engine(fallback_engine) - return romanize(word, fallback_func=fallback) + return romanize(text, fallback_func=fallback) else: rom_engine = select_romanize_engine(engine) trans_word = [] - for subword in word.split(' '): + for subword in text.split(' '): trans_word.append(rom_engine(subword)) new_word = ' '.join(trans_word) return new_word