From 4d6aa9b8df17796081c9a68e175adfc63e44aa92 Mon Sep 17 00:00:00 2001 From: cakimpei <64963958+cakimpei@users.noreply.github.com> Date: Thu, 1 Sep 2022 20:57:47 +0700 Subject: [PATCH 1/2] Add mandarin, new japanese system to wunsen --- docker_requirements.txt | 2 +- pythainlp/transliterate/wunsen.py | 80 +++++++++++++++++++++++++++---- setup.py | 2 +- 3 files changed, 73 insertions(+), 11 deletions(-) diff --git a/docker_requirements.txt b/docker_requirements.txt index b6533a975..56cad2d65 100644 --- a/docker_requirements.txt +++ b/docker_requirements.txt @@ -29,5 +29,5 @@ OSKut==1.3 nlpo3==1.2.2 thai-nner==0.3 spacy==2.3.* -wunsen==0.0.1 +wunsen==0.0.3 khanaa==0.0.6 diff --git a/pythainlp/transliterate/wunsen.py b/pythainlp/transliterate/wunsen.py index 9c08a0452..28aa0770c 100644 --- a/pythainlp/transliterate/wunsen.py +++ b/pythainlp/transliterate/wunsen.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """ -Transliterating Japanese/Korean/Vietnamese romanization text to Thai text +Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text +to Thai text By Wunsen :See Also: @@ -12,25 +13,40 @@ class WunsenTransliterate: """ - Transliterating Japanese/Korean/Vietnamese romanization text to Thai text + Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text + to Thai text by Wunsen :See Also: * `GitHub \ `_ """ + def __init__(self) -> None: self.thap_value = None self.lang = None self.jp_input = None + self.zh_sandhi = None + self.system = None - def transliterate(self, text: str, lang: str, jp_input: str = None): + def transliterate( + self, + text: str, + lang: str, + jp_input: str = None, + zh_sandhi: bool = None, + system: str = None, + ): """ Use Wunsen for transliteration :param str text: text wants transliterated to Thai text. :param str lang: source language :param str jp_input: japanese input method (for japanese only) + :param bool zh_sandhi: mandarin third tone sandhi option + (for mandarin only) + :param str system: transliteration system (for japanese and + mandarin only) :return: Thai text :rtype: str @@ -39,8 +55,22 @@ def transliterate(self, text: str, lang: str, jp_input: str = None): * *jp* - Japanese (from Hepburn romanization) * *ko* - Korean (from Revised Romanization) * *vi* - Vietnamese (Latin script) + * *zh* - Mandarin (from Hanyu Pinyin) :Options for jp_input: * *Hepburn-no diacritic* - Hepburn-no diacritic (without macron) + :Options for zh_sandhi: + * *True* - apply third tone sandhi rule + * *False* - do not apply third tone sandhi rule + :Options for system: + * *ORS61* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น + (สำนักงานราชบัณฑิตยสภา พ.ศ. 2561) + * *RI35* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น + (ราชบัณฑิตยสถาน พ.ศ. 2535) + * *RI49* - for Mandarin หลักเกณฑ์การทับศัพท์ภาษาจีน + (ราชบัณฑิตยสถาน พ.ศ. 2549) + * *THC43* - for Mandarin เกณฑ์การถ่ายทอดเสียงภาษาจีนแมนดาริน + ด้วยอักขรวิธีไทย (คณะกรรมการสืบค้นประวัติศาสตร์ไทยในเอกสาร + ภาษาจีน พ.ศ. 2543) :Example: :: @@ -58,24 +88,56 @@ def transliterate(self, text: str, lang: str, jp_input: str = None): ) # output: 'โอฮาโย' + wt.transliterate("ohayō", lang="jp", system="RI35") + # output: 'โอะฮะโย' + wt.transliterate("annyeonghaseyo", lang="ko") # output: 'อันนย็องฮาเซโย' wt.transliterate("xin chào", lang="vi") # output: 'ซีน จ่าว' + + wt.transliterate("ni3 hao3", lang="zh") + # output: 'หนี เห่า' + + wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False) + # output: 'หนี่ เห่า' + + wt.transliterate("ni3 hao3", lang="zh", system="RI49") + # output: 'หนี ห่าว' """ - if self.lang != lang or self.jp_input != jp_input: + if ( + self.lang != lang + or self.jp_input != jp_input + or self.zh_sandhi != zh_sandhi + or self.system != system + ): if lang == "jp": - if jp_input is None: - self.thap_value = ThapSap("ja") - else: - self.thap_value = ThapSap("ja", input=jp_input) self.jp_input = jp_input + self.zh_sandhi = None + self.system = system + elif lang == "zh": + self.jp_input = None + self.zh_sandhi = zh_sandhi + self.system = system elif lang == "ko" or lang == "vi": self.jp_input = None - self.thap_value = ThapSap(lang) + self.zh_sandhi = None + self.system = None else: raise NotImplementedError( "The %s language is not implemented." % lang ) + self.lang = lang + input_lang = lang + if input_lang == "jp": + input_lang = "ja" + setting = {} + if self.jp_input is not None: + setting.update({"input": self.jp_input}) + if self.zh_sandhi is not None: + setting.update({"option": {"sandhi": self.zh_sandhi}}) + if self.system is not None: + setting.update({"system": self.system}) + self.thap_value = ThapSap(input_lang, **setting) return self.thap_value.thap(text) diff --git a/setup.py b/setup.py index 78509b9c2..39a616697 100644 --- a/setup.py +++ b/setup.py @@ -108,7 +108,7 @@ "nlpo3>=1.2.2", "onnxruntime>=1.10.0", "thai_nner", - "wunsen>=0.0.1" + "wunsen>=0.0.3" ], } From 4e2b48dbaba367630c339d616077b837321d9fae Mon Sep 17 00:00:00 2001 From: cakimpei <64963958+cakimpei@users.noreply.github.com> Date: Thu, 1 Sep 2022 21:20:29 +0700 Subject: [PATCH 2/2] add tests for wunsen update (zh, ja) --- tests/test_transliterate.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_transliterate.py b/tests/test_transliterate.py index 0ac93bc66..d2d30e0c2 100644 --- a/tests/test_transliterate.py +++ b/tests/test_transliterate.py @@ -171,6 +171,10 @@ def test_transliterate_wunsen(self): ), 'โอฮาโย' ) + self.assertEqual( + wt.transliterate("ohayō", lang="jp", system="RI35"), + 'โอะฮะโย' + ) self.assertEqual( wt.transliterate("annyeonghaseyo", lang="ko"), 'อันนย็องฮาเซโย' @@ -179,6 +183,18 @@ def test_transliterate_wunsen(self): wt.transliterate("xin chào", lang="vi"), 'ซีน จ่าว' ) + self.assertEqual( + wt.transliterate("ni3 hao3", lang="zh"), + 'หนี เห่า' + ) + self.assertEqual( + wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False), + 'หนี่ เห่า' + ) + self.assertEqual( + wt.transliterate("ni3 hao3", lang="zh", system="RI49"), + 'หนี ห่าว' + ) with self.assertRaises(NotImplementedError): wt.transliterate("xin chào", lang="vii")