diff --git a/docker_requirements.txt b/docker_requirements.txt index b6533a975..56cad2d65 100644 --- a/docker_requirements.txt +++ b/docker_requirements.txt @@ -29,5 +29,5 @@ OSKut==1.3 nlpo3==1.2.2 thai-nner==0.3 spacy==2.3.* -wunsen==0.0.1 +wunsen==0.0.3 khanaa==0.0.6 diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index 0c6251534..4034d3910 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -4,7 +4,7 @@ # Copyright (C) 2016-2022 PyThaiNLP Project # URL: # For license information, see LICENSE -__version__ = "3.1.0-dev0" +__version__ = "3.1.0-dev1" thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars diff --git a/pythainlp/transliterate/wunsen.py b/pythainlp/transliterate/wunsen.py index 9c08a0452..28aa0770c 100644 --- a/pythainlp/transliterate/wunsen.py +++ b/pythainlp/transliterate/wunsen.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """ -Transliterating Japanese/Korean/Vietnamese romanization text to Thai text +Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text +to Thai text By Wunsen :See Also: @@ -12,25 +13,40 @@ class WunsenTransliterate: """ - Transliterating Japanese/Korean/Vietnamese romanization text to Thai text + Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text + to Thai text by Wunsen :See Also: * `GitHub \ `_ """ + def __init__(self) -> None: self.thap_value = None self.lang = None self.jp_input = None + self.zh_sandhi = None + self.system = None - def transliterate(self, text: str, lang: str, jp_input: str = None): + def transliterate( + self, + text: str, + lang: str, + jp_input: str = None, + zh_sandhi: bool = None, + system: str = None, + ): """ Use Wunsen for transliteration :param str text: text wants transliterated to Thai text. 
:param str lang: source language :param str jp_input: japanese input method (for japanese only) + :param bool zh_sandhi: Mandarin third tone sandhi option + (for Mandarin only) + :param str system: transliteration system (for Japanese and + Mandarin only) :return: Thai text :rtype: str @@ -39,8 +55,22 @@ def transliterate(self, text: str, lang: str, jp_input: str = None): * *jp* - Japanese (from Hepburn romanization) * *ko* - Korean (from Revised Romanization) * *vi* - Vietnamese (Latin script) + * *zh* - Mandarin (from Hanyu Pinyin) :Options for jp_input: * *Hepburn-no diacritic* - Hepburn-no diacritic (without macron) + :Options for zh_sandhi: + * *True* - apply third tone sandhi rule + * *False* - do not apply third tone sandhi rule + :Options for system: + * *ORS61* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น + (สำนักงานราชบัณฑิตยสภา พ.ศ. 2561) + * *RI35* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น + (ราชบัณฑิตยสถาน พ.ศ. 2535) + * *RI49* - for Mandarin หลักเกณฑ์การทับศัพท์ภาษาจีน + (ราชบัณฑิตยสถาน พ.ศ. 2549) + * *THC43* - for Mandarin เกณฑ์การถ่ายทอดเสียงภาษาจีนแมนดาริน + ด้วยอักขรวิธีไทย (คณะกรรมการสืบค้นประวัติศาสตร์ไทยในเอกสาร + ภาษาจีน พ.ศ. 
2543) :Example: :: @@ -58,24 +88,56 @@ def transliterate(self, text: str, lang: str, jp_input: str = None): ) # output: 'โอฮาโย' + wt.transliterate("ohayō", lang="jp", system="RI35") + # output: 'โอะฮะโย' + wt.transliterate("annyeonghaseyo", lang="ko") # output: 'อันนย็องฮาเซโย' wt.transliterate("xin chào", lang="vi") # output: 'ซีน จ่าว' + + wt.transliterate("ni3 hao3", lang="zh") + # output: 'หนี เห่า' + + wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False) + # output: 'หนี่ เห่า' + + wt.transliterate("ni3 hao3", lang="zh", system="RI49") + # output: 'หนี ห่าว' """ - if self.lang != lang or self.jp_input != jp_input: + if ( + self.lang != lang + or self.jp_input != jp_input + or self.zh_sandhi != zh_sandhi + or self.system != system + ): if lang == "jp": - if jp_input is None: - self.thap_value = ThapSap("ja") - else: - self.thap_value = ThapSap("ja", input=jp_input) self.jp_input = jp_input + self.zh_sandhi = None + self.system = system + elif lang == "zh": + self.jp_input = None + self.zh_sandhi = zh_sandhi + self.system = system elif lang == "ko" or lang == "vi": self.jp_input = None - self.thap_value = ThapSap(lang) + self.zh_sandhi = None + self.system = None else: raise NotImplementedError( "The %s language is not implemented." % lang ) + self.lang = lang + input_lang = lang + if input_lang == "jp": + input_lang = "ja" + setting = {} + if self.jp_input is not None: + setting.update({"input": self.jp_input}) + if self.zh_sandhi is not None: + setting.update({"option": {"sandhi": self.zh_sandhi}}) + if self.system is not None: + setting.update({"system": self.system}) + self.thap_value = ThapSap(input_lang, **setting) return self.thap_value.thap(text) diff --git a/setup.cfg b/setup.cfg index bea023df1..14aa60ca5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.8 +current_version = 3.1.0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index 78509b9c2..1bcb1d4c7 100644 --- a/setup.py +++ b/setup.py @@ -108,13 +108,13 @@ "nlpo3>=1.2.2", "onnxruntime>=1.10.0", "thai_nner", - "wunsen>=0.0.1" + "wunsen>=0.0.3" ], } setup( name="pythainlp", - version="3.1.0-dev0", + version="3.1.0-dev1", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", diff --git a/tests/test_transliterate.py b/tests/test_transliterate.py index 0ac93bc66..d2d30e0c2 100644 --- a/tests/test_transliterate.py +++ b/tests/test_transliterate.py @@ -171,6 +171,10 @@ def test_transliterate_wunsen(self): ), 'โอฮาโย' ) + self.assertEqual( + wt.transliterate("ohayō", lang="jp", system="RI35"), + 'โอะฮะโย' + ) self.assertEqual( wt.transliterate("annyeonghaseyo", lang="ko"), 'อันนย็องฮาเซโย' @@ -179,6 +183,18 @@ def test_transliterate_wunsen(self): wt.transliterate("xin chào", lang="vi"), 'ซีน จ่าว' ) + self.assertEqual( + wt.transliterate("ni3 hao3", lang="zh"), + 'หนี เห่า' + ) + self.assertEqual( + wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False), + 'หนี่ เห่า' + ) + self.assertEqual( + wt.transliterate("ni3 hao3", lang="zh", system="RI49"), + 'หนี ห่าว' + ) with self.assertRaises(NotImplementedError): wt.transliterate("xin chào", lang="vii")