class WunsenTransliterate:
    """
    Transliterate Japanese/Korean/Vietnamese romanized text to Thai text
    using the Wunsen library.

    The underlying Wunsen engine is built on first use and kept on the
    instance. It is rebuilt only when the requested language (or, for
    Japanese, the input method) differs from the previous call.
    """

    def __init__(self) -> None:
        # Cached Wunsen engine plus the settings it was built with.
        self.thap_value = None
        self.lang = None
        self.jp_input = None

    @staticmethod
    def _build_engine(lang, jp_input=None):
        """Create the Wunsen engine for *lang*; raise if unsupported."""
        if lang not in ("jp", "ko", "vi"):
            raise NotImplementedError(
                "The %s language is not implemented." % lang
            )
        # Imported lazily: wunsen is only required once an engine is
        # actually needed.
        from wunsen import ThapSap

        if lang == "jp":
            # Wunsen names Japanese "ja"; this API exposes it as "jp".
            if jp_input is None:
                return ThapSap("ja")
            return ThapSap("ja", input=jp_input)
        return ThapSap(lang)

    def transliterate(
        self, text: str, lang: str, jp_input: str = None
    ) -> str:
        """
        Use Wunsen for transliteration

        :param str text: text to be transliterated to Thai text.
        :param str lang: source language
        :param str jp_input: Japanese input method (for Japanese only;
            ignored for other languages)

        :return: Thai text
        :rtype: str
        :raises NotImplementedError: if *lang* is not a supported language

        :Options for lang:
            * *jp* - Japanese (from Hepburn romanization)
            * *ko* - Korean (from Revised Romanization)
            * *vi* - Vietnamese (Latin script)
        :Options for jp_input:
            * *Hepburn-no diacritic* - Hepburn-no diacritic (without macron)

        :Example:
        ::

            from pythainlp.transliterate.wunsen import WunsenTransliterate

            wt = WunsenTransliterate()

            wt.transliterate("ohayō", lang="jp")
            # output: 'โอฮาโย'

            wt.transliterate(
                "ohayou",
                lang="jp",
                jp_input="Hepburn-no diacritic"
            )
            # output: 'โอฮาโย'

            wt.transliterate("annyeonghaseyo", lang="ko")
            # output: 'อันนย็องฮาเซโย'

            wt.transliterate("xin chào", lang="vi")
            # output: 'ซีน จ่าว'
        """
        # jp_input only affects Japanese; normalise it so that passing it
        # alongside another language does not defeat the cache.
        effective_input = jp_input if lang == "jp" else None
        if (
            self.thap_value is None
            or self.lang != lang
            or self.jp_input != effective_input
        ):
            # Build first: if the language is unsupported this raises and
            # the previously cached engine/settings stay untouched.
            self.thap_value = self._build_engine(lang, effective_input)
            # Record what the engine was built for so the next call with
            # the same settings reuses it instead of rebuilding.
            self.lang = lang
            self.jp_input = effective_input
        return self.thap_value.thap(text)