Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docker_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,5 @@ OSKut==1.3
nlpo3==1.2.2
thai-nner==0.3
spacy==2.3.*
wunsen==0.0.1
khanaa==0.0.6
1 change: 1 addition & 0 deletions docs/api/transliterate.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Modules
.. autofunction:: transliterate
.. autofunction:: pronunciate
.. autofunction:: puan
.. autoclass:: pythainlp.transliterate.wunsen.WunsenTransliterate
   :members:

Romanize Engines
----------------
Expand Down
81 changes: 81 additions & 0 deletions pythainlp/transliterate/wunsen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
"""
Transliterating Japanese/Korean/Vietnamese romanization text to Thai text
By Wunsen

:See Also:
* `GitHub \
<https://github.com/cakimpei/wunsen>`_
"""
from wunsen import ThapSap


class WunsenTransliterate:
    """
    Transliterate Japanese/Korean/Vietnamese romanized text to Thai text
    with Wunsen.

    The ``ThapSap`` converter built for the most recent language/input
    configuration is kept on the instance and reused by later calls that
    ask for the same configuration.

    :See Also:
        * `GitHub <https://github.com/cakimpei/wunsen>`_
    """

    def __init__(self) -> None:
        # Cached converter and the configuration it was built for.
        self.thap_value = None
        self.lang = None
        self.jp_input = None

    def transliterate(self, text: str, lang: str, jp_input: str = None):
        """
        Use Wunsen for transliteration.

        :param str text: text to be transliterated to Thai text
        :param str lang: source language
        :param str jp_input: Japanese input method (for Japanese only;
            ignored for other languages)

        :return: Thai text
        :rtype: str

        :raises NotImplementedError: if *lang* is not one of the
            supported languages

        :Options for lang:
            * *jp* - Japanese (from Hepburn romanization)
            * *ko* - Korean (from Revised Romanization)
            * *vi* - Vietnamese (Latin script)
        :Options for jp_input:
            * *Hepburn-no diacritic* - Hepburn-no diacritic (without macron)

        :Example:
        ::

            from pythainlp.transliterate.wunsen import WunsenTransliterate

            wt = WunsenTransliterate()

            wt.transliterate("ohayō", lang="jp")
            # output: 'โอฮาโย'

            wt.transliterate(
                "ohayou",
                lang="jp",
                jp_input="Hepburn-no diacritic"
            )
            # output: 'โอฮาโย'

            wt.transliterate("annyeonghaseyo", lang="ko")
            # output: 'อันนย็องฮาเซโย'

            wt.transliterate("xin chào", lang="vi")
            # output: 'ซีน จ่าว'
        """
        # jp_input only applies to Japanese. Normalise it away for other
        # languages so a stray value cannot invalidate the cached converter.
        input_method = jp_input if lang == "jp" else None

        needs_rebuild = (
            self.thap_value is None
            or self.lang != lang
            or self.jp_input != input_method
        )
        if needs_rebuild:
            if lang == "jp":
                if input_method is None:
                    converter = ThapSap("ja")
                else:
                    converter = ThapSap("ja", input=input_method)
            elif lang in ("ko", "vi"):
                converter = ThapSap(lang)
            else:
                raise NotImplementedError(
                    "The %s language is not implemented." % lang
                )
            # Record the configuration only after the converter was built,
            # so a failed call leaves the previous cache intact. Storing
            # ``lang`` is what makes the reuse check above effective.
            self.thap_value = converter
            self.lang = lang
            self.jp_input = input_method

        return self.thap_value.thap(text)
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
"torch>=1.0.0",
"transformers>=4.6.0",
],
"wunsen": ["wunsen>=0.0.1"],
"textaugment": [
"bpemb",
"gensim>=4.0.0"
Expand Down Expand Up @@ -107,7 +108,8 @@
"oskut>=1.3",
"nlpo3>=1.2.2",
"onnxruntime>=1.10.0",
"thai_nner"
"thai_nner",
"wunsen>=0.0.1"
],
}

Expand Down
26 changes: 26 additions & 0 deletions tests/test_transliterate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pythainlp.transliterate import romanize, transliterate, pronunciate, puan
from pythainlp.transliterate.ipa import trans_list, xsampa_list
from pythainlp.transliterate.thai2rom import ThaiTransliterator
from pythainlp.transliterate.wunsen import WunsenTransliterate
from pythainlp.corpus import remove

_BASIC_TESTS = {
Expand Down Expand Up @@ -156,6 +157,31 @@ def test_transliterate_iso11940(self):
"p̣hās̛̄āịthy"
)

def test_transliterate_wunsen(self):
    """Check Wunsen output per supported language, then the error path."""
    transliterator = WunsenTransliterate()

    # (input text, keyword options, expected Thai output); the cases run
    # in order on one shared instance.
    cases = (
        ("ohayō", {"lang": "jp"}, 'โอฮาโย'),
        (
            "ohayou",
            {"lang": "jp", "jp_input": "Hepburn-no diacritic"},
            'โอฮาโย',
        ),
        ("annyeonghaseyo", {"lang": "ko"}, 'อันนย็องฮาเซโย'),
        ("xin chào", {"lang": "vi"}, 'ซีน จ่าว'),
    )
    for source_text, options, expected in cases:
        actual = transliterator.transliterate(source_text, **options)
        self.assertEqual(actual, expected)

    # An unknown language code must be rejected.
    with self.assertRaises(NotImplementedError):
        transliterator.transliterate("xin chào", lang="vii")

def test_pronunciate(self):
self.assertEqual(pronunciate(""), "")
remove("thai_w2p")
Expand Down