Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@ OSKut==1.3
nlpo3==1.2.2
thai-nner==0.3
spacy==2.3.*
wunsen==0.0.1
wunsen==0.0.3
khanaa==0.0.6
2 changes: 1 addition & 1 deletion pythainlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Copyright (C) 2016-2022 PyThaiNLP Project
# URL: <https://pythainlp.github.io/>
# For license information, see LICENSE
__version__ = "3.1.0-dev0"
__version__ = "3.1.0-dev1"

thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars

Expand Down
80 changes: 71 additions & 9 deletions pythainlp/transliterate/wunsen.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
"""
Transliterating Japanese/Korean/Vietnamese romanization text to Thai text
Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text
to Thai text
By Wunsen

:See Also:
Expand All @@ -12,25 +13,40 @@

class WunsenTransliterate:
"""
Transliterating Japanese/Korean/Vietnamese romanization text to Thai text
Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text
to Thai text
by Wunsen

:See Also:
* `GitHub \
<https://github.com/cakimpei/wunsen>`_
"""

def __init__(self) -> None:
self.thap_value = None
self.lang = None
self.jp_input = None
self.zh_sandhi = None
self.system = None

def transliterate(self, text: str, lang: str, jp_input: str = None):
def transliterate(
self,
text: str,
lang: str,
jp_input: str = None,
zh_sandhi: bool = None,
system: str = None,
):
"""
Use Wunsen for transliteration

:param str text: text to be transliterated to Thai text.
:param str lang: source language
:param str jp_input: Japanese input method (for Japanese only)
:param bool zh_sandhi: Mandarin third tone sandhi option
(for Mandarin only)
:param str system: transliteration system (for Japanese and
Mandarin only)

:return: Thai text
:rtype: str
Expand All @@ -39,8 +55,22 @@ def transliterate(self, text: str, lang: str, jp_input: str = None):
* *jp* - Japanese (from Hepburn romanization)
* *ko* - Korean (from Revised Romanization)
* *vi* - Vietnamese (Latin script)
* *zh* - Mandarin (from Hanyu Pinyin)
:Options for jp_input:
* *Hepburn-no diacritic* - Hepburn-no diacritic (without macron)
:Options for zh_sandhi:
* *True* - apply third tone sandhi rule
* *False* - do not apply third tone sandhi rule
:Options for system:
* *ORS61* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น
(สำนักงานราชบัณฑิตยสภา พ.ศ. 2561)
* *RI35* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น
(ราชบัณฑิตยสถาน พ.ศ. 2535)
* *RI49* - for Mandarin หลักเกณฑ์การทับศัพท์ภาษาจีน
(ราชบัณฑิตยสถาน พ.ศ. 2549)
* *THC43* - for Mandarin เกณฑ์การถ่ายทอดเสียงภาษาจีนแมนดาริน
ด้วยอักขรวิธีไทย (คณะกรรมการสืบค้นประวัติศาสตร์ไทยในเอกสาร
ภาษาจีน พ.ศ. 2543)

:Example:
::
Expand All @@ -58,24 +88,56 @@ def transliterate(self, text: str, lang: str, jp_input: str = None):
)
# output: 'โอฮาโย'

wt.transliterate("ohayō", lang="jp", system="RI35")
# output: 'โอะฮะโย'

wt.transliterate("annyeonghaseyo", lang="ko")
# output: 'อันนย็องฮาเซโย'

wt.transliterate("xin chào", lang="vi")
# output: 'ซีน จ่าว'

wt.transliterate("ni3 hao3", lang="zh")
# output: 'หนี เห่า'

wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False)
# output: 'หนี่ เห่า'

wt.transliterate("ni3 hao3", lang="zh", system="RI49")
# output: 'หนี ห่าว'
"""
if self.lang != lang or self.jp_input != jp_input:
if (
self.lang != lang
or self.jp_input != jp_input
or self.zh_sandhi != zh_sandhi
or self.system != system
):
if lang == "jp":
if jp_input is None:
self.thap_value = ThapSap("ja")
else:
self.thap_value = ThapSap("ja", input=jp_input)
self.jp_input = jp_input
self.zh_sandhi = None
self.system = system
elif lang == "zh":
self.jp_input = None
self.zh_sandhi = zh_sandhi
self.system = system
elif lang == "ko" or lang == "vi":
self.jp_input = None
self.thap_value = ThapSap(lang)
self.zh_sandhi = None
self.system = None
else:
raise NotImplementedError(
"The %s language is not implemented." % lang
)
self.lang = lang
input_lang = lang
if input_lang == "jp":
input_lang = "ja"
setting = {}
if self.jp_input is not None:
setting.update({"input": self.jp_input})
if self.zh_sandhi is not None:
setting.update({"option": {"sandhi": self.zh_sandhi}})
if self.system is not None:
setting.update({"system": self.system})
self.thap_value = ThapSap(input_lang, **setting)
return self.thap_value.thap(text)
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 3.0.8
current_version = 3.1.0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,13 @@
"nlpo3>=1.2.2",
"onnxruntime>=1.10.0",
"thai_nner",
"wunsen>=0.0.1"
"wunsen>=0.0.3"
],
}

setup(
name="pythainlp",
version="3.1.0-dev0",
version="3.1.0-dev1",
description="Thai Natural Language Processing library",
long_description=readme,
long_description_content_type="text/markdown",
Expand Down
16 changes: 16 additions & 0 deletions tests/test_transliterate.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ def test_transliterate_wunsen(self):
),
'โอฮาโย'
)
self.assertEqual(
wt.transliterate("ohayō", lang="jp", system="RI35"),
'โอะฮะโย'
)
self.assertEqual(
wt.transliterate("annyeonghaseyo", lang="ko"),
'อันนย็องฮาเซโย'
Expand All @@ -179,6 +183,18 @@ def test_transliterate_wunsen(self):
wt.transliterate("xin chào", lang="vi"),
'ซีน จ่าว'
)
self.assertEqual(
wt.transliterate("ni3 hao3", lang="zh"),
'หนี เห่า'
)
self.assertEqual(
wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False),
'หนี่ เห่า'
)
self.assertEqual(
wt.transliterate("ni3 hao3", lang="zh", system="RI49"),
'หนี ห่าว'
)
with self.assertRaises(NotImplementedError):
wt.transliterate("xin chào", lang="vii")

Expand Down