diff --git a/docker_requirements.txt b/docker_requirements.txt
index b6533a975..56cad2d65 100644
--- a/docker_requirements.txt
+++ b/docker_requirements.txt
@@ -29,5 +29,5 @@ OSKut==1.3
nlpo3==1.2.2
thai-nner==0.3
spacy==2.3.*
-wunsen==0.0.1
+wunsen==0.0.3
khanaa==0.0.6
diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py
index 0c6251534..4034d3910 100644
--- a/pythainlp/__init__.py
+++ b/pythainlp/__init__.py
@@ -4,7 +4,7 @@
# Copyright (C) 2016-2022 PyThaiNLP Project
# URL: <https://pythainlp.github.io/>
# For license information, see LICENSE
-__version__ = "3.1.0-dev0"
+__version__ = "3.1.0-dev1"
thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars
diff --git a/pythainlp/transliterate/wunsen.py b/pythainlp/transliterate/wunsen.py
index 9c08a0452..28aa0770c 100644
--- a/pythainlp/transliterate/wunsen.py
+++ b/pythainlp/transliterate/wunsen.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
"""
-Transliterating Japanese/Korean/Vietnamese romanization text to Thai text
+Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text
+to Thai text
By Wunsen
:See Also:
@@ -12,25 +13,40 @@
class WunsenTransliterate:
"""
- Transliterating Japanese/Korean/Vietnamese romanization text to Thai text
+ Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text
+ to Thai text
by Wunsen
:See Also:
* `GitHub \
<https://github.com/cakimpei/wunsen>`_
"""
+
def __init__(self) -> None:
self.thap_value = None
self.lang = None
self.jp_input = None
+ self.zh_sandhi = None
+ self.system = None
- def transliterate(self, text: str, lang: str, jp_input: str = None):
+ def transliterate(
+ self,
+ text: str,
+ lang: str,
+ jp_input: str = None,
+ zh_sandhi: bool = None,
+ system: str = None,
+ ):
"""
Use Wunsen for transliteration
:param str text: text wants transliterated to Thai text.
:param str lang: source language
:param str jp_input: japanese input method (for japanese only)
+ :param bool zh_sandhi: Mandarin third tone sandhi option
+ (for Mandarin only)
+ :param str system: transliteration system (for Japanese and
+ Mandarin only)
:return: Thai text
:rtype: str
@@ -39,8 +55,22 @@ def transliterate(self, text: str, lang: str, jp_input: str = None):
* *jp* - Japanese (from Hepburn romanization)
* *ko* - Korean (from Revised Romanization)
* *vi* - Vietnamese (Latin script)
+ * *zh* - Mandarin (from Hanyu Pinyin)
:Options for jp_input:
* *Hepburn-no diacritic* - Hepburn-no diacritic (without macron)
+ :Options for zh_sandhi:
+ * *True* - apply third tone sandhi rule
+ * *False* - do not apply third tone sandhi rule
+ :Options for system:
+ * *ORS61* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น
+ (สำนักงานราชบัณฑิตยสภา พ.ศ. 2561)
+ * *RI35* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น
+ (ราชบัณฑิตยสถาน พ.ศ. 2535)
+ * *RI49* - for Mandarin หลักเกณฑ์การทับศัพท์ภาษาจีน
+ (ราชบัณฑิตยสถาน พ.ศ. 2549)
+ * *THC43* - for Mandarin เกณฑ์การถ่ายทอดเสียงภาษาจีนแมนดาริน
+ ด้วยอักขรวิธีไทย (คณะกรรมการสืบค้นประวัติศาสตร์ไทยในเอกสาร
+ ภาษาจีน พ.ศ. 2543)
:Example:
::
@@ -58,24 +88,56 @@ def transliterate(self, text: str, lang: str, jp_input: str = None):
)
# output: 'โอฮาโย'
+ wt.transliterate("ohayō", lang="jp", system="RI35")
+ # output: 'โอะฮะโย'
+
wt.transliterate("annyeonghaseyo", lang="ko")
# output: 'อันนย็องฮาเซโย'
wt.transliterate("xin chào", lang="vi")
# output: 'ซีน จ่าว'
+
+ wt.transliterate("ni3 hao3", lang="zh")
+ # output: 'หนี เห่า'
+
+ wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False)
+ # output: 'หนี่ เห่า'
+
+ wt.transliterate("ni3 hao3", lang="zh", system="RI49")
+ # output: 'หนี ห่าว'
"""
- if self.lang != lang or self.jp_input != jp_input:
+ if (
+ self.lang != lang
+ or self.jp_input != jp_input
+ or self.zh_sandhi != zh_sandhi
+ or self.system != system
+ ):
if lang == "jp":
- if jp_input is None:
- self.thap_value = ThapSap("ja")
- else:
- self.thap_value = ThapSap("ja", input=jp_input)
self.jp_input = jp_input
+ self.zh_sandhi = None
+ self.system = system
+ elif lang == "zh":
+ self.jp_input = None
+ self.zh_sandhi = zh_sandhi
+ self.system = system
elif lang == "ko" or lang == "vi":
self.jp_input = None
- self.thap_value = ThapSap(lang)
+ self.zh_sandhi = None
+ self.system = None
else:
raise NotImplementedError(
"The %s language is not implemented." % lang
)
+ self.lang = lang
+ input_lang = lang
+ if input_lang == "jp":
+ input_lang = "ja"
+ setting = {}
+ if self.jp_input is not None:
+ setting.update({"input": self.jp_input})
+ if self.zh_sandhi is not None:
+ setting.update({"option": {"sandhi": self.zh_sandhi}})
+ if self.system is not None:
+ setting.update({"system": self.system})
+ self.thap_value = ThapSap(input_lang, **setting)
return self.thap_value.thap(text)
diff --git a/setup.cfg b/setup.cfg
index bea023df1..14aa60ca5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 3.0.8
+current_version = 3.1.0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
diff --git a/setup.py b/setup.py
index 78509b9c2..1bcb1d4c7 100644
--- a/setup.py
+++ b/setup.py
@@ -108,13 +108,13 @@
"nlpo3>=1.2.2",
"onnxruntime>=1.10.0",
"thai_nner",
- "wunsen>=0.0.1"
+ "wunsen>=0.0.3"
],
}
setup(
name="pythainlp",
- version="3.1.0-dev0",
+ version="3.1.0-dev1",
description="Thai Natural Language Processing library",
long_description=readme,
long_description_content_type="text/markdown",
diff --git a/tests/test_transliterate.py b/tests/test_transliterate.py
index 0ac93bc66..d2d30e0c2 100644
--- a/tests/test_transliterate.py
+++ b/tests/test_transliterate.py
@@ -171,6 +171,10 @@ def test_transliterate_wunsen(self):
),
'โอฮาโย'
)
+ self.assertEqual(
+ wt.transliterate("ohayō", lang="jp", system="RI35"),
+ 'โอะฮะโย'
+ )
self.assertEqual(
wt.transliterate("annyeonghaseyo", lang="ko"),
'อันนย็องฮาเซโย'
@@ -179,6 +183,18 @@ def test_transliterate_wunsen(self):
wt.transliterate("xin chào", lang="vi"),
'ซีน จ่าว'
)
+ self.assertEqual(
+ wt.transliterate("ni3 hao3", lang="zh"),
+ 'หนี เห่า'
+ )
+ self.assertEqual(
+ wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False),
+ 'หนี่ เห่า'
+ )
+ self.assertEqual(
+ wt.transliterate("ni3 hao3", lang="zh", system="RI49"),
+ 'หนี ห่าว'
+ )
with self.assertRaises(NotImplementedError):
wt.transliterate("xin chào", lang="vii")