From 780f7dedb4bad16b4a070bca7d04146f8c7abd12 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 6 Jul 2021 15:42:40 +0700 Subject: [PATCH 1/8] Add zh_th --- docs/api/translate.rst | 4 ++ pythainlp/translate/__init__.py | 10 ++- pythainlp/translate/{core.py => en_th.py} | 27 ++++++++ pythainlp/translate/zh_th.py | 76 +++++++++++++++++++++++ setup.py | 5 +- tests/test_translate.py | 19 +++++- 6 files changed, 136 insertions(+), 5 deletions(-) rename pythainlp/translate/{core.py => en_th.py} (85%) create mode 100644 pythainlp/translate/zh_th.py diff --git a/docs/api/translate.rst b/docs/api/translate.rst index 29bb132d6..093fbc38a 100644 --- a/docs/api/translate.rst +++ b/docs/api/translate.rst @@ -12,3 +12,7 @@ Modules :members: translate .. autoclass:: ThEnTranslator :members: translate +.. autoclass:: ThZhTranslator + :members: translate +.. autoclass:: ZhThTranslator + :members: translate diff --git a/pythainlp/translate/__init__.py b/pythainlp/translate/__init__.py index 8664f22ec..86004eaba 100644 --- a/pythainlp/translate/__init__.py +++ b/pythainlp/translate/__init__.py @@ -6,11 +6,17 @@ __all__ = [ "EnThTranslator", "ThEnTranslator", - "download_model_all" + "download_model_all", + "ThZhTranslator", + "ZhThTranslator" ] -from pythainlp.translate.core import ( +from pythainlp.translate.en_th import ( EnThTranslator, ThEnTranslator, download_model_all, ) +from pythainlp.translate.zh_th import ( + ThZhTranslator, + ZhThTranslator, +) diff --git a/pythainlp/translate/core.py b/pythainlp/translate/en_th.py similarity index 85% rename from pythainlp/translate/core.py rename to pythainlp/translate/en_th.py index 72fc9a6a4..8144a6a4a 100644 --- a/pythainlp/translate/core.py +++ b/pythainlp/translate/en_th.py @@ -1,4 +1,11 @@ # -*- coding: utf-8 -*- +""" +English-Thai Machine Translation + +from VISTEC-depa Thailand Artificial Intelligence Research Institute + +Website: https://airesearch.in.th/releases/machine-translation-models/ +""" import os import tarfile from collections import defaultdict @@ -71,6 +78,16 @@ def translate(self, text: str) -> str: :param str text: input text in source language :return: translated text in target language :rtype: str + + :Example: + + from pythainlp.translate import EnThTranslator + + enth = EnThTranslator() + + enth.translate("I love cat.") + # output: ฉันรักแมว + """ tokens = " ".join(self._tokenizer.tokenize(text)) translated = self._model.translate(tokens) @@ -110,5 +127,15 @@ def translate(self, text: str) -> str: :param str text: input text in source language :return: translated text in target language :rtype: str + + :Example: + + from pythainlp.translate import ThEnTranslator + + then = ThEnTranslator() + + then.translate("ฉันรักแมว") + # output: I love cat. + """ return self._model.translate(text) diff --git a/pythainlp/translate/zh_th.py b/pythainlp/translate/zh_th.py new file mode 100644 index 000000000..152448dfc --- /dev/null +++ b/pythainlp/translate/zh_th.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +""" +Lalita Chinese-Thai Machine Translation + +from Ai builder + +- GitHub: https://github.com/LalitaDeelert/lalita-mt-zhth +- Facebook post https://web.facebook.com/aibuildersx/posts/166736255494822 +""" +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + + +class ThZhTranslator: + def __init__(self, pretrained: str = "Lalita/marianmt-th-zh_cn") -> None: + self.tokenizer = AutoTokenizer.from_pretrained(pretrained) + self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained) + + def translate(self, text: str) -> str: + """ + Translate text from Thai to Chinese + + :param str text: input text in source language + :return: translated text in target language + :rtype: str + + :Example: + + from pythainlp.translate import ThZhTranslator + + thzh = ThZhTranslator() + + thzh.translate("ผมรักคุณ") + # output: 我爱你 + + """ + self.translated = self.model.generate( + **self.tokenizer(text, return_tensors="pt", padding=True) + ) + return [ + self.tokenizer.decode( + t, skip_special_tokens=True + ) for t in self.translated + ][0] + + +class ZhThTranslator: + def __init__(self, pretrained: str = "Lalita/marianmt-zh_cn-th") -> None: + self.tokenizer = AutoTokenizer.from_pretrained(pretrained) + self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained) + + def translate(self, text: str) -> str: + """ + Translate text from Chinese to Thai + + :param str text: input text in source language + :return: translated text in target language + :rtype: str + + :Example: + + from pythainlp.translate import ZhThTranslator + + zhth = ZhThTranslator() + + zhth.translate("我爱你") + # output: ผมรักคุณนะ + + """ + self.translated = self.model.generate( + **self.tokenizer(text, return_tensors="pt", padding=True) + ) + return [ + self.tokenizer.decode( + t, skip_special_tokens=True + ) for t in self.translated + ][0] diff --git a/setup.py b/setup.py index 0e05f0387..2d2e98302 100644 --- a/setup.py +++ b/setup.py @@ -54,9 +54,10 @@ "sacremoses>=0.0.41", "sentencepiece>=0.1.91", "torch>=1.0.0", + "transformers>=4.6.0", ], "wangchanberta": ["transformers", "sentencepiece"], - "mt5": ["transformers>=4.1.1", "sentencepiece>=0.1.91"], + "mt5": ["transformers>=4.6.0", "sentencepiece>=0.1.91"], "wordnet": ["nltk>=3.3.*"], "sefr_cut": ["sefr_cut"], "full": [ @@ -74,7 +75,7 @@ "sentencepiece>=0.1.91", "ssg>=0.0.6", "torch>=1.0.0", - "transformers>=4.1.1", + "transformers>=4.6.0", "sefr_cut" ], } diff --git a/tests/test_translate.py b/tests/test_translate.py index f0df58a2a..129d0c31a 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -2,7 +2,12 @@ import unittest -from pythainlp.translate import EnThTranslator, ThEnTranslator +from pythainlp.translate import ( + EnThTranslator, + ThEnTranslator, + ThZhTranslator, + ZhThTranslator +) from pythainlp.translate.core import download_model_all @@ -21,3 +26,15 @@ def test_translate(self): "the cat eats fish.", ) ) + self.th_zh_translator = ThZhTranslator() + self.assertIsNotNone( + self.th_zh_translator.translate( + "ผมรักคุณ", + ) + ) + self.zh_th_translator = ZhThTranslator() + self.assertIsNotNone( + self.zh_th_translator.translate( + "我爱你", + ) + ) From af319c2a4e957b1f5d3418867eb3059ca4ae8363 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 6 Jul 2021 15:56:00 +0700 Subject: [PATCH 2/8] Update zh_th.py --- pythainlp/translate/zh_th.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pythainlp/translate/zh_th.py b/pythainlp/translate/zh_th.py index 152448dfc..72d4b8b86 100644 --- a/pythainlp/translate/zh_th.py +++ b/pythainlp/translate/zh_th.py @@ -12,8 +12,8 @@ class ThZhTranslator: def __init__(self, pretrained: str = "Lalita/marianmt-th-zh_cn") -> None: - self.tokenizer = AutoTokenizer.from_pretrained(pretrained) - self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained) + self.tokenizer_thzh = AutoTokenizer.from_pretrained(pretrained) + self.model_thzh = AutoModelForSeq2SeqLM.from_pretrained(pretrained) def translate(self, text: str) -> str: """ @@ -33,11 +33,11 @@ def translate(self, text: str) -> str: # output: 我爱你 """ - self.translated = self.model.generate( - **self.tokenizer(text, return_tensors="pt", padding=True) + self.translated = self.model_thzh.generate( + **self.tokenizer_thzh(text, return_tensors="pt", padding=True) ) return [ - self.tokenizer.decode( + self.tokenizer_thzh.decode( t, skip_special_tokens=True ) for t in self.translated ][0] @@ -45,8 +45,8 @@ def translate(self, text: str) -> str: class ZhThTranslator: def __init__(self, pretrained: str = "Lalita/marianmt-zh_cn-th") -> None: - self.tokenizer = AutoTokenizer.from_pretrained(pretrained) - self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained) + self.tokenizer_zhth = AutoTokenizer.from_pretrained(pretrained) + self.model_zhth = AutoModelForSeq2SeqLM.from_pretrained(pretrained) def translate(self, text: str) -> str: """ @@ -66,11 +66,11 @@ def translate(self, text: str) -> str: # output: ผมรักคุณนะ """ - self.translated = self.model.generate( - **self.tokenizer(text, return_tensors="pt", padding=True) + self.translated = self.model_zhth.generate( + **self.tokenizer_zhth(text, return_tensors="pt", padding=True) ) return [ - self.tokenizer.decode( + self.tokenizer_zhth.decode( t, skip_special_tokens=True ) for t in self.translated ][0] From 119c8e6e4ec7ff6221ee26ce82c929d506258141 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 6 Jul 2021 16:02:36 +0700 Subject: [PATCH 3/8] Update test_translate.py --- tests/test_translate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_translate.py b/tests/test_translate.py index 129d0c31a..b9a78c352 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -6,9 +6,9 @@ EnThTranslator, ThEnTranslator, ThZhTranslator, - ZhThTranslator + ZhThTranslator, + download_model_all ) -from pythainlp.translate.core import download_model_all class TestTranslatePackage(unittest.TestCase): From f63c53b7e461c20788127fe116f9c4ca873b0fd2 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 8 Jul 2021 00:01:21 +0700 Subject: [PATCH 4/8] Add Translate class --- docs/api/translate.rst | 2 ++ pythainlp/translate/__init__.py | 5 +++- pythainlp/translate/core.py | 41 +++++++++++++++++++++++++++++++++ tests/test_translate.py | 29 ++++++++++++++++++++++- 4 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 pythainlp/translate/core.py diff --git a/docs/api/translate.rst b/docs/api/translate.rst index 093fbc38a..89b4537ac 100644 --- a/docs/api/translate.rst +++ b/docs/api/translate.rst @@ -16,3 +16,5 @@ Modules :members: translate .. autoclass:: ZhThTranslator :members: translate +.. autoclass:: Translate + :members: diff --git a/pythainlp/translate/__init__.py b/pythainlp/translate/__init__.py index 86004eaba..93a473277 100644 --- a/pythainlp/translate/__init__.py +++ b/pythainlp/translate/__init__.py @@ -8,9 +8,12 @@ "ThEnTranslator", "download_model_all", "ThZhTranslator", - "ZhThTranslator" + "ZhThTranslator", + "Translate" ] +from pythainlp.translate.core import Translate + from pythainlp.translate.en_th import ( EnThTranslator, ThEnTranslator, diff --git a/pythainlp/translate/core.py b/pythainlp/translate/core.py new file mode 100644 index 000000000..dd5808392 --- /dev/null +++ b/pythainlp/translate/core.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- + + +class Translate: + def __init__(self, src_lang: str, target_lang: str) -> None: + """ + :param str src_lang: source language + :param str target_lang: target language + + **Options for source & target language** + * *th* - *en* - Thai to English + * *en* - *th* - English to Thai + * *th* - *zh* - Thai to Chinese + * *zh* - *th* - Chinese to Thai + """ + self.model = None + self.load_model(src_lang, target_lang) + def load_model(self, src_lang: str, target_lang: str): + if src_lang == "th" and target_lang == "en": + from pythainlp.translate.en_th import ThEnTranslator + self.model = ThEnTranslator() + elif src_lang == "en" and target_lang == "th": + from pythainlp.translate.en_th import EnThTranslator + self.model = EnThTranslator() + elif src_lang == "th" and target_lang == "zh": + from pythainlp.translate.zh_th import ThZhTranslator + self.model = ThZhTranslator() + elif src_lang == "zh" and target_lang == "th": + from pythainlp.translate.zh_th import ZhThTranslator + self.model = ZhThTranslator() + else: + raise ValueError("Not support language!") + def translate(self, text) -> str: + """ + Translate text + + :param str text: input text in source language + :return: translated text in target language + :rtype: str + """ + return self.model.translate(text) diff --git a/tests/test_translate.py b/tests/test_translate.py index b9a78c352..7da13e7f5 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -7,7 +7,8 @@ ThEnTranslator, ThZhTranslator, ZhThTranslator, - download_model_all + download_model_all, + Translate ) @@ -38,3 +39,29 @@ def test_translate(self): "我爱你", ) ) + self.th_en_translator = Translate('th', 'en') + self.assertIsNotNone( + self.th_en_translator.translate( + "แมวกินปลา", + ) + ) + self.en_th_translator = Translate('en', 'th') + self.assertIsNotNone( + self.en_th_translator.translate( + "the cat eats fish.", + ) + ) + self.th_zh_translator = Translate('th', 'zh') + self.assertIsNotNone( + self.th_zh_translator.translate( + "ผมรักคุณ", + ) + ) + self.zh_th_translator = Translate('zh', 'th') + self.assertIsNotNone( + self.zh_th_translator.translate( + "我爱你", + ) + ) + with self.assertRaises(ValueError): + self.th_cat_translator = Translate('th', 'cat') From e95e75d2b803510eb8c44a53cf4d56a2b122e70f Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 8 Jul 2021 00:03:19 +0700 Subject: [PATCH 5/8] Update core.py --- pythainlp/translate/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pythainlp/translate/core.py b/pythainlp/translate/core.py index dd5808392..27bbac581 100644 --- a/pythainlp/translate/core.py +++ b/pythainlp/translate/core.py @@ -11,10 +11,11 @@ def __init__(self, src_lang: str, target_lang: str) -> None: * *th* - *en* - Thai to English * *en* - *th* - English to Thai * *th* - *zh* - Thai to Chinese - * *zh* - *th* - Chinese to Thai + * *zh* - *th* - Chinese to Thai """ self.model = None self.load_model(src_lang, target_lang) + def load_model(self, src_lang: str, target_lang: str): if src_lang == "th" and target_lang == "en": from pythainlp.translate.en_th import ThEnTranslator @@ -30,6 +31,7 @@ def load_model(self, src_lang: str, target_lang: str): self.model = ZhThTranslator() else: raise ValueError("Not support language!") + def translate(self, text) -> str: """ Translate text From 4de1d9ec04ae5ae89c5e87e22132b88b15eb930b Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 10 Jul 2021 13:12:12 +0700 Subject: [PATCH 6/8] Add Translate Example --- pythainlp/translate/core.py | 10 ++++++++++ pythainlp/translate/en_th.py | 4 ++++ pythainlp/translate/zh_th.py | 4 ++++ 3 files changed, 18 insertions(+) diff --git a/pythainlp/translate/core.py b/pythainlp/translate/core.py index 27bbac581..b6c858dc9 100644 --- a/pythainlp/translate/core.py +++ b/pythainlp/translate/core.py @@ -12,6 +12,16 @@ def __init__(self, src_lang: str, target_lang: str) -> None: * *en* - *th* - English to Thai * *th* - *zh* - Thai to Chinese * *zh* - *th* - Chinese to Thai + + :Example: + + Translate text from Thai to English:: + + from pythainlp.translate import Translate + th2en = Translate('th', 'en') + + th2en.translate("ฉันรักแมว") + # output: I love cat. """ self.model = None self.load_model(src_lang, target_lang) diff --git a/pythainlp/translate/en_th.py b/pythainlp/translate/en_th.py index 8144a6a4a..38492a980 100644 --- a/pythainlp/translate/en_th.py +++ b/pythainlp/translate/en_th.py @@ -81,6 +81,8 @@ def translate(self, text: str) -> str: :Example: + Translate text from English to Thai:: + from pythainlp.translate import EnThTranslator enth = EnThTranslator() @@ -130,6 +132,8 @@ def translate(self, text: str) -> str: :Example: + Translate text from Thai to English:: + from pythainlp.translate import ThEnTranslator then = ThEnTranslator() diff --git a/pythainlp/translate/zh_th.py b/pythainlp/translate/zh_th.py index 72d4b8b86..fd100a57d 100644 --- a/pythainlp/translate/zh_th.py +++ b/pythainlp/translate/zh_th.py @@ -25,6 +25,8 @@ def translate(self, text: str) -> str: :Example: + Translate text from Thai to Chinese:: + from pythainlp.translate import ThZhTranslator thzh = ThZhTranslator() @@ -58,6 +60,8 @@ def translate(self, text: str) -> str: :Example: + Translate text from Chinese to Thai:: + from pythainlp.translate import ZhThTranslator zhth = ZhThTranslator() From 5834e51b322591f84d41a7dc63508b1cf2cd360b Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 10 Jul 2021 13:14:03 +0700 Subject: [PATCH 7/8] Update core.py --- pythainlp/translate/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/translate/core.py b/pythainlp/translate/core.py index b6c858dc9..816d34a29 100644 --- a/pythainlp/translate/core.py +++ b/pythainlp/translate/core.py @@ -12,7 +12,7 @@ def __init__(self, src_lang: str, target_lang: str) -> None: * *en* - *th* - English to Thai * *th* - *zh* - Thai to Chinese * *zh* - *th* - Chinese to Thai - + :Example: Translate text from Thai to English:: From 606c3062201333455b2374d2c1a10e8e010fdb2b Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 13 Jul 2021 22:06:23 +0700 Subject: [PATCH 8/8] Update Translate class docs --- pythainlp/translate/core.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pythainlp/translate/core.py b/pythainlp/translate/core.py index 816d34a29..48702ee43 100644 --- a/pythainlp/translate/core.py +++ b/pythainlp/translate/core.py @@ -2,6 +2,28 @@ class Translate: + """ + Machine Translation + + :param str src_lang: source language + :param str target_lang: target language + + **Options for source & target language** + * *th* - *en* - Thai to English + * *en* - *th* - English to Thai + * *th* - *zh* - Thai to Chinese + * *zh* - *th* - Chinese to Thai + + :Example: + + Translate text from Thai to English:: + + from pythainlp.translate import Translate + th2en = Translate('th', 'en') + + th2en.translate("ฉันรักแมว") + # output: I love cat. + """ def __init__(self, src_lang: str, target_lang: str) -> None: """ :param str src_lang: source language