Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/api/translate.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,9 @@ Modules
:members: translate
.. autoclass:: ThEnTranslator
:members: translate
.. autoclass:: ThZhTranslator
:members: translate
.. autoclass:: ZhThTranslator
:members: translate
.. autoclass:: Translate
:members:
13 changes: 11 additions & 2 deletions pythainlp/translate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,20 @@
__all__ = [
"EnThTranslator",
"ThEnTranslator",
"download_model_all"
"download_model_all",
"ThZhTranslator",
"ZhThTranslator",
"Translate"
]

from pythainlp.translate.core import (
from pythainlp.translate.core import Translate

from pythainlp.translate.en_th import (
EnThTranslator,
ThEnTranslator,
download_model_all,
)
from pythainlp.translate.zh_th import (
ThZhTranslator,
ZhThTranslator,
)
149 changes: 55 additions & 94 deletions pythainlp/translate/core.py
Original file line number Diff line number Diff line change
@@ -1,114 +1,75 @@
# -*- coding: utf-8 -*-
import os
import tarfile
from collections import defaultdict

from pythainlp.corpus import download, get_corpus_path
from pythainlp.tools import get_full_data_path, get_pythainlp_data_path

from fairseq.models.transformer import TransformerModel
from sacremoses import MosesTokenizer
class Translate:
"""
Machine Translation

:param str src_lang: source language
:param str target_lang: target language

_EN_TH_MODEL_NAME = "scb_1m_en-th_moses"
# SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0.tar.gz
_EN_TH_FILE_NAME = "SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0"
**Options for source & target language**
* *th* - *en* - Thai to English
* *en* - *th* - English to Thai
* *th* - *zh* - Thai to Chinese
* *zh* - *th* - Chinese to Thai

_TH_EN_MODEL_NAME = "scb_1m_th-en_spm"
# SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz
_TH_EN_FILE_NAME = "SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0"
:Example:

Translate text from Thai to English::

def _get_translate_path(model: str, *path: str) -> str:
return os.path.join(get_full_data_path(model), *path)
from pythainlp.translate import Translate
th2en = Translate('th', 'en')

th2en.translate("ฉันรักแมว")
# output: I love cat.
"""
def __init__(self, src_lang: str, target_lang: str) -> None:
"""
:param str src_lang: source language
:param str target_lang: target language

def _download_install(name: str) -> None:
if get_corpus_path(name) is None:
download(name, force=True, version="1.0")
tar = tarfile.open(get_corpus_path(name), "r:gz")
tar.extractall()
tar.close()
if not os.path.exists(get_full_data_path(name)):
os.mkdir(get_full_data_path(name))
with tarfile.open(get_corpus_path(name)) as tar:
tar.extractall(path=get_full_data_path(name))
**Options for source & target language**
* *th* - *en* - Thai to English
* *en* - *th* - English to Thai
* *th* - *zh* - Thai to Chinese
* *zh* - *th* - Chinese to Thai

:Example:

def download_model_all() -> None:
"""
Download all translation models in advanced
"""
_download_install(_EN_TH_MODEL_NAME)
_download_install(_TH_EN_MODEL_NAME)


class EnThTranslator:
def __init__(self):
self._tokenizer = MosesTokenizer("en")

self._model_name = _EN_TH_MODEL_NAME

_download_install(self._model_name)
self._model = TransformerModel.from_pretrained(
model_name_or_path=_get_translate_path(
self._model_name,
_EN_TH_FILE_NAME,
"models",
),
checkpoint_file="checkpoint.pt",
data_name_or_path=_get_translate_path(
self._model_name,
_EN_TH_FILE_NAME,
"vocab",
),
)

def translate(self, text: str) -> str:
"""
Translate text from English to Thai
Translate text from Thai to English::

:param str text: input text in source language
:return: translated text in target language
:rtype: str
from pythainlp.translate import Translate
th2en = Translate('th', 'en')

th2en.translate("ฉันรักแมว")
# output: I love cat.
"""
tokens = " ".join(self._tokenizer.tokenize(text))
translated = self._model.translate(tokens)
return translated.replace(" ", "").replace("▁", " ").strip()


class ThEnTranslator:
def __init__(self):
self._model_name = _TH_EN_MODEL_NAME

_download_install(self._model_name)
self._model = TransformerModel.from_pretrained(
model_name_or_path=_get_translate_path(
self._model_name,
_TH_EN_FILE_NAME,
"models",
),
checkpoint_file="checkpoint.pt",
data_name_or_path=_get_translate_path(
self._model_name,
_TH_EN_FILE_NAME,
"vocab",
),
bpe="sentencepiece",
sentencepiece_model=_get_translate_path(
self._model_name,
_TH_EN_FILE_NAME,
"bpe",
"spm.th.model",
),
)

def translate(self, text: str) -> str:
self.model = None
self.load_model(src_lang, target_lang)

def load_model(self, src_lang: str, target_lang: str):
    """
    Instantiate the translator for a language pair and keep it on ``self.model``.

    Each translator class is imported only inside the branch that needs it,
    so asking for one pair does not import the module of the other pair.

    :param str src_lang: source language code ("th", "en" or "zh")
    :param str target_lang: target language code ("th", "en" or "zh")
    :raises ValueError: if the (source, target) pair is not one of
        th-en, en-th, th-zh or zh-th
    """
    pair = (src_lang, target_lang)

    if pair == ("th", "en"):
        from pythainlp.translate.en_th import ThEnTranslator
        self.model = ThEnTranslator()
        return

    if pair == ("en", "th"):
        from pythainlp.translate.en_th import EnThTranslator
        self.model = EnThTranslator()
        return

    if pair == ("th", "zh"):
        from pythainlp.translate.zh_th import ThZhTranslator
        self.model = ThZhTranslator()
        return

    if pair == ("zh", "th"):
        from pythainlp.translate.zh_th import ZhThTranslator
        self.model = ZhThTranslator()
        return

    # No known translator handles this combination.
    raise ValueError("Not support language!")

def translate(self, text: str) -> str:
    """
    Translate text

    Delegates to the translator stored on ``self.model``, which
    ``load_model`` selects for the configured language pair.

    :param str text: input text in source language
    :return: translated text in target language
    :rtype: str
    """
    # The translator lives on ``self.model``; there is no ``self._model``
    # attribute on this class, so that name must not be used here.
    return self.model.translate(text)
145 changes: 145 additions & 0 deletions pythainlp/translate/en_th.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# -*- coding: utf-8 -*-
"""
English-Thai Machine Translation

from VISTEC-depa Thailand Artificial Intelligence Research Institute

Website: https://airesearch.in.th/releases/machine-translation-models/
"""
import os
import tarfile
from collections import defaultdict

from pythainlp.corpus import download, get_corpus_path
from pythainlp.tools import get_full_data_path, get_pythainlp_data_path

from fairseq.models.transformer import TransformerModel
from sacremoses import MosesTokenizer


# Corpus name of the English-to-Thai model; this is the key handed to
# download(), get_corpus_path() and get_full_data_path().
_EN_TH_MODEL_NAME = "scb_1m_en-th_moses"
# Sub-directory (inside the model's data path) that holds the "models" and
# "vocab" folders. Presumably unpacked from the archive
# SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0.tar.gz
_EN_TH_FILE_NAME = "SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0"

# Corpus name of the Thai-to-English model; used the same way as above.
_TH_EN_MODEL_NAME = "scb_1m_th-en_spm"
# Sub-directory that holds the "models", "vocab" and "bpe" folders.
# Presumably unpacked from the archive
# SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz
_TH_EN_FILE_NAME = "SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0"


def _get_translate_path(model: str, *path: str) -> str:
    """
    Build a path located under the data directory of a translation model.

    :param str model: corpus name of the model; its data directory is
        looked up with ``get_full_data_path``
    :param str path: path components appended below that directory
    :return: the joined path
    :rtype: str
    """
    model_root = get_full_data_path(model)
    return os.path.join(model_root, *path)


def _download_install(name: str) -> None:
    """
    Make sure the model archive ``name`` is downloaded and unpacked.

    The archive is fetched with ``download`` when ``get_corpus_path(name)``
    reports no local copy. It is unpacked into ``get_full_data_path(name)``
    when that directory does not exist yet; an existing directory is taken
    to mean the model is already installed and is left untouched.

    :param str name: corpus name of the translation model
    """
    if get_corpus_path(name) is None:
        download(name, force=True, version="1.0")

    # The translators load their files from get_full_data_path(name) only,
    # so the archive is unpacked there and nowhere else (in particular not
    # into the current working directory).
    target_dir = get_full_data_path(name)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)
        # NOTE(review): the archive is downloaded content and extractall()
        # trusts the member paths it contains; consider an extraction
        # filter (Python 3.12+) if the minimum supported version allows it.
        with tarfile.open(get_corpus_path(name)) as tar:
            tar.extractall(path=target_dir)


def download_model_all() -> None:
    """
    Download all translation models in advance

    Installs the English-to-Thai model first, then the Thai-to-English one.
    """
    for model_name in (_EN_TH_MODEL_NAME, _TH_EN_MODEL_NAME):
        _download_install(model_name)


class EnThTranslator:
    """
    English-to-Thai translator.

    Input is tokenized with a Moses tokenizer for English and translated
    by a fairseq ``TransformerModel`` loaded from the installed model files.
    """

    def __init__(self):
        self._tokenizer = MosesTokenizer("en")
        self._model_name = _EN_TH_MODEL_NAME

        # Fetch and unpack the model files if they are not installed yet.
        _download_install(self._model_name)

        checkpoint_dir = _get_translate_path(
            self._model_name, _EN_TH_FILE_NAME, "models"
        )
        vocab_dir = _get_translate_path(
            self._model_name, _EN_TH_FILE_NAME, "vocab"
        )
        self._model = TransformerModel.from_pretrained(
            model_name_or_path=checkpoint_dir,
            checkpoint_file="checkpoint.pt",
            data_name_or_path=vocab_dir,
        )

    def translate(self, text: str) -> str:
        """
        Translate text from English to Thai

        :param str text: input text in source language
        :return: translated text in target language
        :rtype: str

        :Example:

        Translate text from English to Thai::

            from pythainlp.translate import EnThTranslator

            enth = EnThTranslator()

            enth.translate("I love cat.")
            # output: ฉันรักแมว

        """
        moses_tokens = self._tokenizer.tokenize(text)
        raw_output = self._model.translate(" ".join(moses_tokens))
        # Drop the spaces between output pieces, then turn every "▁"
        # (presumably the SentencePiece word-boundary marker) into a space.
        glued = raw_output.replace(" ", "")
        return glued.replace("▁", " ").strip()


class ThEnTranslator:
    """
    Thai-to-English translator.

    Wraps a fairseq ``TransformerModel`` loaded from the installed model
    files and configured with ``bpe="sentencepiece"`` and the model's
    ``spm.th.model`` file.
    """

    def __init__(self):
        self._model_name = _TH_EN_MODEL_NAME

        # Fetch and unpack the model files if they are not installed yet.
        _download_install(self._model_name)

        checkpoint_dir = _get_translate_path(
            self._model_name, _TH_EN_FILE_NAME, "models"
        )
        vocab_dir = _get_translate_path(
            self._model_name, _TH_EN_FILE_NAME, "vocab"
        )
        spm_model_file = _get_translate_path(
            self._model_name, _TH_EN_FILE_NAME, "bpe", "spm.th.model"
        )
        self._model = TransformerModel.from_pretrained(
            model_name_or_path=checkpoint_dir,
            checkpoint_file="checkpoint.pt",
            data_name_or_path=vocab_dir,
            bpe="sentencepiece",
            sentencepiece_model=spm_model_file,
        )

    def translate(self, text: str) -> str:
        """
        Translate text from Thai to English

        :param str text: input text in source language
        :return: translated text in target language
        :rtype: str

        :Example:

        Translate text from Thai to English::

            from pythainlp.translate import ThEnTranslator

            then = ThEnTranslator()

            then.translate("ฉันรักแมว")
            # output: I love cat.

        """
        english_text = self._model.translate(text)
        return english_text
Loading