Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1341a31
Merge pull request #2 from pavaris-pm/improve-pos-tag-transformers
pavaris-pm Nov 14, 2023
38f71b5
Merge branch 'PyThaiNLP:dev' into dev
pavaris-pm Dec 1, 2023
41d79c2
add phayathaibert core engine
pavaris-pm Dec 1, 2023
cb9e27a
add data augmentation engine
pavaris-pm Dec 4, 2023
473af52
update engine properties
pavaris-pm Dec 4, 2023
0c3efd0
update augmentation properties
pavaris-pm Dec 4, 2023
245e99e
Merge branch 'PyThaiNLP:dev' into dev
pavaris-pm Dec 4, 2023
dd2b834
change license
pavaris-pm Dec 4, 2023
d1b9c99
add er engine
pavaris-pm Dec 4, 2023
cbb7c8e
Update __init__.py
bact Dec 4, 2023
b71ebda
Merge branch 'PyThaiNLP:dev' into dev
pavaris-pm Dec 10, 2023
348dc1f
add documentation and credit model builder
pavaris-pm Dec 10, 2023
c7b6900
Merge branch 'dev' into dev
pavaris-pm Dec 10, 2023
a55168a
update pep8
pavaris-pm Dec 10, 2023
536f493
resolve conflict
pavaris-pm Dec 10, 2023
76b49c3
update pep8
pavaris-pm Dec 10, 2023
22daf2d
update pep8
pavaris-pm Dec 10, 2023
84de5c4
Update core.py: sort imports, remove duplicated lines
bact Dec 10, 2023
a2fd4d3
Update phayathaibert.py: sort imports, remove duplicated lines
bact Dec 10, 2023
7e24d3f
Reexport NamedEntityTagger
bact Dec 10, 2023
826cfed
Fix minor types
bact Dec 10, 2023
72e2bd5
Update __init__.py
bact Dec 10, 2023
dec62c1
Use MAX_NUM_AUGS constant for max num_augs limit
bact Dec 11, 2023
9999f90
Merge branch 'PyThaiNLP:dev' into dev
pavaris-pm Dec 11, 2023
e7ef6ce
Update phayathaibert.py
bact Dec 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pythainlp/augment/lm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
LM
Language Models
"""

__all__ = [
"FastTextAug",
"Thai2transformersAug",
"ThaiTextAugmenter",
]

from pythainlp.augment.lm.fasttext import FastTextAug
from pythainlp.augment.lm.phayathaibert import ThaiTextAugmenter
from pythainlp.augment.lm.wangchanberta import Thai2transformersAug
94 changes: 94 additions & 0 deletions pythainlp/augment/lm/phayathaibert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

from typing import List
import random
import re

from pythainlp.phayathaibert.core import ThaiTextProcessor


# Model identifier handed to ``from_pretrained`` when ThaiTextAugmenter loads
# its tokenizer and masked-language model.
_MODEL_NAME = "clicknext/phayathaibert"


class ThaiTextAugmenter:
    """
    Augment Thai text by repeatedly filling a trailing ``<mask>`` token
    with a PhayaThaiBERT fill-mask pipeline.
    """

    def __init__(self) -> None:
        """Load the tokenizer, the masked LM, the pipeline and the processor."""
        # Imported here rather than at module level, so transformers is only
        # loaded when an augmenter is actually instantiated.
        from transformers import (
            AutoModelForMaskedLM,
            AutoTokenizer,
            pipeline,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
        self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(
            _MODEL_NAME
        )
        self.model = pipeline(
            "fill-mask",
            tokenizer=self.tokenizer,
            model=self.model_for_masked_lm,
        )
        self.processor = ThaiTextProcessor()

    def generate(
        self,
        sample_text: str,
        word_rank: int,
        max_length: int = 3,
        sample: bool = False,
    ) -> str:
        """
        Extend a text by filling its ``<mask>`` token ``max_length`` times

        :param str sample_text: text containing a ``<mask>`` token
        :param int word_rank: index of the fill-mask candidate to keep on
            every step; ignored when ``sample`` is true
        :param int max_length: number of fill-mask steps to run
        :param bool sample: pick a random candidate index between 0 and 4
            on every step instead of using ``word_rank``

        :return: generated text with every ``<mask>`` removed, or an empty
            string when ``max_length`` is less than 1
        :rtype: str
        """
        current_text = sample_text
        generated = ""

        for _ in range(max_length):
            model_input = self.processor.preprocess(current_text)
            # NOTE(review): indices 0..4 assume the pipeline yields at least
            # five candidates (believed to be the fill-mask default) - confirm.
            rank = random.randint(0, 4) if sample else word_rank
            filled = self.model(model_input)[rank]["sequence"]
            # Re-append the mask so the next step extends the text further.
            current_text = filled + "<mask>"
            generated = current_text

        return generated.replace("<mask>", "")

    def augment(
        self,
        text: str,
        num_augs: int = 3,
        sample: bool = False,
    ) -> List[str]:
        """
        Text augmentation from PhayaThaiBERT

        :param str text: Thai text
        :param int num_augs: an amount of augmentation text needed as an
            output, at most 5
        :param bool sample: whether to sample the text as an output or not,
            true if more word diversity is needed

        :return: list of text augment
        :rtype: List[str]
        :raises ValueError: if ``num_augs`` is greater than 5

        :Example:
        ::

            from pythainlp.augment.lm import ThaiTextAugmenter

            aug = ThaiTextAugmenter()
            aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_augs=5)

            # output = ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้ครับ.',
            #           'ช้างมีทั้งหมด 50 ตัว บนพื้นดินครับ...',
            #           'ช้างมีทั้งหมด 50 ตัว บนท้องฟ้าครับ...',
            #           'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼',
            #           'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
        """
        MAX_NUM_AUGS = 5

        # Reject oversized requests before doing any model work.
        if num_augs > MAX_NUM_AUGS:
            raise ValueError(
                f"num_augs ({num_augs}) exceeds the maximum number of "
                f"augmentations: {MAX_NUM_AUGS}"
            )

        # The fill-mask pipeline needs a mask to fill; default to appending
        # one so that the text is extended at its end.
        if "<mask>" not in text:
            text = text + "<mask>"

        augment_list = []
        # Each augmentation keeps a different candidate rank, so the outputs
        # differ from one another even when sampling is off.
        for rank in range(num_augs):
            gen_text = self.generate(text, rank, sample=sample)
            processed_text = self.processor.preprocess(gen_text)
            augment_list.append(processed_text.replace("<_>", " "))

        return augment_list
18 changes: 18 additions & 0 deletions pythainlp/phayathaibert/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
__all__ = [
"NamedEntityTagger",
"PartOfSpeechTagger",
"ThaiTextAugmenter",
"ThaiTextProcessor",
"segment",
]

from pythainlp.phayathaibert.core import (
NamedEntityTagger,
PartOfSpeechTagger,
ThaiTextAugmenter,
ThaiTextProcessor,
segment,
)
Loading