Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api/tools.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ Modules
.. autofunction:: get_full_data_path
.. autofunction:: get_pythainlp_data_path
.. autofunction:: get_pythainlp_path
.. autofunction:: misspell
3 changes: 3 additions & 0 deletions pythainlp/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"get_full_data_path",
"get_pythainlp_data_path",
"get_pythainlp_path",
"misspell",
]

from pythainlp.tools.path import (
Expand All @@ -12,3 +13,5 @@
get_pythainlp_data_path,
get_pythainlp_path,
)

from pythainlp.tools.misspell import misspell
141 changes: 141 additions & 0 deletions pythainlp/tools/misspell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import numpy as np
from typing import List

# Keyboard tables used to look up a character's position and its neighbours.
# Each table is a list of four strings, one string per row of keys; a
# character's index inside its string is its position within that row.
# The names suggest these are the unshifted / shifted layers of a Thai and an
# English keyboard layout.
# NOTE(review): the English tables start with the digit row, whereas the Thai
# tables appear to end with it -- confirm the row order is intentional, since
# the same neighbour offsets are applied to both languages.
THAI_CHARACTERS_WITHOUT_SHIFT = [
"ผปแอิืทมใฝ",
"ฟหกดเ้่าสวง",
"ๆไำพะัีรนยบลฃ",
"ๅ/_ภถุึคตจขช",
]

# Thai characters of the shifted layer, row-aligned with the table above.
THAI_CHARACTERS_WITH_SHIFT = [
"()ฉฮฺ์?ฒฬฦ",
"ฤฆฏโฌ็๋ษศซ.",
'๐"ฎฑธํ๊ณฯญฐ,',
"+๑๒๓๔ู฿๕๖๗๘๙",
]

# English characters of the unshifted layer, one string per row.
ENGLISH_CHARACTERS_WITHOUT_SHIFT = [
"1234567890-=",
"qwertyuiop[]\\",
"asdfghjkl;'",
"zxcvbnm,./",
]

# English characters of the shifted layer, row-aligned with the table above.
ENGLISH_CHARACTERS_WITH_SHIFT = [
"!@#$%^&*()_+",
"QWERTYUIOP{}|",
'ASDFGHJKL:"',
"ZXCVBNM<>?",
]


# Combined lookup table, indexed as ALL_CHARACTERS[language][layer * 4 + row]:
# language 0 is Thai and 1 is English; within a language the four unshifted
# rows come first (indices 0-3), followed by the four shifted rows (4-7).
# Some symbols (e.g. "/", "_", "(", ")") occur in both languages; a search
# that walks the table in order reaches the Thai entry first.
ALL_CHARACTERS = [
THAI_CHARACTERS_WITHOUT_SHIFT + THAI_CHARACTERS_WITH_SHIFT,
ENGLISH_CHARACTERS_WITHOUT_SHIFT + ENGLISH_CHARACTERS_WITH_SHIFT,
]


def search_location_of_character(char: str):
    """
    Locate a single character in the keyboard tables.

    :param str char: the character to look up (exactly one character)

    :return: ``(language_ix, is_shift, row, pos)`` for the first table row
        that contains ``char``, or ``None`` when ``char`` is not exactly one
        character or does not occur in any row
    :rtype: tuple or None
    """
    # Membership on a string is a substring test: without this guard ""
    # would "match" the very first row and a multi-character string could
    # match a run of adjacent keys.
    if len(char) != 1:
        return None

    for language_ix, rows in enumerate(ALL_CHARACTERS):
        for ix, row in enumerate(rows):
            pos = row.find(char)
            if pos != -1:
                # Each language lists its 4 unshifted rows first, then its
                # 4 shifted rows, so the flat row index splits with divmod.
                is_shift, row_ix = divmod(ix, 4)
                return (language_ix, is_shift, row_ix, pos)

    return None


def find_neighbour_locations(
    loc: tuple,
    char: str,
    kernel: List = [(-1, -1), (-1, 0), (1, 1), (0, 1), (0, -1), (1, 0)],
):
    """
    List the table locations adjacent to ``loc``.

    :param tuple loc: ``(language_ix, is_shift, row, pos)`` of the key
    :param str char: the character at ``loc``; copied unchanged into every
        returned tuple
    :param List kernel: ``(row_offset, pos_offset)`` pairs to probe around
        ``loc``. The default list is only iterated, never mutated.

    :return: ``(language_ix, is_shift, row, pos, char)`` for every probed
        location that falls inside the table, in kernel order
    :rtype: list
    """
    language_ix, is_shift, row, pos = loc

    valid_neighbours = []
    for row_offset, pos_offset in kernel:
        _row, _pos = row + row_offset, pos + pos_offset

        # Each layer has exactly four rows.
        if not 0 <= _row <= 3:
            continue

        row_chars = ALL_CHARACTERS[language_ix][is_shift * 4 + _row]
        # Strict upper bound: ``_pos == len(row_chars)`` is one past the
        # last key of the row and must not be reported as a neighbour.
        if 0 <= _pos < len(row_chars):
            valid_neighbours.append(
                (language_ix, is_shift, _row, _pos, char)
            )

    return valid_neighbours


def find_misspell_candidates(char: str, verbose: bool = False):
    """
    Collect the characters located next to ``char`` in the keyboard tables.

    :param str char: the character to find replacements for
    :param bool verbose: accepted for backward compatibility; it has no
        effect on the result

    :return: the neighbouring characters, in the order the neighbour
        locations are produced, or ``None`` when ``char`` is not present
        in the keyboard tables
    :rtype: list or None
    """
    loc = search_location_of_character(char)
    if loc is None:
        return None

    candidates = []
    for neighbour in find_neighbour_locations(loc, char):
        # The last element echoes the original character; it is not needed
        # here and must not overwrite the ``char`` parameter.
        language_ix, is_shift, row, pos = neighbour[:4]
        row_chars = ALL_CHARACTERS[language_ix][is_shift * 4 + row]

        # Rows differ in length, so a reported position can fall past the
        # end of a shorter row; skip those instead of relying on IndexError.
        if 0 <= pos < len(row_chars):
            candidates.append(row_chars[pos])

    return candidates


def misspell(sentence: str, ratio: float = 0.05):
    """
    Simulate some misspellings for the input sentence.
    The number of misspelled locations is governed by ratio.

    :param str sentence: sentence to be misspelled
    :param float ratio: fraction of the characters to misspell, between
        0 and 1 (0.05 means about 5 misspells per 100 characters).
        Defaults to 0.05.

    :return: sentence containing some misspelled characters
    :rtype: str
    :raises ValueError: if ``ratio`` is not between 0 and 1

    :Example:
    ::

        from pythainlp.tools import misspell

        sentence = "ภาษาไทยปรากฏครั้งแรกในพุทธศักราช 1826"

        misspell(sentence, ratio=0.1)
        # possible output (positions and replacements are random):
        # ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727
    """
    # Sampling without replacement cannot draw more positions than there
    # are characters, nor a negative number of them; fail with a clear
    # message instead of an opaque numpy error.
    if not 0 <= ratio <= 1:
        raise ValueError(
            f"ratio must be between 0 and 1 (inclusive), got {ratio!r}"
        )

    num_misspells = int(np.floor(len(sentence) * ratio))
    if num_misspells == 0:
        # Nothing to change (also covers the empty sentence).
        return sentence

    positions = np.random.choice(
        len(sentence),
        size=num_misspells,
        replace=False
    )

    # convert strings to array of characters
    misspelled = list(sentence)
    for pos in positions:
        potential_candidates = find_misspell_candidates(sentence[pos])
        # None: the character is not in the keyboard tables.
        # Empty list: no usable neighbour; np.random.choice would raise.
        if not potential_candidates:
            continue

        misspelled[pos] = str(np.random.choice(potential_candidates))

    return "".join(misspelled)
75 changes: 75 additions & 0 deletions tests/test_misspell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-

import unittest
import numpy as np
from pythainlp.tools import misspell


def _count_difference(st1, st2):
# this assumes len(st1) == len(st2)

count = 0
for i in range(len(st1)):
if st1[i] != st2[i]:
count += 1

return count


class TestTextMisspellPackage(unittest.TestCase):
    """Check ``misspell`` output length and misspell count per ratio."""

    def setUp(self):
        self.texts = [
            "เรารักคุณมากที่สุดในโลก",
            "เราอยู่ที่มหาวิทยาลัยขอนแก่น"
        ]

    def test_misspell_naive(self):
        for text in self.texts:
            # subTest reports each text separately instead of stopping at
            # the first failing one.
            with self.subTest(text=text):
                misspelled = misspell(text, ratio=0.1)

                self.assertEqual(len(text), len(misspelled))

                diff = _count_difference(text, misspelled)

                self.assertGreater(diff, 0, "we have some misspells.")

    def test_misspell_with_ratio_0_percent(self):
        for text in self.texts:
            with self.subTest(text=text):
                misspelled = misspell(text, ratio=0.0)

                self.assertEqual(len(text), len(misspelled))

                diff = _count_difference(text, misspelled)

                self.assertEqual(
                    diff, 0,
                    "we shouldn't have any misspell with ratio=0."
                )

    def test_misspell_with_ratio_50_percent(self):
        for text in self.texts:
            with self.subTest(text=text):
                misspelled = misspell(text, ratio=0.5)

                self.assertEqual(len(text), len(misspelled))

                diff = _count_difference(text, misspelled)

                self.assertLessEqual(
                    np.abs(diff - 0.5 * len(text)),
                    2,
                    f"expect 0.5*len(text)±2 misspells with ratio=0.5. (Δ={diff})",
                )

    def test_misspell_with_ratio_100_percent(self):
        for text in self.texts:
            with self.subTest(text=text):
                misspelled = misspell(text, ratio=1)

                self.assertEqual(len(text), len(misspelled))

                diff = _count_difference(text, misspelled)

                # The message now matches the assertion: a tolerance of
                # ±2 around len(text) with ratio=1.
                self.assertLessEqual(
                    np.abs(diff - len(text)),
                    2,
                    f"expect len(text)±2 misspells with ratio=1. (Δ={diff})",
                )