PyThaiNLP · wannaphong · Oct 9, 2021 · Oct 1, 2021 · Oct 2, 2021 · Oct 2, 2021
diff --git a/docs/api/tools.rst b/docs/api/tools.rst
@@ -10,3 +10,4 @@ Modules
 .. autofunction:: get_full_data_path
 .. autofunction:: get_pythainlp_data_path
 .. autofunction:: get_pythainlp_path
+.. autofunction:: misspell
diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py
@@ -4,6 +4,7 @@
     "get_full_data_path",
     "get_pythainlp_data_path",
     "get_pythainlp_path",
+    "misspell",
 ]
 
 from pythainlp.tools.path import (
@@ -12,3 +13,5 @@
     get_pythainlp_data_path,
     get_pythainlp_path,
 )
+
+from pythainlp.tools.misspell import misspell
diff --git a/pythainlp/tools/misspell.py b/pythainlp/tools/misspell.py
@@ -0,0 +1,141 @@
+import numpy as np
+from typing import List
+
+THAI_CHARACTERS_WITHOUT_SHIFT = [
+    "ผปแอิืทมใฝ",
+    "ฟหกดเ้่าสวง",
+    "ๆไำพะัีรนยบลฃ",
+    "ๅ/_ภถุึคตจขช",
+]
+
+THAI_CHARACTERS_WITH_SHIFT = [
+    "()ฉฮฺ์?ฒฬฦ",
+    "ฤฆฏโฌ็๋ษศซ.",
+    '๐"ฎฑธํ๊ณฯญฐ,',
+    "+๑๒๓๔ู฿๕๖๗๘๙",
+]
+
+ENGLISH_CHARACTERS_WITHOUT_SHIFT = [
+    "1234567890-=",
+    "qwertyuiop[]\\",
+    "asdfghjkl;'",
+    "zxcvbnm,./",
+]
+
+ENGLISH_CHARACTERS_WITH_SHIFT = [
+    "!@#$%^&*()_+",
+    "QWERTYUIOP{}|",
+    'ASDFGHJKL:"',
+    "ZXCVBNM<>?",
+]
+
+
+ALL_CHARACTERS = [
+    THAI_CHARACTERS_WITHOUT_SHIFT + THAI_CHARACTERS_WITH_SHIFT,
+    ENGLISH_CHARACTERS_WITHOUT_SHIFT + ENGLISH_CHARACTERS_WITH_SHIFT,
+]
+
+
+def search_location_of_character(char: str):
+    for language_ix in [0, 1]:
+        for ix, row in enumerate(ALL_CHARACTERS[language_ix]):
+            if char in row:
+                return (language_ix, ix // 4, ix % 4, row.index(char))
+
+
+def find_neighbour_locations(
+    loc: tuple,
+    char: str,
+    kernel: List = [(-1, -1), (-1, 0), (1, 1), (0, 1), (0, -1), (1, 0)],
+):
+    language_ix, is_shift, row, pos = loc
+
+    valid_neighbours = []
+    for kr, ks in kernel:
+        _row, _pos = row + kr, pos + ks
+        if 0 <= _row <= 3 and 0 <= _pos <= len(
+            ALL_CHARACTERS[language_ix][is_shift * 4 + _row]
+        ):
+            valid_neighbours.append((language_ix, is_shift, _row, _pos, char))
+
+    return valid_neighbours
+
+
+def find_misspell_candidates(char: str, verbose: bool = False):
+    loc = search_location_of_character(char)
+    if loc is None:
+        return None
+
+    valid_neighbours = find_neighbour_locations(loc, char)
+
+    chars = []
+    printing_locations = ["▐"] * 3 + [char] + ["▐"] * 3
+
+    for language_ix, is_shift, row, pos, char in valid_neighbours:
+        try:
+            char = ALL_CHARACTERS[language_ix][is_shift * 4 + row][pos]
+            chars.append(char)
+            kernel = (row - loc[1], pos - loc[2])
+
+            if kernel == (-1, -1):
+                ix = 5
+            elif kernel == (-1, 0):
+                ix = 6
+            elif kernel[0] == 0:
+                ix = 3 + kernel[1]
+            elif kernel == (1, 0):
+                ix = 0
+            elif kernel == (1, 1):
+                ix = 1
+            else:
+                continue
+            printing_locations[ix] = char
+        except IndexError as e:
+            continue
+        except Exception as e:
+            print("Something wrong with: ", char)
+            raise e
+
+    return chars
+
+
+def misspell(sentence: str, ratio: float = 0.05):
+    """
+    Simulate some mispellings for the input sentence.
+    The number of mispelled locations is governed by ratio.
+
+    :params str sentence: sentence to be mispelled
+    :params float ratio: number of misspells per 100 chars. Defaults to 0.5.
+
+    :return: sentence containing some misspelled
+    :rtype: str
+
+    :Example:
+        ::
+            from pythainlp.tools import misspell
+
+            sentence = "ภาษาไทยปรากฏครั้งแรกในพุทธศักราช 1826"
+
+            misspell(sent, ratio=0.1)
+            # output:
+            ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727
+    """
+    num_misspells = np.floor(len(sentence) * ratio).astype(int)
+    positions = np.random.choice(
+        len(sentence),
+        size=num_misspells,
+        replace=False
+    )
+
+    # convert strings to array of characters
+    misspelled = list(sentence)
+    for pos in positions:
+        potential_candidates = find_misspell_candidates(sentence[pos])
+        if potential_candidates is None:
+            continue
+
+        candidate = np.random.choice(potential_candidates)
+
+        misspelled[pos] = candidate
+
+    return "".join(misspelled)
diff --git a/tests/test_misspell.py b/tests/test_misspell.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+import numpy as np
+from pythainlp.tools import misspell
+
+
+def _count_difference(st1, st2):
+    # this assumes len(st1) == len(st2)
+
+    count = 0
+    for i in range(len(st1)):
+        if st1[i] != st2[i]:
+            count += 1
+
+    return count
+
+
+class TestTextMisspellPackage(unittest.TestCase):
+    def setUp(self):
+        self.texts = [
+            "เรารักคุณมากที่สุดในโลก",
+            "เราอยู่ที่มหาวิทยาลัยขอนแก่น"
+        ]
+
+    def test_misspell_naive(self):
+        for text in self.texts:
+            misspelled = misspell(text, ratio=0.1)
+
+            self.assertEqual(len(text), len(misspelled))
+
+            diff = _count_difference(text, misspelled)
+
+            self.assertGreater(diff, 0, "we have some misspells.")
+
+    def test_misspell_with_ratio_0_percent(self):
+        for text in self.texts:
+            misspelled = misspell(text, ratio=0.0)
+
+            self.assertEqual(len(text), len(misspelled))
+
+            diff = _count_difference(text, misspelled)
+
+            self.assertEqual(
+                diff, 0,
+                "we shouldn't have any  misspell with ratio=0."
+            )
+
+    def test_misspell_with_ratio_50_percent(self):
+        for text in self.texts:
+            misspelled = misspell(text, ratio=0.5)
+
+            self.assertEqual(len(text), len(misspelled))
+
+            diff = _count_difference(text, misspelled)
+
+            self.assertLessEqual(
+                np.abs(diff - 0.5 * len(text)),
+                2,
+                f"expect 0.5*len(text)±2 misspells with ratio=0.5. (Δ={diff})",
+            )
+
+    def test_misspell_with_ratio_100_percent(self):
+        for text in self.texts:
+            misspelled = misspell(text, ratio=1)
+
+            self.assertEqual(len(text), len(misspelled))
+
+            diff = _count_difference(text, misspelled)
+
+            self.assertLessEqual(
+                np.abs(diff - len(text)),
+                2,
+                f"expect len(text)-2 misspells with ratio=1.5. (Δ={diff})",
+            )