diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 286146f8d..e221e2b88 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
- python-version: [3.6]
+ python-version: [3.7]
steps:
- uses: actions/checkout@v2
diff --git a/.github/workflows/macos-test.yml b/.github/workflows/macos-test.yml
index 67648b72d..a2a483d8e 100644
--- a/.github/workflows/macos-test.yml
+++ b/.github/workflows/macos-test.yml
@@ -20,7 +20,7 @@ jobs:
fail-fast: false
matrix:
os: [macos-latest]
- python-version: [3.6]
+ python-version: [3.7]
steps:
- uses: actions/checkout@v2
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index cc7e55cfe..008aadf8f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: [3.6]
+ python-version: [3.7]
steps:
- uses: actions/checkout@v2
diff --git a/README.md b/README.md
index fa54ac344..c678514f3 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
PyThaiNLP: Thai Natural Language Processing in Python
-
+
diff --git a/README_TH.md b/README_TH.md
index facc5d699..e54a32207 100644
--- a/README_TH.md
+++ b/README_TH.md
@@ -2,7 +2,7 @@
PyThaiNLP: Thai Natural Language Processing in Python
-
+
diff --git a/docs/api/spell.rst b/docs/api/spell.rst
index 398a334c9..cad3f7faf 100644
--- a/docs/api/spell.rst
+++ b/docs/api/spell.rst
@@ -8,7 +8,9 @@ Modules
-------
.. autofunction:: correct
+.. autofunction:: correct_sent
.. autofunction:: spell
+.. autofunction:: spell_sent
.. autoclass:: NorvigSpellChecker
:special-members:
:members:
diff --git a/docs/notes/installation.rst b/docs/notes/installation.rst
index 3da277037..d48354d6e 100644
--- a/docs/notes/installation.rst
+++ b/docs/notes/installation.rst
@@ -26,6 +26,7 @@ where ``extras`` can be
- ``wangchanberta`` (to support wangchanberta models)
- ``mt5`` (to mt5 models for Thai text summarizer)
- ``wordnet`` (to support wordnet)
+ - ``spell`` (to support phunspell & symspellpy)
- ``full`` (install everything)
For dependency details, look at `extras` variable in `setup.py `_.
diff --git a/pythainlp/spell/__init__.py b/pythainlp/spell/__init__.py
index 8c6f7ddee..fd1711786 100644
--- a/pythainlp/spell/__init__.py
+++ b/pythainlp/spell/__init__.py
@@ -8,10 +8,11 @@
"correct",
"spell",
"NorvigSpellChecker",
+ "spell_sent",
+ "correct_sent"
]
from pythainlp.spell.pn import NorvigSpellChecker
-
DEFAULT_SPELL_CHECKER = NorvigSpellChecker()
-from pythainlp.spell.core import correct, spell
+from pythainlp.spell.core import correct, spell, correct_sent, spell_sent
diff --git a/pythainlp/spell/core.py b/pythainlp/spell/core.py
index 9c93d58c6..a749fa61c 100644
--- a/pythainlp/spell/core.py
+++ b/pythainlp/spell/core.py
@@ -3,6 +3,7 @@
Spell checking functions
"""
+import itertools
from typing import List
from pythainlp.spell import DEFAULT_SPELL_CHECKER
@@ -19,6 +20,8 @@ def spell(word: str, engine: str = "pn") -> List[str]:
:param str word: Word to spell check
:param str engine:
* *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
+ * *phunspell* - A spell checker utilizing spylls, a port of Hunspell.
+ * *symspellpy* - symspellpy is a Python port of SymSpell v6.5.
:return: list of possible correct words within 1 or 2 edit distance and
sorted by frequency of word occurrences in the spelling dictionary
@@ -49,8 +52,16 @@ def spell(word: str, engine: str = "pn") -> List[str]:
spell("เหตการณ")
# output: ['เหตุการณ์']
"""
+ if engine == "phunspell":
+ from pythainlp.spell.phunspell import spell as SPELL_CHECKER
+ text_correct = SPELL_CHECKER(word)
+ elif engine == "symspellpy":
+ from pythainlp.spell.symspellpy import spell as SPELL_CHECKER
+ text_correct = SPELL_CHECKER(word)
+ else:
+ text_correct = DEFAULT_SPELL_CHECKER.spell(word)
- return DEFAULT_SPELL_CHECKER.spell(word)
+ return text_correct
def correct(word: str, engine: str = "pn") -> str:
@@ -60,7 +71,9 @@ def correct(word: str, engine: str = "pn") -> str:
:param str word: word to correct spelling
:param str engine:
- * pn - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
+ * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
+ * *phunspell* - A spell checker utilizing spylls, a port of Hunspell.
+ * *symspellpy* - symspellpy is a Python port of SymSpell v6.5.
:return: the corrected word
:rtype: str
@@ -84,5 +97,73 @@ def correct(word: str, engine: str = "pn") -> str:
correct("เหตการณ")
# output: 'เหตุการณ์'
"""
+ if engine == "phunspell":
+ from pythainlp.spell.phunspell import correct as SPELL_CHECKER
+ text_correct = SPELL_CHECKER(word)
+ elif engine == "symspellpy":
+ from pythainlp.spell.symspellpy import correct as SPELL_CHECKER
+ text_correct = SPELL_CHECKER(word)
+ else:
+ text_correct = DEFAULT_SPELL_CHECKER.correct(word)
+
+ return text_correct
+
+
+def spell_sent(list_words: List[str], engine: str = "pn") -> List[List[str]]:
+    """
+    Provides a list of possible correct spellings of a sentence
+
+    For the *symspellpy* engine the whole sentence is handed to that
+    engine's own sentence-level checker. For every other engine each word
+    is checked on its own and the result is every combination (cartesian
+    product) of the per-word candidates. A word for which the engine
+    returns no candidate is kept unchanged, so that it does not empty the
+    whole result.
+
+    :param List[str] list_words: list of the words of the sentence
+    :param str engine:
+        * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
+        * *phunspell* - A spell checker utilizing spylls, a port of Hunspell.
+        * *symspellpy* - symspellpy is a Python port of SymSpell v6.5.
+    :return: list of possible correct sentences, each one a list of words
+    :rtype: List[List[str]]
+
+    :Example:
+    ::
+
+        from pythainlp.spell import spell_sent
- return DEFAULT_SPELL_CHECKER.correct(word)
+        spell_sent(["เด็","อินอร์เน็ต","แรง"],engine='symspellpy')
+        # output: [['เด็ก', 'อินเทอร์เน็ต', 'แรง']]
+    """
+    if engine == "symspellpy":
+        from pythainlp.spell.symspellpy import spell_sent as symspellpy_spell
+        return symspellpy_spell(list_words)
+
+    # An empty candidate list would make the product empty as well, so
+    # fall back to the original word in that case.
+    candidates = [spell(word, engine=engine) or [word] for word in list_words]
+    return [list(combo) for combo in itertools.product(*candidates)]
+
+
+def correct_sent(list_words: List[str], engine: str = "pn") -> List[str]:
+    """
+    Corrects the spelling of the given sentence by returning
+    the first candidate sentence found by :func:`spell_sent`
+
+    If no candidate sentence is found, the words are returned unchanged.
+
+    :param List[str] list_words: list of the words of the sentence
+    :param str engine:
+        * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
+        * *phunspell* - A spell checker utilizing spylls, a port of Hunspell.
+        * *symspellpy* - symspellpy is a Python port of SymSpell v6.5.
+    :return: the corrected sentence, as a list of words
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.spell import correct_sent
+
+        correct_sent(["เด็","อินอร์เน็ต","แรง"],engine='symspellpy')
+        # output: ['เด็ก', 'อินเทอร์เน็ต', 'แรง']
+    """
+    candidates = spell_sent(list_words, engine=engine)
+    # Guard against an empty result instead of raising IndexError on [0].
+    return candidates[0] if candidates else list(list_words)
diff --git a/pythainlp/spell/phunspell.py b/pythainlp/spell/phunspell.py
new file mode 100644
index 000000000..22f8bcc53
--- /dev/null
+++ b/pythainlp/spell/phunspell.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+"""
+Phunspell
+
+A pure Python spell checker utilizing spylls, a port of Hunspell.
+
+:See Also:
+ * \
+ https://github.com/dvwright/phunspell
+"""
+from typing import List
+import phunspell
+
+# Module-level Phunspell instance for ``th_TH``; it is created once, when
+# this module is imported, and shared by spell() and correct() below.
+pspell = phunspell.Phunspell('th_TH')
+
+
+def spell(text: str) -> List[str]:
+    """
+    Provides a list of possible correct spellings of the given word
+
+    :param str text: word to spell check
+    :return: every suggestion produced by ``pspell.suggest``, in order
+    :rtype: List[str]
+    """
+    suggestions = pspell.suggest(text)
+    return [*suggestions]
+
+
+def correct(text: str) -> str:
+    """
+    Corrects the spelling of the given word
+
+    :param str text: word to correct
+    :return: the first suggestion from ``pspell.suggest``, or the word
+        itself when there is no suggestion
+    :rtype: str
+    """
+    # Only the first suggestion is needed; the default also avoids the
+    # IndexError that indexing an empty suggestion list would raise.
+    return next(iter(pspell.suggest(text)), text)
diff --git a/pythainlp/spell/symspellpy.py b/pythainlp/spell/symspellpy.py
new file mode 100644
index 000000000..3c9ec7137
--- /dev/null
+++ b/pythainlp/spell/symspellpy.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+"""
+symspellpy
+
+symspellpy is a Python port of SymSpell v6.5.
+We used unigram & bigram from Thai National Corpus (TNC).
+
+:See Also:
+ * \
+ https://github.com/mammothb/symspellpy
+"""
+from typing import List
+from symspellpy import SymSpell, Verbosity
+from pythainlp.corpus import get_corpus_path
+from pythainlp.corpus import path_pythainlp_corpus
+from pythainlp.tokenize import word_tokenize
+
+# Names of the unigram and bigram frequency data (see the module
+# docstring); they are resolved to file paths in the calls below.
+_UNIGRAM = "tnc_freq.txt"
+_BIGRAM = "tnc_bigram_word_freqs"
+
+# A single SymSpell instance; both dictionaries are loaded once, when this
+# module is imported.
+sym_spell = SymSpell()
+# The two integers after the path are the term and count column indexes
+# (symspellpy ``load_dictionary(corpus, term_index, count_index, ...)``).
+sym_spell.load_dictionary(
+ path_pythainlp_corpus(_UNIGRAM),
+ 0,
+ 1,
+ separator='\t',
+ encoding="utf-8-sig"
+)
+# Same argument layout for the bigram data; here the count index is 2.
+sym_spell.load_bigram_dictionary(
+ get_corpus_path(_BIGRAM),
+ 0,
+ 2,
+ separator='\t',
+ encoding="utf-8-sig"
+)
+
+
+def spell(text: str, max_edit_distance: int = 2) -> List[str]:
+    """
+    Provides a list of possible correct spellings of the given word
+
+    :param str text: word to spell check
+    :param int max_edit_distance: maximum edit distance passed to the lookup
+    :return: terms of the suggestions returned by ``sym_spell.lookup``
+        with ``Verbosity.CLOSEST``
+    :rtype: List[str]
+    """
+    suggestions = sym_spell.lookup(
+        text,
+        Verbosity.CLOSEST,
+        max_edit_distance=max_edit_distance
+    )
+    # Read the term directly rather than cutting it out of str(item),
+    # which would truncate any term that itself contains a comma.
+    return [item.term for item in suggestions]
+
+
+def correct(text: str, max_edit_distance: int = 1) -> str:
+    """
+    Corrects the spelling of the given word
+
+    :param str text: word to correct
+    :param int max_edit_distance: maximum edit distance passed to ``spell``
+    :return: the first candidate returned by ``spell``, or the word itself
+        when there is no candidate
+    :rtype: str
+    """
+    candidates = spell(text, max_edit_distance=max_edit_distance)
+    # Guard against an empty candidate list instead of raising IndexError.
+    return candidates[0] if candidates else text
+
+
+def spell_sent(
+    list_words: List[str], max_edit_distance: int = 2
+) -> List[List[str]]:
+    """
+    Provides a list of possible correct spellings of a sentence
+
+    The words are joined with single spaces, checked as one phrase with
+    ``sym_spell.lookup_compound`` and each suggestion is split on spaces
+    again.
+
+    :param List[str] list_words: list of the words of the sentence
+    :param int max_edit_distance: maximum edit distance passed to the lookup
+    :return: list of suggested sentences, each one a list of words
+    :rtype: List[List[str]]
+    """
+    suggestions = sym_spell.lookup_compound(
+        ' '.join(list_words),
+        split_phrase_by_space=True,
+        max_edit_distance=max_edit_distance
+    )
+    # Use the suggestion's term directly instead of parsing str(item).
+    return [item.term.split(' ') for item in suggestions]
+
+
+def correct_sent(list_words: List[str], max_edit_distance=1) -> List[str]:
+    """
+    Corrects the spelling of the given sentence
+
+    :param List[str] list_words: list of the words of the sentence
+    :param int max_edit_distance: maximum edit distance passed to
+        ``spell_sent``
+    :return: the first sentence suggested by ``spell_sent``, or the words
+        unchanged when nothing is suggested
+    :rtype: List[str]
+    """
+    candidates = spell_sent(list_words, max_edit_distance=max_edit_distance)
+    # Each candidate is a whole sentence, so the result is the first
+    # candidate itself, not the first word of every candidate.
+    return candidates[0] if candidates else list(list_words)
diff --git a/setup.py b/setup.py
index 2d2e98302..f68bd2206 100644
--- a/setup.py
+++ b/setup.py
@@ -60,6 +60,7 @@
"mt5": ["transformers>=4.6.0", "sentencepiece>=0.1.91"],
"wordnet": ["nltk>=3.3.*"],
"sefr_cut": ["sefr_cut"],
+ "spell": ["phunspell", "spylls", "symspellpy"],
"full": [
"PyYAML>=5.3.1",
"attacut>=1.0.4",
@@ -76,7 +77,10 @@
"ssg>=0.0.6",
"torch>=1.0.0",
"transformers>=4.6.0",
- "sefr_cut"
+ "sefr_cut",
+ "phunspell",
+ "spylls",
+ "symspellpy"
],
}
@@ -91,7 +95,7 @@
url="https://github.com/PyThaiNLP/pythainlp",
packages=find_packages(exclude=["tests", "tests.*"]),
test_suite="tests",
- python_requires=">=3.6",
+ python_requires=">=3.7",
package_data={
"pythainlp": [
"corpus/*",
diff --git a/tests/test_spell.py b/tests/test_spell.py
index af3f231d9..2183f5594 100644
--- a/tests/test_spell.py
+++ b/tests/test_spell.py
@@ -2,7 +2,13 @@
import unittest
-from pythainlp.spell import NorvigSpellChecker, correct, spell
+from pythainlp.spell import (
+ NorvigSpellChecker,
+ correct,
+ spell,
+ spell_sent,
+ correct_sent
+)
class TestSpellPackage(unittest.TestCase):
@@ -18,6 +24,22 @@ def test_spell(self):
self.assertIsInstance(result, list)
self.assertGreater(len(result), 0)
+ result = spell("เน้ร", engine="phunspell")
+ self.assertIsInstance(result, list)
+ self.assertGreater(len(result), 0)
+
+ result = spell("เกสมร์", engine="phunspell")
+ self.assertIsInstance(result, list)
+ self.assertGreater(len(result), 0)
+
+ result = spell("เน้ร", engine="symspellpy")
+ self.assertIsInstance(result, list)
+ self.assertGreater(len(result), 0)
+
+ result = spell("เกสมร์", engine="symspellpy")
+ self.assertIsInstance(result, list)
+ self.assertGreater(len(result), 0)
+
def test_word_correct(self):
self.assertEqual(correct(None), "")
self.assertEqual(correct(""), "")
@@ -30,6 +52,14 @@ def test_word_correct(self):
self.assertIsInstance(result, str)
self.assertNotEqual(result, "")
+ result = correct("ทดสอง", engine="phunspell")
+ self.assertIsInstance(result, str)
+ self.assertNotEqual(result, "")
+
+ result = correct("ทดสอง", engine="symspellpy")
+ self.assertIsInstance(result, str)
+ self.assertNotEqual(result, "")
+
def test_norvig_spell_checker(self):
checker = NorvigSpellChecker(dict_filter=None)
self.assertTrue(len(checker.dictionary()) > 0)
@@ -77,3 +107,19 @@ def test_norvig_spell_checker(self):
user_dict = [24, 6, 2475]
with self.assertRaises(TypeError):
checker = NorvigSpellChecker(custom_dict=user_dict)
+
+    def test_spell_sent(self):
+        # spell_sent must return a value with the default engine and with
+        # each engine named explicitly.
+        self.spell_sent = ["เด็", "อินอร์เน็ต", "แรง"]
+        self.assertIsNotNone(spell_sent(self.spell_sent))
+        for engine in ("pn", "phunspell", "symspellpy"):
+            self.assertIsNotNone(spell_sent(self.spell_sent, engine=engine))
+
+    def test_correct_sent(self):
+        # correct_sent must return a value with the default engine and with
+        # each engine named explicitly.
+        self.spell_sent = ["เด็", "อินอร์เน็ต", "แรง"]
+        self.assertIsNotNone(correct_sent(self.spell_sent))
+        for engine in ("pn", "phunspell", "symspellpy"):
+            self.assertIsNotNone(
+                correct_sent(self.spell_sent, engine=engine)
+            )