diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 286146f8d..e221e2b88 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -20,7 +20,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [3.6] + python-version: [3.7] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/macos-test.yml b/.github/workflows/macos-test.yml index 67648b72d..a2a483d8e 100644 --- a/.github/workflows/macos-test.yml +++ b/.github/workflows/macos-test.yml @@ -20,7 +20,7 @@ jobs: fail-fast: false matrix: os: [macos-latest] - python-version: [3.6] + python-version: [3.7] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index cc7e55cfe..008aadf8f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,7 +18,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6] + python-version: [3.7] steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index fa54ac344..c678514f3 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@

PyThaiNLP: Thai Natural Language Processing in Python

pypi - Python 3.6 + Python 3.7 License Download Build status diff --git a/README_TH.md b/README_TH.md index facc5d699..e54a32207 100644 --- a/README_TH.md +++ b/README_TH.md @@ -2,7 +2,7 @@

PyThaiNLP: Thai Natural Language Processing in Python

pypi - Python 3.6 + Python 3.7 License Download Build status diff --git a/docs/api/spell.rst b/docs/api/spell.rst index 398a334c9..cad3f7faf 100644 --- a/docs/api/spell.rst +++ b/docs/api/spell.rst @@ -8,7 +8,9 @@ Modules ------- .. autofunction:: correct +.. autofunction:: correct_sent .. autofunction:: spell +.. autofunction:: spell_sent .. autoclass:: NorvigSpellChecker :special-members: :members: diff --git a/docs/notes/installation.rst b/docs/notes/installation.rst index 3da277037..d48354d6e 100644 --- a/docs/notes/installation.rst +++ b/docs/notes/installation.rst @@ -26,6 +26,7 @@ where ``extras`` can be - ``wangchanberta`` (to support wangchanberta models) - ``mt5`` (to mt5 models for Thai text summarizer) - ``wordnet`` (to support wordnet) + - ``spell`` (to support phunspell & symspellpy) - ``full`` (install everything) For dependency details, look at `extras` variable in `setup.py `_. diff --git a/pythainlp/spell/__init__.py b/pythainlp/spell/__init__.py index 8c6f7ddee..fd1711786 100644 --- a/pythainlp/spell/__init__.py +++ b/pythainlp/spell/__init__.py @@ -8,10 +8,11 @@ "correct", "spell", "NorvigSpellChecker", + "spell_sent", + "correct_sent" ] from pythainlp.spell.pn import NorvigSpellChecker - DEFAULT_SPELL_CHECKER = NorvigSpellChecker() -from pythainlp.spell.core import correct, spell +from pythainlp.spell.core import correct, spell, correct_sent, spell_sent diff --git a/pythainlp/spell/core.py b/pythainlp/spell/core.py index 9c93d58c6..a749fa61c 100644 --- a/pythainlp/spell/core.py +++ b/pythainlp/spell/core.py @@ -3,6 +3,7 @@ Spell checking functions """ +import itertools from typing import List from pythainlp.spell import DEFAULT_SPELL_CHECKER @@ -19,6 +20,8 @@ def spell(word: str, engine: str = "pn") -> List[str]: :param str word: Word to spell check :param str engine: * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default) + * *phunspell* - A spell checker utilizing spylls a port of Hunspell. 
+ * *symspellpy* - symspellpy is a Python port of SymSpell v6.5. :return: list of possible correct words within 1 or 2 edit distance and sorted by frequency of word occurrences in the spelling dictionary @@ -49,8 +52,16 @@ def spell(word: str, engine: str = "pn") -> List[str]: spell("เหตการณ") # output: ['เหตุการณ์'] """ + if engine == "phunspell": + from pythainlp.spell.phunspell import spell as SPELL_CHECKER + text_correct = SPELL_CHECKER(word) + elif engine == "symspellpy": + from pythainlp.spell.symspellpy import spell as SPELL_CHECKER + text_correct = SPELL_CHECKER(word) + else: + text_correct = DEFAULT_SPELL_CHECKER.spell(word) - return DEFAULT_SPELL_CHECKER.spell(word) + return text_correct def correct(word: str, engine: str = "pn") -> str: @@ -60,7 +71,9 @@ def correct(word: str, engine: str = "pn") -> str: :param str word: word to correct spelling :param str engine: - * pn - Peter Norvig's algorithm [#norvig_spellchecker]_ (default) + * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default) + * *phunspell* - A spell checker utilizing spylls a port of Hunspell. + * *symspellpy* - symspellpy is a Python port of SymSpell v6.5. 
:return: the corrected word :rtype: str @@ -84,5 +97,73 @@ def correct(word: str, engine: str = "pn") -> str: correct("เหตการณ") # output: 'เหตุการณ์' """ + if engine == "phunspell": + from pythainlp.spell.phunspell import correct as SPELL_CHECKER + text_correct = SPELL_CHECKER(word) + elif engine == "symspellpy": + from pythainlp.spell.symspellpy import correct as SPELL_CHECKER + text_correct = SPELL_CHECKER(word) + else: + text_correct = DEFAULT_SPELL_CHECKER.correct(word) + + return text_correct + + +def spell_sent(list_words: List[str], engine: str = "pn") -> List[List[str]]: + """ + Provides a list of possible correct spelling of sentence + + :param List[str] list_words: list word of sentence + :param str engine: + * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default) + * *phunspell* - A spell checker utilizing spylls a port of Hunspell. + * *symspellpy* - symspellpy is a Python port of SymSpell v6.5. + :return: list of possible correct words + :rtype: List[List[str]] + + :Example: + :: + + from pythainlp.spell import spell_sent - return DEFAULT_SPELL_CHECKER.correct(word) + spell_sent(["เด็","อินอร์เน็ต","แรง"],engine='symspellpy') + # output: [['เด็ก', 'อินเทอร์เน็ต', 'แรง']] + """ + if engine == "symspellpy": + from pythainlp.spell.symspellpy import spell_sent as symspellpy_spell + list_new = symspellpy_spell(list_words) + else: + _temp = list( + itertools.product(*[spell(i, engine=engine) for i in list_words]) + ) + list_new = [] + for i in _temp: + _temp2 = [] + for j in i: + _temp2.append(j) + list_new.append(_temp2) + + return list_new + + +def correct_sent(list_words: List[str], engine: str = "pn") -> List[str]: + """ + Corrects the spelling of the given sentence by returning + + :param List[str] list_words: list word of sentence + :param str engine: + * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default) + * *phunspell* - A spell checker utilizing spylls a port of Hunspell. 
+        * *symspellpy* - symspellpy is a Python port of SymSpell v6.5.
+    :return: the corrected sentence as a list of words
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.spell import correct_sent
+
+        correct_sent(["เด็","อินอร์เน็ต","แรง"],engine='symspellpy')
+        # output: ['เด็ก', 'อินเทอร์เน็ต', 'แรง']
+    """
+    return spell_sent(list_words, engine=engine)[0]
diff --git a/pythainlp/spell/phunspell.py b/pythainlp/spell/phunspell.py
new file mode 100644
index 000000000..22f8bcc53
--- /dev/null
+++ b/pythainlp/spell/phunspell.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+"""
+Phunspell
+
+A pure Python spell checker utilizing spylls, a port of Hunspell.
+
+:See Also:
+    * \
+        https://github.com/dvwright/phunspell
+"""
+from typing import List
+import phunspell
+
+pspell = phunspell.Phunspell('th_TH')
+
+
+def spell(text: str) -> List[str]:
+    """
+    Return every spelling suggestion phunspell yields for *text*.
+
+    :param str text: word to spell check
+    :return: list of suggested words (possibly empty)
+    :rtype: List[str]
+    """
+    return list(pspell.suggest(text))
+
+
+def correct(text: str) -> str:
+    """
+    Return the first spelling suggestion for *text*.
+
+    :param str text: word to correct
+    :return: the first suggestion, or *text* unchanged if phunspell
+             suggests nothing
+    :rtype: str
+    """
+    # Take only the first item instead of materialising every suggestion;
+    # the default avoids an IndexError when there are no suggestions.
+    return next(iter(pspell.suggest(text)), text)
diff --git a/pythainlp/spell/symspellpy.py b/pythainlp/spell/symspellpy.py
new file mode 100644
index 000000000..3c9ec7137
--- /dev/null
+++ b/pythainlp/spell/symspellpy.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+"""
+symspellpy
+
+symspellpy is a Python port of SymSpell v6.5.
+We used unigram & bigram from Thai National Corpus (TNC).
+
+:See Also:
+    * \
+        https://github.com/mammothb/symspellpy
+"""
+from typing import List
+from symspellpy import SymSpell, Verbosity
+from pythainlp.corpus import get_corpus_path
+from pythainlp.corpus import path_pythainlp_corpus
+from pythainlp.tokenize import word_tokenize
+
+_UNIGRAM = "tnc_freq.txt"
+_BIGRAM = "tnc_bigram_word_freqs"
+
+sym_spell = SymSpell()
+sym_spell.load_dictionary(
+    path_pythainlp_corpus(_UNIGRAM),
+    0,
+    1,
+    separator='\t',
+    encoding="utf-8-sig"
+)
+sym_spell.load_bigram_dictionary(
+    get_corpus_path(_BIGRAM),
+    0,
+    2,
+    separator='\t',
+    encoding="utf-8-sig"
+)
+
+
+def spell(text: str, max_edit_distance: int = 2) -> List[str]:
+    """
+    Return the closest dictionary words for *text*.
+
+    :param str text: word to spell check
+    :param int max_edit_distance: maximum edit distance of a suggestion
+    :return: list of suggested words (possibly empty)
+    :rtype: List[str]
+    """
+    suggestions = sym_spell.lookup(
+        text,
+        Verbosity.CLOSEST,
+        max_edit_distance=max_edit_distance
+    )
+    # Read the word from SuggestItem.term rather than parsing str(item),
+    # which breaks on a comma inside the word.
+    return [item.term for item in suggestions]
+
+
+def correct(text: str, max_edit_distance: int = 1) -> str:
+    """
+    Return the best suggestion for *text*.
+
+    :param str text: word to correct
+    :param int max_edit_distance: maximum edit distance of a suggestion
+    :return: the first suggestion, or *text* unchanged if there is none
+    :rtype: str
+    """
+    suggestions = spell(text, max_edit_distance=max_edit_distance)
+    # Fall back to the input instead of raising IndexError on no match.
+    return suggestions[0] if suggestions else text
+
+
+def spell_sent(
+    list_words: List[str], max_edit_distance: int = 2
+) -> List[List[str]]:
+    """
+    Return candidate corrections of a whole sentence.
+
+    :param List[str] list_words: list of the words in the sentence
+    :param int max_edit_distance: maximum edit distance of a suggestion
+    :return: list of candidate sentences, each one a list of words
+    :rtype: List[List[str]]
+    """
+    suggestions = sym_spell.lookup_compound(
+        ' '.join(list_words),
+        split_phrase_by_space=True,
+        max_edit_distance=max_edit_distance
+    )
+    return [item.term.split(' ') for item in suggestions]
+
+
+def correct_sent(list_words: List[str], max_edit_distance: int = 1) -> List[str]:
+    """
+    Return the best correction of a whole sentence.
+
+    :param List[str] list_words: list of the words in the sentence
+    :param int max_edit_distance: maximum edit distance of a suggestion
+    :return: the first candidate sentence as a list of words, or
+             *list_words* unchanged if there is no candidate
+    :rtype: List[str]
+    """
+    candidates = spell_sent(list_words, max_edit_distance=max_edit_distance)
+    # A candidate is a whole sentence, so return the first candidate itself,
+    # not the first word of each candidate.
+    return candidates[0] if candidates else list_words
diff --git a/setup.py b/setup.py
index 2d2e98302..f68bd2206 100644
--- a/setup.py
+++ b/setup.py
@@ -60,6 +60,7 @@
     "mt5": ["transformers>=4.6.0", "sentencepiece>=0.1.91"],
     "wordnet": ["nltk>=3.3.*"],
     "sefr_cut": ["sefr_cut"],
+    "spell": ["phunspell", "spylls", "symspellpy"],
     "full": [
         "PyYAML>=5.3.1",
         "attacut>=1.0.4",
@@ -76,7 +77,10 @@
         "ssg>=0.0.6",
         "torch>=1.0.0",
         "transformers>=4.6.0",
-        "sefr_cut"
+        "sefr_cut",
+        "phunspell",
+        "spylls",
+        "symspellpy"
     ],
 }
 
@@ -91,7 +95,7 @@
     url="https://github.com/PyThaiNLP/pythainlp",
packages=find_packages(exclude=["tests", "tests.*"]), test_suite="tests", - python_requires=">=3.6", + python_requires=">=3.7", package_data={ "pythainlp": [ "corpus/*", diff --git a/tests/test_spell.py b/tests/test_spell.py index af3f231d9..2183f5594 100644 --- a/tests/test_spell.py +++ b/tests/test_spell.py @@ -2,7 +2,13 @@ import unittest -from pythainlp.spell import NorvigSpellChecker, correct, spell +from pythainlp.spell import ( + NorvigSpellChecker, + correct, + spell, + spell_sent, + correct_sent +) class TestSpellPackage(unittest.TestCase): @@ -18,6 +24,22 @@ def test_spell(self): self.assertIsInstance(result, list) self.assertGreater(len(result), 0) + result = spell("เน้ร", engine="phunspell") + self.assertIsInstance(result, list) + self.assertGreater(len(result), 0) + + result = spell("เกสมร์", engine="phunspell") + self.assertIsInstance(result, list) + self.assertGreater(len(result), 0) + + result = spell("เน้ร", engine="symspellpy") + self.assertIsInstance(result, list) + self.assertGreater(len(result), 0) + + result = spell("เกสมร์", engine="symspellpy") + self.assertIsInstance(result, list) + self.assertGreater(len(result), 0) + def test_word_correct(self): self.assertEqual(correct(None), "") self.assertEqual(correct(""), "") @@ -30,6 +52,14 @@ def test_word_correct(self): self.assertIsInstance(result, str) self.assertNotEqual(result, "") + result = correct("ทดสอง", engine="phunspell") + self.assertIsInstance(result, str) + self.assertNotEqual(result, "") + + result = correct("ทดสอง", engine="symspellpy") + self.assertIsInstance(result, str) + self.assertNotEqual(result, "") + def test_norvig_spell_checker(self): checker = NorvigSpellChecker(dict_filter=None) self.assertTrue(len(checker.dictionary()) > 0) @@ -77,3 +107,19 @@ def test_norvig_spell_checker(self): user_dict = [24, 6, 2475] with self.assertRaises(TypeError): checker = NorvigSpellChecker(custom_dict=user_dict) + + def test_spell_sent(self): + self.spell_sent = ["เด็", "อินอร์เน็ต", "แรง"] 
+ self.assertIsNotNone(spell_sent(self.spell_sent)) + self.assertIsNotNone(spell_sent(self.spell_sent, engine="pn")) + self.assertIsNotNone(spell_sent(self.spell_sent, engine="phunspell")) + self.assertIsNotNone(spell_sent(self.spell_sent, engine="symspellpy")) + + def test_correct_sent(self): + self.spell_sent = ["เด็", "อินอร์เน็ต", "แรง"] + self.assertIsNotNone(correct_sent(self.spell_sent)) + self.assertIsNotNone(correct_sent(self.spell_sent, engine="pn")) + self.assertIsNotNone(correct_sent(self.spell_sent, engine="phunspell")) + self.assertIsNotNone( + correct_sent(self.spell_sent, engine="symspellpy") + )