Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: [3.6]
python-version: [3.7]

steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/macos-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
fail-fast: false
matrix:
os: [macos-latest]
python-version: [3.6]
python-version: [3.7]

steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6]
python-version: [3.7]

steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<img src="https://avatars0.githubusercontent.com/u/32934255?s=200&v=4"/>
<h1>PyThaiNLP: Thai Natural Language Processing in Python</h1>
<a href="https://pypi.python.org/pypi/pythainlp"><img alt="pypi" src="https://img.shields.io/pypi/v/pythainlp.svg"/></a>
<a href="https://www.python.org/downloads/release/python-360/"><img alt="Python 3.6" src="https://img.shields.io/badge/python-3.6-blue.svg"/></a>
<a href="https://www.python.org/downloads/release/python-370/"><img alt="Python 3.7" src="https://img.shields.io/badge/python-3.7-blue.svg"/></a>
<a href="https://opensource.org/licenses/Apache-2.0"><img alt="License" src="https://img.shields.io/badge/License-Apache%202.0-blue.svg"/></a>
<a href="https://pepy.tech/project/pythainlp"><img alt="Download" src="https://pepy.tech/badge/pythainlp/month"/></a>
<a href="https://ci.appveyor.com/project/wannaphongcom/pythainlp-9y1ch"><img alt="Build status" src="https://ci.appveyor.com/api/projects/status/9g3mfcwchi8em40x?svg=true"/></a>
Expand Down
2 changes: 1 addition & 1 deletion README_TH.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<img src="https://avatars0.githubusercontent.com/u/32934255?s=200&v=4"/>
<h1>PyThaiNLP: Thai Natural Language Processing in Python</h1>
<a href="https://pypi.python.org/pypi/pythainlp"><img alt="pypi" src="https://img.shields.io/pypi/v/pythainlp.svg"/></a>
<a href="https://www.python.org/downloads/release/python-360/"><img alt="Python 3.6" src="https://img.shields.io/badge/python-3.6-blue.svg"/></a>
<a href="https://www.python.org/downloads/release/python-370/"><img alt="Python 3.7" src="https://img.shields.io/badge/python-3.7-blue.svg"/></a>
<a href="https://opensource.org/licenses/Apache-2.0"><img alt="License" src="https://img.shields.io/badge/License-Apache%202.0-blue.svg"/></a>
<a href="https://pepy.tech/project/pythainlp"><img alt="Download" src="https://pepy.tech/badge/pythainlp/month"/></a>
<a href="https://ci.appveyor.com/project/wannaphongcom/pythainlp-9y1ch"><img alt="Build status" src="https://ci.appveyor.com/api/projects/status/9g3mfcwchi8em40x?svg=true"/></a>
Expand Down
2 changes: 2 additions & 0 deletions docs/api/spell.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ Modules
-------

.. autofunction:: correct
.. autofunction:: correct_sent
.. autofunction:: spell
.. autofunction:: spell_sent
.. autoclass:: NorvigSpellChecker
:special-members:
:members:
Expand Down
1 change: 1 addition & 0 deletions docs/notes/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ where ``extras`` can be
- ``wangchanberta`` (to support wangchanberta models)
- ``mt5`` (to support mt5 models for Thai text summarizer)
- ``wordnet`` (to support wordnet)
- ``spell`` (to support phunspell & symspellpy)
- ``full`` (install everything)

For dependency details, look at `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.
Expand Down
5 changes: 3 additions & 2 deletions pythainlp/spell/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
"correct",
"spell",
"NorvigSpellChecker",
"spell_sent",
"correct_sent"
]

from pythainlp.spell.pn import NorvigSpellChecker

DEFAULT_SPELL_CHECKER = NorvigSpellChecker()

from pythainlp.spell.core import correct, spell
from pythainlp.spell.core import correct, spell, correct_sent, spell_sent
87 changes: 84 additions & 3 deletions pythainlp/spell/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Spell checking functions
"""

import itertools
from typing import List

from pythainlp.spell import DEFAULT_SPELL_CHECKER
Expand All @@ -19,6 +20,8 @@ def spell(word: str, engine: str = "pn") -> List[str]:
:param str word: Word to spell check
:param str engine:
* *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
* *phunspell* - A spell checker utilizing spylls a port of Hunspell.
* *symspellpy* - symspellpy is a Python port of SymSpell v6.5.

:return: list of possible correct words within 1 or 2 edit distance and
sorted by frequency of word occurrences in the spelling dictionary
Expand Down Expand Up @@ -49,8 +52,16 @@ def spell(word: str, engine: str = "pn") -> List[str]:
spell("เหตการณ")
# output: ['เหตุการณ์']
"""
if engine == "phunspell":
from pythainlp.spell.phunspell import spell as SPELL_CHECKER
text_correct = SPELL_CHECKER(word)
elif engine == "symspellpy":
from pythainlp.spell.symspellpy import spell as SPELL_CHECKER
text_correct = SPELL_CHECKER(word)
else:
text_correct = DEFAULT_SPELL_CHECKER.spell(word)

return DEFAULT_SPELL_CHECKER.spell(word)
return text_correct


def correct(word: str, engine: str = "pn") -> str:
Expand All @@ -60,7 +71,9 @@ def correct(word: str, engine: str = "pn") -> str:

:param str word: word to correct spelling
:param str engine:
* pn - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
* *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
* *phunspell* - A spell checker utilizing spylls a port of Hunspell.
* *symspellpy* - symspellpy is a Python port of SymSpell v6.5.
:return: the corrected word
:rtype: str

Expand All @@ -84,5 +97,73 @@ def correct(word: str, engine: str = "pn") -> str:
correct("เหตการณ")
# output: 'เหตุการณ์'
"""
if engine == "phunspell":
from pythainlp.spell.phunspell import correct as SPELL_CHECKER
text_correct = SPELL_CHECKER(word)
elif engine == "symspellpy":
from pythainlp.spell.symspellpy import correct as SPELL_CHECKER
text_correct = SPELL_CHECKER(word)
else:
text_correct = DEFAULT_SPELL_CHECKER.correct(word)

return text_correct


def spell_sent(list_words: List[str], engine: str = "pn") -> List[List[str]]:
    """
    Provides a list of possible correct spellings of a sentence.

    :param List[str] list_words: the sentence, as a list of words
    :param str engine:
        * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
        * *phunspell* - A spell checker utilizing spylls, a port of Hunspell.
        * *symspellpy* - symspellpy is a Python port of SymSpell v6.5.
    :return: list of candidate sentences, each one a list of words
    :rtype: List[List[str]]

    :Example:
    ::

        from pythainlp.spell import spell_sent

        spell_sent(["เด็","อินอร์เน็ต","แรง"],engine='symspellpy')
        # output: [['เด็ก', 'อินเทอร์เน็ต', 'แรง']]
    """
    if engine == "symspellpy":
        # symspellpy has its own sentence-level lookup; imported lazily so
        # the optional dependency is only needed when this engine is chosen.
        from pythainlp.spell.symspellpy import spell_sent as symspellpy_spell

        return symspellpy_spell(list_words)

    # Every other engine is word-level: collect the candidates for each word
    # and combine them. A word with no suggestion keeps its original form,
    # otherwise a single empty candidate list would empty the whole product.
    per_word = [spell(word, engine=engine) or [word] for word in list_words]
    return [list(combo) for combo in itertools.product(*per_word)]


def correct_sent(list_words: List[str], engine: str = "pn") -> List[str]:
    """
    Corrects the spelling of the given sentence by returning
    the first candidate sentence produced by :func:`spell_sent`.

    :param List[str] list_words: the sentence, as a list of words
    :param str engine:
        * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
        * *phunspell* - A spell checker utilizing spylls, a port of Hunspell.
        * *symspellpy* - symspellpy is a Python port of SymSpell v6.5.
    :return: the corrected sentence as a list of words; a copy of
             ``list_words`` when no candidate sentence is available
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.spell import correct_sent

        correct_sent(["เด็","อินอร์เน็ต","แรง"],engine='symspellpy')
        # output: ['เด็ก', 'อินเทอร์เน็ต', 'แรง']
    """
    candidates = spell_sent(list_words, engine=engine)
    # Guard the [0]: an empty candidate list used to raise IndexError.
    return candidates[0] if candidates else list(list_words)
22 changes: 22 additions & 0 deletions pythainlp/spell/phunspell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
"""
Phunspell

A pure Python spell checker utilizing spylls, a port of Hunspell.

:See Also:
* \
https://github.com/dvwright/phunspell
"""
from typing import List
import phunspell

# Module-level Phunspell instance for the 'th_TH' dictionary; created when
# this module is imported and shared by spell() and correct() below.
pspell = phunspell.Phunspell('th_TH')


def spell(text: str) -> List[str]:
    """
    Collect every suggestion phunspell offers for ``text``.

    :param str text: word to spell check
    :return: the suggestions from ``pspell.suggest``, in the order produced
    :rtype: List[str]
    """
    suggestions = pspell.suggest(text)
    return [*suggestions]


def correct(text: str) -> str:
    """
    Return the first suggestion phunspell offers for ``text``.

    :param str text: word to correct
    :return: the first suggestion from ``pspell.suggest``, or ``text``
             unchanged when there is no suggestion
    :rtype: str
    """
    # next() takes only the first suggestion instead of building the whole
    # list, and its default avoids the IndexError an empty result used to
    # raise.
    return next(iter(pspell.suggest(text)), text)
73 changes: 73 additions & 0 deletions pythainlp/spell/symspellpy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
"""
symspellpy

symspellpy is a Python port of SymSpell v6.5.
We used unigram & bigram from Thai National Corpus (TNC).

:See Also:
* \
https://github.com/mammothb/symspellpy
"""
from typing import List
from symspellpy import SymSpell, Verbosity
from pythainlp.corpus import get_corpus_path
from pythainlp.corpus import path_pythainlp_corpus
from pythainlp.tokenize import word_tokenize

# Corpus names for the frequency data; the module docstring attributes the
# unigram and bigram data to the Thai National Corpus (TNC).
_UNIGRAM = "tnc_freq.txt"
_BIGRAM = "tnc_bigram_word_freqs"

# Shared SymSpell instance, created and populated when this module is
# imported; every function below queries it.
sym_spell = SymSpell()
# Unigram file, located with path_pythainlp_corpus; tab-separated and read
# as utf-8-sig. NOTE(review): the positional 0 and 1 are presumably
# symspellpy's term_index and count_index - confirm against its API.
sym_spell.load_dictionary(
path_pythainlp_corpus(_UNIGRAM),
0,
1,
separator='\t',
encoding="utf-8-sig"
)
# Bigram file, located with get_corpus_path; same separator and encoding.
# NOTE(review): positional 0 and 2 are presumably term_index and
# count_index for the bigram format - confirm against the symspellpy API.
sym_spell.load_bigram_dictionary(
get_corpus_path(_BIGRAM),
0,
2,
separator='\t',
encoding="utf-8-sig"
)


def spell(text: str, max_edit_distance: int = 2) -> List[str]:
    """
    Look up spelling suggestions for a single word.

    :param str text: word to spell check
    :param int max_edit_distance: maximum edit distance passed to
                                  ``sym_spell.lookup``
    :return: the suggested terms from a ``Verbosity.CLOSEST`` lookup
    :rtype: List[str]
    """
    suggestions = sym_spell.lookup(
        text,
        Verbosity.CLOSEST,
        max_edit_distance=max_edit_distance,
    )
    # Read the term attribute directly rather than parsing str(suggestion)
    # ("term, distance, count") on commas, which breaks on a term that
    # itself contains a comma.
    return [suggestion.term for suggestion in suggestions]


def correct(text: str, max_edit_distance: int = 1) -> str:
    """
    Return the first spelling suggestion for a single word.

    :param str text: word to correct
    :param int max_edit_distance: maximum edit distance passed to
                                  :func:`spell`
    :return: the first suggestion from :func:`spell`, or ``text`` unchanged
             when the lookup yields no suggestion
    :rtype: str
    """
    suggestions = spell(text, max_edit_distance=max_edit_distance)
    # An empty lookup result used to raise IndexError on [0].
    return suggestions[0] if suggestions else text


def spell_sent(
    list_words: List[str], max_edit_distance: int = 2
) -> List[List[str]]:
    """
    Look up spelling suggestions for a whole sentence.

    The words are joined with single spaces, passed to
    ``sym_spell.lookup_compound``, and each suggested phrase is split
    back into words on spaces.

    :param List[str] list_words: the sentence, as a list of words
    :param int max_edit_distance: maximum edit distance passed to
                                  ``sym_spell.lookup_compound``
    :return: list of candidate sentences, each one a list of words
    :rtype: List[List[str]]
    """
    suggestions = sym_spell.lookup_compound(
        ' '.join(list_words),
        split_phrase_by_space=True,
        max_edit_distance=max_edit_distance,
    )
    # Use the term attribute directly instead of parsing str(suggestion)
    # on commas; the return annotation now matches the nested list that
    # was always returned.
    return [suggestion.term.split(' ') for suggestion in suggestions]


def correct_sent(
    list_words: List[str], max_edit_distance: int = 1
) -> List[str]:
    """
    Return the first corrected sentence for the given words.

    :param List[str] list_words: the sentence, as a list of words
    :param int max_edit_distance: maximum edit distance passed to
                                  :func:`spell_sent`
    :return: the first candidate sentence from :func:`spell_sent` as a list
             of words; a copy of ``list_words`` when there is no candidate
    :rtype: List[str]
    """
    candidates = spell_sent(list_words, max_edit_distance=max_edit_distance)
    # spell_sent returns a list of sentences. The previous code took the
    # first word of each candidate, so a sentence collapsed to one word.
    return candidates[0] if candidates else list(list_words)
8 changes: 6 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
"mt5": ["transformers>=4.6.0", "sentencepiece>=0.1.91"],
"wordnet": ["nltk>=3.3.*"],
"sefr_cut": ["sefr_cut"],
"spell": ["phunspell", "spylls", "symspellpy"],
"full": [
"PyYAML>=5.3.1",
"attacut>=1.0.4",
Expand All @@ -76,7 +77,10 @@
"ssg>=0.0.6",
"torch>=1.0.0",
"transformers>=4.6.0",
"sefr_cut"
"sefr_cut",
"phunspell",
"spylls",
"symspellpy"
],
}

Expand All @@ -91,7 +95,7 @@
url="https://github.com/PyThaiNLP/pythainlp",
packages=find_packages(exclude=["tests", "tests.*"]),
test_suite="tests",
python_requires=">=3.6",
python_requires=">=3.7",
package_data={
"pythainlp": [
"corpus/*",
Expand Down
48 changes: 47 additions & 1 deletion tests/test_spell.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@

import unittest

from pythainlp.spell import NorvigSpellChecker, correct, spell
from pythainlp.spell import (
NorvigSpellChecker,
correct,
spell,
spell_sent,
correct_sent
)


class TestSpellPackage(unittest.TestCase):
Expand All @@ -18,6 +24,22 @@ def test_spell(self):
self.assertIsInstance(result, list)
self.assertGreater(len(result), 0)

result = spell("เน้ร", engine="phunspell")
self.assertIsInstance(result, list)
self.assertGreater(len(result), 0)

result = spell("เกสมร์", engine="phunspell")
self.assertIsInstance(result, list)
self.assertGreater(len(result), 0)

result = spell("เน้ร", engine="symspellpy")
self.assertIsInstance(result, list)
self.assertGreater(len(result), 0)

result = spell("เกสมร์", engine="symspellpy")
self.assertIsInstance(result, list)
self.assertGreater(len(result), 0)

def test_word_correct(self):
self.assertEqual(correct(None), "")
self.assertEqual(correct(""), "")
Expand All @@ -30,6 +52,14 @@ def test_word_correct(self):
self.assertIsInstance(result, str)
self.assertNotEqual(result, "")

result = correct("ทดสอง", engine="phunspell")
self.assertIsInstance(result, str)
self.assertNotEqual(result, "")

result = correct("ทดสอง", engine="symspellpy")
self.assertIsInstance(result, str)
self.assertNotEqual(result, "")

def test_norvig_spell_checker(self):
checker = NorvigSpellChecker(dict_filter=None)
self.assertTrue(len(checker.dictionary()) > 0)
Expand Down Expand Up @@ -77,3 +107,19 @@ def test_norvig_spell_checker(self):
user_dict = [24, 6, 2475]
with self.assertRaises(TypeError):
checker = NorvigSpellChecker(custom_dict=user_dict)

def test_spell_sent(self):
    """spell_sent returns a list for the default and each named engine."""
    # Misspelled, pre-tokenized sample sentence. Kept in a local name so it
    # does not shadow the imported spell_sent function on the test case.
    words = ["เด็", "อินอร์เน็ต", "แรง"]

    # assertIsNotNone could never fail here: spell_sent either returns a
    # list or raises, so check the documented return type instead.
    self.assertIsInstance(spell_sent(words), list)
    for engine in ("pn", "phunspell", "symspellpy"):
        with self.subTest(engine=engine):
            self.assertIsInstance(spell_sent(words, engine=engine), list)

def test_correct_sent(self):
    """correct_sent returns a list for the default and each named engine."""
    # Misspelled, pre-tokenized sample sentence. Kept in a local name so it
    # does not shadow the imported spell_sent function on the test case.
    words = ["เด็", "อินอร์เน็ต", "แรง"]

    # assertIsNotNone could never fail here: correct_sent either returns a
    # list or raises, so check the documented return type instead.
    self.assertIsInstance(correct_sent(words), list)
    for engine in ("pn", "phunspell", "symspellpy"):
        with self.subTest(engine=engine):
            self.assertIsInstance(correct_sent(words, engine=engine), list)