Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions pythainlp/corpus/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,13 @@ def get_corpus_default_db(name: str, version: str = None) -> Union[str, None]:
)


def get_corpus_path(name: str, version: str = None) -> Union[str, None]:
def get_corpus_path(name: str, version: str = None, force: bool = False) -> Union[str, None]:
"""
Get corpus path.

:param str name: corpus name
:param str version: version
:param bool force: force download
:return: path to the corpus or **None** if the corpus doesn't \
exist on the device
:rtype: str
Expand Down Expand Up @@ -202,7 +204,7 @@ def get_corpus_path(name: str, version: str = None) -> Union[str, None]:
corpus_db_detail = get_corpus_db_detail(name, version=version)

if not corpus_db_detail or not corpus_db_detail.get("filename"):
download(name, version=version)
download(name, version=version, force=force)
corpus_db_detail = get_corpus_db_detail(name, version=version)

if corpus_db_detail and corpus_db_detail.get("filename"):
Expand All @@ -213,7 +215,7 @@ def get_corpus_path(name: str, version: str = None) -> Union[str, None]:
path = get_full_data_path(corpus_db_detail.get("filename"))
# check if the corpus file actually exists, download if not
if not os.path.exists(path):
download(name)
download(name, version=version, force=force)
if os.path.exists(path):
return path

Expand Down
1 change: 0 additions & 1 deletion pythainlp/corpus/pos_lst20_perceptron-v0.2.3.json

This file was deleted.

5 changes: 5 additions & 0 deletions pythainlp/tag/lst20_ner_onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@

class LST20_NER_ONNX(WngchanBerta_ONNX):
def __init__(self, providers: List[str] = ['CPUExecutionProvider']) -> None:
print("""
LST20 corpus are free for research and open source only.\n
If you want to use in Commercial use, please contract NECTEC.\n
https://www.facebook.com/dancearmy/posts/10157641945708284
""")
WngchanBerta_ONNX.__init__(
self,
model_name="onnx_lst20ner",
Expand Down
13 changes: 12 additions & 1 deletion pythainlp/tag/named_entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,12 @@ class NER:

**Options for corpus**
* *thaimer* - Thai NER corpus
* *lst20* - lst20 corpus (wangchanberta only)
* *lst20* - lst20 corpus (wangchanberta only). \
`LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \
by National Electronics and Computer Technology Center, Thailand. \
It is free for **non-commercial use and research only**. \
You can read more at \
`Facebook <https://www.facebook.com/dancearmy/posts/10157641945708284>`_.

**Note**: the tltk engine supports only the NER model from tltk.
"""
Expand All @@ -39,6 +44,12 @@ def load_engine(self, engine: str, corpus: str) -> None:
self.engine = LST20_NER_ONNX()
elif engine == "wangchanberta":
from pythainlp.wangchanberta import ThaiNameTagger
if corpus=="lst20":
print("""
LST20 corpus are free for research and open source only.\n
If you want to use in Commercial use, please contract NECTEC.\n
https://www.facebook.com/dancearmy/posts/10157641945708284
""")
self.engine = ThaiNameTagger(dataset_name=corpus)
elif engine == "tltk":
from pythainlp.tag import tltk
Expand Down
11 changes: 8 additions & 3 deletions pythainlp/tag/perceptron.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
_PUD_FILENAME = "pos_ud_perceptron-v0.2.json"
_PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)

_LST20_TAGGER_NAME = "pos_lst20_perceptron-v0.2.3.json"
_LST20_TAGGERD_PATH = os.path.join(corpus_path(), _LST20_TAGGER_NAME)
_LST20_TAGGER_NAME = "pos_lst20_perceptron"

_ORCHID_TAGGER = None
_PUD_TAGGER = None
Expand All @@ -38,8 +37,14 @@ def _pud_tagger():

def _lst20_tagger():
global _LST20_TAGGER
print("""
LST20 corpus are free for research and open source only.\n
If you want to use in Commercial use, please contract NECTEC.\n
https://www.facebook.com/dancearmy/posts/10157641945708284
""")
if not _LST20_TAGGER:
_LST20_TAGGER = PerceptronTagger(path=_LST20_TAGGERD_PATH)
path = get_corpus_path(_LST20_TAGGER_NAME, version="0.2.4")
_LST20_TAGGER = PerceptronTagger(path=path)
return _LST20_TAGGER


Expand Down
5 changes: 4 additions & 1 deletion pythainlp/tag/pos_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ def pos_tag(
if you choose another corpus, it is changed to the TNC corpus.)
:param str corpus: the corpus that used to create the language model for tagger
* *lst20* - `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \
by National Electronics and Computer Technology Center, Thailand
by National Electronics and Computer Technology Center, Thailand. \
It is free for **non-commercial use and research only**. \
You can read more at \
`Facebook <https://www.facebook.com/dancearmy/posts/10157641945708284>`_.
* *lst20_ud* - LST20 text, with tags mapped to Universal POS tag \
from `Universal Dependencies <https://universaldependencies.org/>`_
* *orchid* - `ORCHID \
Expand Down
5 changes: 5 additions & 0 deletions pythainlp/tag/unigram.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ def _pud_tagger():

def _lst20_tagger():
global _LST20_TAGGER
print("""
LST20 corpus are free for research and open source only.\n
If you want to use in Commercial use, please contract NECTEC.\n
https://www.facebook.com/dancearmy/posts/10157641945708284
""")
if not _LST20_TAGGER:
path = get_corpus_path(_LST20_TAGGER_NAME)
with open(path, encoding="utf-8-sig") as fh:
Expand Down
3 changes: 3 additions & 0 deletions pythainlp/tokenize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
Tokenizes a running word list into a list of clauses (lists of strings),
split by a CRF model trained on the LST20 corpus.

The LST20 corpus is free for **non-commercial use and research only**. \
You can read more at `Facebook <https://www.facebook.com/dancearmy/posts/10157641945708284>`_.

:param list[str] doc: word list to be split into clauses
:return: list of clauses
:rtype: list[list[str]]
Expand Down
5 changes: 5 additions & 0 deletions pythainlp/tokenize/crfcls.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ def _extract_features(doc):


def segment(doc: List[str]) -> List[List[str]]:
print("""
LST20 corpus are free for research and open source only.\n
If you want to use in Commercial use, please contract NECTEC.\n
https://www.facebook.com/dancearmy/posts/10157641945708284
""")
word_tags = pos_tag(doc, corpus="lst20")
features = _extract_features(word_tags)
word_markers = list(zip(doc, tagger.tag(features)))
Expand Down