PyThaiNLP · wannaphong · Sep 16, 2022 · Sep 16, 2022 · Sep 16, 2022
diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py
@@ -148,11 +148,13 @@ def get_corpus_default_db(name: str, version: str = None) -> Union[str, None]:
             )
 
 
-def get_corpus_path(name: str,  version: str = None) -> Union[str, None]:
+def get_corpus_path(name: str,  version: str = None, force: bool = False) -> Union[str, None]:
     """
     Get corpus path.
 
     :param str name: corpus name
+    :param str version: version of the corpus
+    :param bool force: force the download (passed on to ``download``)
     :return: path to the corpus or **None** of the corpus doesn't \
              exist in the device
     :rtype: str
@@ -202,7 +204,7 @@ def get_corpus_path(name: str,  version: str = None) -> Union[str, None]:
     corpus_db_detail = get_corpus_db_detail(name, version=version)
 
     if not corpus_db_detail or not corpus_db_detail.get("filename"):
-        download(name,  version=version)
+        download(name,  version=version, force=force)
         corpus_db_detail = get_corpus_db_detail(name, version=version)
 
     if corpus_db_detail and corpus_db_detail.get("filename"):
@@ -213,7 +215,7 @@ def get_corpus_path(name: str,  version: str = None) -> Union[str, None]:
             path = get_full_data_path(corpus_db_detail.get("filename"))
         # check if the corpus file actually exists, download if not
         if not os.path.exists(path):
-            download(name)
+            download(name,  version=version, force=force)
         if os.path.exists(path):
             return path
 

diff --git a/pythainlp/corpus/pos_lst20_perceptron-v0.2.3.json b/pythainlp/corpus/pos_lst20_perceptron-v0.2.3.json
diff --git a/pythainlp/tag/lst20_ner_onnx.py b/pythainlp/tag/lst20_ner_onnx.py
@@ -5,6 +5,11 @@
 
 class LST20_NER_ONNX(WngchanBerta_ONNX):
     def __init__(self, providers: List[str] = ['CPUExecutionProvider']) -> None:
+        print("""
+        LST20 corpus are free for research and open source only.\n
+        If you want to use in Commercial use, please contract NECTEC.\n
+        https://www.facebook.com/dancearmy/posts/10157641945708284
+        """)
         WngchanBerta_ONNX.__init__(
             self,
             model_name="onnx_lst20ner",

diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py
@@ -21,7 +21,12 @@ class NER:
 
     **Options for corpus**
         * *thaimer* - Thai NER corpus
-        * *lst20* - lst20 corpus (wangchanberta only)
+        * *lst20* - lst20 corpus (wangchanberta only). \
+            `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \
+            by National Electronics and Computer Technology Center, Thailand. \
+            It is free for **non-commercial use and research only**. \
+            For details, see this \
+            `Facebook post <https://www.facebook.com/dancearmy/posts/10157641945708284>`_.
 
     **Note**: for tltk engine, It's support ner model from tltk only.
     """
@@ -39,6 +44,12 @@ def load_engine(self, engine: str, corpus: str) -> None:
             self.engine = LST20_NER_ONNX()
         elif engine == "wangchanberta":
             from pythainlp.wangchanberta import ThaiNameTagger
+            if corpus=="lst20":
+                print("""
+                LST20 corpus are free for research and open source only.\n
+                If you want to use in Commercial use, please contract NECTEC.\n
+                https://www.facebook.com/dancearmy/posts/10157641945708284
+                """)
             self.engine = ThaiNameTagger(dataset_name=corpus)
         elif engine == "tltk":
             from pythainlp.tag import tltk

diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py
@@ -14,8 +14,7 @@
 _PUD_FILENAME = "pos_ud_perceptron-v0.2.json"
 _PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)
 
-_LST20_TAGGER_NAME = "pos_lst20_perceptron-v0.2.3.json"
-_LST20_TAGGERD_PATH = os.path.join(corpus_path(), _LST20_TAGGER_NAME)
+_LST20_TAGGER_NAME = "pos_lst20_perceptron"
 
 _ORCHID_TAGGER = None
 _PUD_TAGGER = None
@@ -38,8 +37,14 @@ def _pud_tagger():
 
 def _lst20_tagger():
     global _LST20_TAGGER
+    print("""
+    LST20 corpus are free for research and open source only.\n
+    If you want to use in Commercial use, please contract NECTEC.\n
+    https://www.facebook.com/dancearmy/posts/10157641945708284
+    """)
     if not _LST20_TAGGER:
-        _LST20_TAGGER = PerceptronTagger(path=_LST20_TAGGERD_PATH)
+        path = get_corpus_path(_LST20_TAGGER_NAME, version="0.2.4")
+        _LST20_TAGGER = PerceptronTagger(path=path)
     return _LST20_TAGGER
 
 

diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
@@ -21,7 +21,10 @@ def pos_tag(
             if you choose other corpus, It's change to TNC corpus.)
     :param str corpus: the corpus that used to create the language model for tagger
         * *lst20* - `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \
-            by National Electronics and Computer Technology Center, Thailand
+            by National Electronics and Computer Technology Center, Thailand. \
+            It is free for **non-commercial use and research only**. \
+            For details, see this \
+            `Facebook post <https://www.facebook.com/dancearmy/posts/10157641945708284>`_.
         * *lst20_ud* - LST20 text, with tags mapped to Universal POS tag \
             from `Universal Dependencies <https://universaldependencies.org/>`
         * *orchid* - `ORCHID \

diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py
@@ -41,6 +41,11 @@ def _pud_tagger():
 
 def _lst20_tagger():
     global _LST20_TAGGER
+    print("""
+    LST20 corpus are free for research and open source only.\n
+    If you want to use in Commercial use, please contract NECTEC.\n
+    https://www.facebook.com/dancearmy/posts/10157641945708284
+    """)
     if not _LST20_TAGGER:
         path = get_corpus_path(_LST20_TAGGER_NAME)
         with open(path, encoding="utf-8-sig") as fh:

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
@@ -25,6 +25,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
     Tokenizes running word list into list of clauses (list of strings).
     split by CRF trained on LST20 Corpus.
 
+    The LST20 corpus is free for **non-commercial use and research only**. \
+    For details, see this `Facebook post <https://www.facebook.com/dancearmy/posts/10157641945708284>`_.
+
     :param str doc: word list to be clause
     :return: list of claues
     :rtype: list[list[str]]

diff --git a/pythainlp/tokenize/crfcls.py b/pythainlp/tokenize/crfcls.py
@@ -55,6 +55,11 @@ def _extract_features(doc):
 
 
 def segment(doc: List[str]) -> List[List[str]]:
+    print("""
+    LST20 corpus are free for research and open source only.\n
+    If you want to use in Commercial use, please contract NECTEC.\n
+    https://www.facebook.com/dancearmy/posts/10157641945708284
+    """)
     word_tags = pos_tag(doc, corpus="lst20")
     features = _extract_features(word_tags)
     word_markers = list(zip(doc, tagger.tag(features)))