Merge pull request #753 from PyThaiNLP/improve-1

wannaphong · web-flow · commit af7910a24224 · 2022-10-31T00:32:24.000+07:00
Add Reduce import time #719 to PyThaiNLP 3.1.1
diff --git a/README.md b/README.md
@@ -19,6 +19,7 @@ PyThaiNLP is a Python package for text processing and linguistic analysis, simil
 PyThaiNLP เป็นไลบารีภาษาไพทอนสำหรับประมวลผลภาษาธรรมชาติ คล้ายกับ NLTK โดยเน้นภาษาไทย [ดูรายละเอียดภาษาไทยได้ที่ README_TH.MD](https://github.com/PyThaiNLP/pythainlp/blob/dev/README_TH.md)
 
 **News**
+> PyThaiNLP joins Hacktoberfest 2022!! https://github.com/PyThaiNLP/pythainlp/issues/717
 
 > Now, You can contact or ask any questions with the PyThaiNLP team. <a href="https://matrix.to/#/#thainlp:matrix.org" rel="noopener" target="_blank"><img src="https://matrix.to/img/matrix-badge.svg" alt="Chat on Matrix"></a>
 
diff --git a/docs/api/tools.rst b/docs/api/tools.rst
@@ -10,4 +10,4 @@ Modules
 .. autofunction:: get_full_data_path
 .. autofunction:: get_pythainlp_data_path
 .. autofunction:: get_pythainlp_path
-.. autofunction:: misspell
+.. autofunction:: pythainlp.tools.misspell.misspell
diff --git a/pythainlp/augment/word2vec/bpemb_wv.py b/pythainlp/augment/word2vec/bpemb_wv.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 from pythainlp.augment.word2vec.core import Word2VecAug
-from bpemb import BPEmb
 from typing import List, Tuple
 
 
@@ -12,6 +11,7 @@ class BPEmbAug:
     `github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_
     """
     def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
+        from bpemb import BPEmb
         self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
         self.model = self.bpemb_temp.emb
         self.load_w2v()
diff --git a/pythainlp/augment/word2vec/core.py b/pythainlp/augment/word2vec/core.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 from typing import List, Tuple
-import gensim.models.keyedvectors as word2vec
 import itertools
 
 
@@ -13,6 +12,7 @@ def __init__(
         :param object tokenize: tokenize function
         :param str type: moodel type (file, binary)
         """
+        import gensim.models.keyedvectors as word2vec
         self.tokenizer = tokenize
         if type == "file":
             self.model = word2vec.KeyedVectors.load_word2vec_format(model)
diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py
@@ -2,36 +2,29 @@
 """
 Corpus related functions.
 """
-
-import hashlib
 import os
 from typing import Union
-from urllib.request import urlopen
 import json
 
-import requests
 from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path
 from pythainlp.tools import get_full_data_path
-from requests.exceptions import HTTPError
-import tarfile
-import zipfile
-import shutil
 from pythainlp import __version__
 
 
 _CHECK_MODE = os.getenv("PYTHAINLP_READ_MODE")
 
 
-def get_corpus_db(url: str) -> requests.Response:
+def get_corpus_db(url: str):
     """
     Get corpus catalog from server.
 
     :param str url: URL corpus catalog
     """
+    import requests
     corpus_db = None
     try:
         corpus_db = requests.get(url)
-    except HTTPError as http_err:
+    except requests.exceptions.HTTPError as http_err:
         print(f"HTTP error occurred: {http_err}")
     except Exception as err:
         print(f"Non-HTTP error occurred: {err}")
@@ -231,6 +224,8 @@ def _download(url: str, dst: str) -> int:
     """
     _CHUNK_SIZE = 64 * 1024  # 64 KiB
 
+    import requests
+    from urllib.request import urlopen
     file_size = int(urlopen(url).info().get("Content-Length", -1))
     r = requests.get(url, stream=True)
     with open(get_full_data_path(dst), "wb") as f:
@@ -262,6 +257,7 @@ def _check_hash(dst: str, md5: str) -> None:
     @param: md5 place to hash the file (MD5)
     """
     if md5 and md5 != "-":
+        import hashlib
         with open(get_full_data_path(dst), "rb") as f:
             content = f.read()
             file_md5 = hashlib.md5(content).hexdigest()
@@ -423,13 +419,15 @@ def download(
             foldername = None
 
             if corpus_versions["is_tar_gz"] == "True":
+                import tarfile
                 is_folder = True
                 foldername = name+"_"+str(version)
                 if not os.path.exists(get_full_data_path(foldername)):
                     os.mkdir(get_full_data_path(foldername))
                 with tarfile.open(get_full_data_path(file_name)) as tar:
                     tar.extractall(path=get_full_data_path(foldername))
             elif corpus_versions["is_zip"] == "True":
+                import zipfile
                 is_folder = True
                 foldername = name+"_"+str(version)
                 if not os.path.exists(get_full_data_path(foldername)):
@@ -520,6 +518,7 @@ def remove(name: str) -> bool:
     if data:
         path = get_corpus_path(name)
         if data[0].get("is_folder"):
+            import shutil
             os.remove(get_full_data_path(data[0].get("filename")))
             shutil.rmtree(path, ignore_errors=True)
         else:
diff --git a/pythainlp/tag/_tag_perceptron.py b/pythainlp/tag/_tag_perceptron.py
@@ -14,11 +14,7 @@
 
 This tagger is provided under the terms of the MIT License.
 """
-
-from __future__ import absolute_import
-
 import json
-import random
 from collections import defaultdict
 from typing import Dict, Iterable, List, Tuple, Union
 
@@ -160,6 +156,7 @@ def train(
             location.
         :param nr_iter: Number of training iterations.
         """
+        import random
         self._make_tagdict(sentences)
         self.model.classes = self.classes
         for _ in range(nr_iter):
diff --git a/pythainlp/tag/thainer.py b/pythainlp/tag/thainer.py
@@ -7,7 +7,6 @@
 
 from typing import Dict, List, Tuple, Union
 
-from pycrfsuite import Tagger as CRFTagger
 from pythainlp.corpus import get_corpus_path, thai_stopwords
 from pythainlp.tag import pos_tag
 from pythainlp.tokenize import word_tokenize
@@ -98,6 +97,7 @@ def __init__(self, version: str = "1.5") -> None:
                             It's support Thai NER 1.4 & 1.5.
                             The defualt value is `1.5`
         """
+        from pycrfsuite import Tagger as CRFTagger
         self.crf = CRFTagger()
 
         if version == "1.4":
diff --git a/pythainlp/tag/wangchanberta_onnx.py b/pythainlp/tag/wangchanberta_onnx.py
@@ -1,16 +1,17 @@
 # -*- coding: utf-8 -*-
 from typing import List
 import json
-import sentencepiece as spm
+
 import numpy as np
-from onnxruntime import (
-    InferenceSession, SessionOptions, GraphOptimizationLevel
-)
 from pythainlp.corpus import get_path_folder_corpus
 
 
 class WngchanBerta_ONNX:
     def __init__(self, model_name: str, model_version: str, file_onnx: str, providers: List[str] = ['CPUExecutionProvider']) -> None:
+        import sentencepiece as spm
+        from onnxruntime import (
+            InferenceSession, SessionOptions, GraphOptimizationLevel
+        )
         self.model_name = model_name
         self.model_version = model_version
         self.options = SessionOptions()
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
@@ -4,7 +4,6 @@
 """
 import re
 from typing import Iterable, List, Union
-import warnings
 
 from pythainlp.tokenize import (
     DEFAULT_SENT_TOKENIZE_ENGINE,
@@ -14,7 +13,6 @@
     DEFAULT_WORD_DICT_TRIE,
     DEFAULT_WORD_TOKENIZE_ENGINE,
 )
-from pythainlp import thai_characters
 from pythainlp.util.trie import Trie, dict_trie
 
 
@@ -63,6 +61,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
     _list_all = []
     if isinstance(segments[0], str):
         segments = [segments]
+    from pythainlp import thai_characters
     for i, s in enumerate(segments):
         _list_sents = []
         _add_index = []
diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py
@@ -4,7 +4,6 @@
     "get_full_data_path",
     "get_pythainlp_data_path",
     "get_pythainlp_path",
-    "misspell",
 ]
 
 from pythainlp.tools.path import (
@@ -13,5 +12,3 @@
     get_pythainlp_data_path,
     get_pythainlp_path,
 )
-
-from pythainlp.tools.misspell import misspell
diff --git a/pythainlp/tools/misspell.py b/pythainlp/tools/misspell.py
@@ -1,5 +1,6 @@
-import numpy as np
+# -*- coding: utf-8 -*-
 from typing import List
+import numpy as np
 
 THAI_CHARACTERS_WITHOUT_SHIFT = [
     "ผปแอิืทมใฝ",
@@ -113,7 +114,7 @@ def misspell(sentence: str, ratio: float = 0.05):
     :Example:
     ::
 
-        from pythainlp.tools import misspell
+        from pythainlp.tools.misspell import misspell
 
         sentence = "ภาษาไทยปรากฏครั้งแรกในพุทธศักราช 1826"
 
diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
@@ -4,7 +4,6 @@
 """
 import re
 from typing import List, Union
-import warnings
 
 from pythainlp import thai_above_vowels as above_v
 from pythainlp import thai_below_vowels as below_v
diff --git a/pythainlp/util/time.py b/pythainlp/util/time.py
@@ -4,7 +4,6 @@
 
 Convert time string or time object to Thai words.
 """
-import warnings
 from datetime import datetime, time
 from typing import Union
 
diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py
@@ -4,7 +4,6 @@
 import warnings
 from transformers import (
     CamembertTokenizer,
-    AutoTokenizer,
     pipeline,
 )
 
diff --git a/tests/test_misspell.py b/tests/test_misspell.py
@@ -2,7 +2,7 @@
 
 import unittest
 import numpy as np
-from pythainlp.tools import misspell
+from pythainlp.tools.misspell import misspell
 
 
 def _count_difference(st1, st2):

Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,6 @@`
`4`	`4`	`"get_full_data_path",`
`5`	`5`	`"get_pythainlp_data_path",`
`6`	`6`	`"get_pythainlp_path",`
`7`		`- "misspell",`
`8`	`7`	`]`
`9`	`8`
`10`	`9`	`from pythainlp.tools.path import (`
`@@ -13,5 +12,3 @@`
`13`	`12`	`get_pythainlp_data_path,`
`14`	`13`	`get_pythainlp_path,`
`15`	`14`	`)`
`16`		`-`
`17`		`-from pythainlp.tools.misspell import misspell`