From b39fe4911fae13219aa0fb7163b990dc0734de7e Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 8 Oct 2022 22:21:35 +0700 Subject: [PATCH 1/9] Reduce import time (pythainlp.corpus.core) --- pythainlp/corpus/core.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py index c18d94b24..09a31a035 100644 --- a/pythainlp/corpus/core.py +++ b/pythainlp/corpus/core.py @@ -2,20 +2,13 @@ """ Corpus related functions. """ - -import hashlib import os from typing import Union -from urllib.request import urlopen import json import requests from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path from pythainlp.tools import get_full_data_path -from requests.exceptions import HTTPError -import tarfile -import zipfile -import shutil from pythainlp import __version__ @@ -31,7 +24,7 @@ def get_corpus_db(url: str) -> requests.Response: corpus_db = None try: corpus_db = requests.get(url) - except HTTPError as http_err: + except requests.exceptions.HTTPError as http_err: print(f"HTTP error occurred: {http_err}") except Exception as err: print(f"Non-HTTP error occurred: {err}") @@ -231,6 +224,7 @@ def _download(url: str, dst: str) -> int: """ _CHUNK_SIZE = 64 * 1024 # 64 KiB + from urllib.request import urlopen file_size = int(urlopen(url).info().get("Content-Length", -1)) r = requests.get(url, stream=True) with open(get_full_data_path(dst), "wb") as f: @@ -262,6 +256,7 @@ def _check_hash(dst: str, md5: str) -> None: @param: md5 place to hash the file (MD5) """ if md5 and md5 != "-": + import hashlib with open(get_full_data_path(dst), "rb") as f: content = f.read() file_md5 = hashlib.md5(content).hexdigest() @@ -423,6 +418,7 @@ def download( foldername = None if corpus_versions["is_tar_gz"] == "True": + import tarfile is_folder = True foldername = name+"_"+str(version) if not os.path.exists(get_full_data_path(foldername)): @@ -430,6 +426,7 @@ def download( with tarfile.open(get_full_data_path(file_name)) as tar: tar.extractall(path=get_full_data_path(foldername)) elif corpus_versions["is_zip"] == "True": + import zipfile is_folder = True foldername = name+"_"+str(version) if not os.path.exists(get_full_data_path(foldername)): @@ -520,6 +517,7 @@ def remove(name: str) -> bool: if data: path = get_corpus_path(name) if data[0].get("is_folder"): + import shutil os.remove(get_full_data_path(data[0].get("filename"))) shutil.rmtree(path, ignore_errors=True) else: From b00749fcce4a1341fff0b35842b74cfc70a79f2d Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 8 Oct 2022 22:22:07 +0700 Subject: [PATCH 2/9] Hide import pycrfsuite (pythainlp.tag.thainer) --- pythainlp/tag/thainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/tag/thainer.py b/pythainlp/tag/thainer.py index 6e15eb8e5..54dfc727b 100644 --- a/pythainlp/tag/thainer.py +++ b/pythainlp/tag/thainer.py @@ -7,7 +7,6 @@ from typing import Dict, List, Tuple, Union -from pycrfsuite import Tagger as CRFTagger from pythainlp.corpus import get_corpus_path, thai_stopwords from pythainlp.tag import pos_tag from pythainlp.tokenize import word_tokenize @@ -98,6 +97,7 @@ def __init__(self, version: str = "1.5") -> None: It's support Thai NER 1.4 & 1.5. 
The defualt value is `1.5` """ + from pycrfsuite import Tagger as CRFTagger self.crf = CRFTagger() if version == "1.4": From c5bf1d02c9db4adb00dad2d8bd37c1303f846837 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 8 Oct 2022 22:22:59 +0700 Subject: [PATCH 3/9] Reduce import time (pythainlp/augment.word2vec.bpemb_wv) --- pythainlp/augment/word2vec/bpemb_wv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/augment/word2vec/bpemb_wv.py b/pythainlp/augment/word2vec/bpemb_wv.py index 15f6d2cca..235905c2f 100644 --- a/pythainlp/augment/word2vec/bpemb_wv.py +++ b/pythainlp/augment/word2vec/bpemb_wv.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from pythainlp.augment.word2vec.core import Word2VecAug -from bpemb import BPEmb from typing import List, Tuple @@ -12,6 +11,7 @@ class BPEmbAug: `github.com/bheinzerling/bpemb `_ """ def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300): + from bpemb import BPEmb self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs) self.model = self.bpemb_temp.emb self.load_w2v() From 37fad39a6c509627bfd67bf5cf808282e234bbd3 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 8 Oct 2022 22:23:42 +0700 Subject: [PATCH 4/9] Reduce import time (pythainlp.augment.word2vec.core) --- pythainlp/augment/word2vec/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/augment/word2vec/core.py b/pythainlp/augment/word2vec/core.py index c98bcd39b..c295fa2c9 100644 --- a/pythainlp/augment/word2vec/core.py +++ b/pythainlp/augment/word2vec/core.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from typing import List, Tuple -import gensim.models.keyedvectors as word2vec import itertools @@ -13,6 +12,7 @@ def __init__( :param object tokenize: tokenize function :param str type: moodel type (file, binary) """ + import gensim.models.keyedvectors as word2vec self.tokenizer = tokenize if type == "file": self.model = word2vec.KeyedVectors.load_word2vec_format(model) From fcb5c8ccb9b484444d806cdbb2de01d8f9a5796f Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 8 Oct 2022 22:24:14 +0700 Subject: [PATCH 5/9] Reduce import time (pythainlp.tag.wangchanberta_onnx) --- pythainlp/tag/wangchanberta_onnx.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pythainlp/tag/wangchanberta_onnx.py b/pythainlp/tag/wangchanberta_onnx.py index ae619918d..8a4e6b354 100644 --- a/pythainlp/tag/wangchanberta_onnx.py +++ b/pythainlp/tag/wangchanberta_onnx.py @@ -1,16 +1,17 @@ # -*- coding: utf-8 -*- from typing import List import json -import sentencepiece as spm + import numpy as np -from onnxruntime import ( - InferenceSession, SessionOptions, GraphOptimizationLevel -) from pythainlp.corpus import get_path_folder_corpus class WngchanBerta_ONNX: def __init__(self, model_name: str, model_version: str, file_onnx: str, providers: List[str] = ['CPUExecutionProvider']) -> None: + import sentencepiece as spm + from onnxruntime import ( + InferenceSession, SessionOptions, GraphOptimizationLevel + ) self.model_name = model_name self.model_version = model_version self.options = SessionOptions() From 39c51fc85c3d19968e8675d6fdb7325fc5142157 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 8 Oct 2022 22:30:31 +0700 Subject: [PATCH 6/9] Reduce import time (pythainlp.corpus.core) Move requests to the function that needs requests --- pythainlp/corpus/core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pythainlp/corpus/core.py 
b/pythainlp/corpus/core.py index 09a31a035..5c115f04b 100644 --- a/pythainlp/corpus/core.py +++ b/pythainlp/corpus/core.py @@ -6,7 +6,6 @@ from typing import Union import json -import requests from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path from pythainlp.tools import get_full_data_path from pythainlp import __version__ @@ -15,12 +14,13 @@ _CHECK_MODE = os.getenv("PYTHAINLP_READ_MODE") -def get_corpus_db(url: str) -> requests.Response: +def get_corpus_db(url: str): """ Get corpus catalog from server. :param str url: URL corpus catalog """ + import requests corpus_db = None try: corpus_db = requests.get(url) @@ -224,6 +224,7 @@ def _download(url: str, dst: str) -> int: """ _CHUNK_SIZE = 64 * 1024 # 64 KiB + import requests from urllib.request import urlopen file_size = int(urlopen(url).info().get("Content-Length", -1)) r = requests.get(url, stream=True) From 01959659421c81ee8aef7e4259e373360e282c10 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 8 Oct 2022 22:46:33 +0700 Subject: [PATCH 7/9] Reduce import time (pythainlp.tag._tag_perceptron) --- pythainlp/tag/_tag_perceptron.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pythainlp/tag/_tag_perceptron.py b/pythainlp/tag/_tag_perceptron.py index fadc8704b..62cdf9d88 100644 --- a/pythainlp/tag/_tag_perceptron.py +++ b/pythainlp/tag/_tag_perceptron.py @@ -14,11 +14,7 @@ This tagger is provided under the terms of the MIT License. """ - -from __future__ import absolute_import - import json -import random from collections import defaultdict from typing import Dict, Iterable, List, Tuple, Union @@ -160,6 +156,7 @@ def train( location. :param nr_iter: Number of training iterations. """ + import random self._make_tagdict(sentences) self.model.classes = self.classes for _ in range(nr_iter): From effe3b7543bd46b93cb449e222d835c381f0c48a Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 8 Oct 2022 23:04:33 +0700 Subject: [PATCH 8/9] Reduce import time (pythainlp.tokenize.core) --- pythainlp/tokenize/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 4201ff735..6b918be12 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -4,7 +4,6 @@ """ import re from typing import Iterable, List, Union -import warnings from pythainlp.tokenize import ( DEFAULT_SENT_TOKENIZE_ENGINE, @@ -14,7 +13,6 @@ DEFAULT_WORD_DICT_TRIE, DEFAULT_WORD_TOKENIZE_ENGINE, ) -from pythainlp import thai_characters from pythainlp.util.trie import Trie, dict_trie @@ -63,6 +61,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = " _list_all = [] if isinstance(segments[0], str): segments = [segments] + from pythainlp import thai_characters for i, s in enumerate(segments): _list_sents = [] _add_index = [] From f3bc46e5fedada2110ae186453be9287fae16cba Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 9 Oct 2022 00:12:16 +0700 Subject: [PATCH 9/9] Delete unused import --- pythainlp/util/normalize.py | 1 - pythainlp/util/time.py | 1 - pythainlp/wangchanberta/postag.py | 1 - 3 files changed, 3 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index def68bc09..b9873e0ff 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -4,7 +4,6 @@ """ import re from typing import List, Union -import warnings from pythainlp import thai_above_vowels as above_v from pythainlp import thai_below_vowels as 
below_v diff --git a/pythainlp/util/time.py b/pythainlp/util/time.py index b62c9c039..5240a157d 100644 --- a/pythainlp/util/time.py +++ b/pythainlp/util/time.py @@ -4,7 +4,6 @@ Convert time string or time object to Thai words. """ -import warnings from datetime import datetime, time from typing import Union diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py index 6d34d34c1..9a5846660 100644 --- a/pythainlp/wangchanberta/postag.py +++ b/pythainlp/wangchanberta/postag.py @@ -4,7 +4,6 @@ import warnings from transformers import ( CamembertTokenizer, - AutoTokenizer, pipeline, )
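
Taken together, the nine patches apply one technique: defer imports that are expensive or only occasionally needed (requests, pycrfsuite, bpemb, gensim, sentencepiece, onnxruntime, plus standard-library modules such as hashlib, tarfile, zipfile, shutil, and random) from module level into the function or constructor that actually uses them, and delete imports that are never used at all, so that `import pythainlp` no longer pays for features that are never called. A minimal sketch of the before/after shape, modeled on the get_corpus_db() change in patches 1/9 and 6/9 (the function body is simplified here for illustration):

    # Eager style: requests is loaded whenever this module is imported,
    # even if get_corpus_db() is never called.
    #
    #     import requests
    #
    #     def get_corpus_db(url: str):
    #         return requests.get(url)

    # Deferred style (what the patches switch to): the import cost is paid
    # only on the first call; later calls hit the sys.modules cache.
    def get_corpus_db(url: str):
        import requests  # deferred import
        return requests.get(url)

The effect can be checked with CPython's built-in import profiler, for example `python -X importtime -c "import pythainlp"`, by comparing cumulative import times before and after the series.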