|  | 
| 2 | 2 | """ | 
| 3 | 3 | Corpus related functions. | 
| 4 | 4 | """ | 
| 5 |  | - | 
| 6 |  | -import hashlib | 
| 7 | 5 | import os | 
| 8 | 6 | from typing import Union | 
| 9 |  | -from urllib.request import urlopen | 
| 10 | 7 | import json | 
| 11 | 8 | 
 | 
| 12 |  | -import requests | 
| 13 | 9 | from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path | 
| 14 | 10 | from pythainlp.tools import get_full_data_path | 
| 15 |  | -from requests.exceptions import HTTPError | 
| 16 |  | -import tarfile | 
| 17 |  | -import zipfile | 
| 18 |  | -import shutil | 
| 19 | 11 | from pythainlp import __version__ | 
| 20 | 12 | 
 | 
| 21 | 13 | 
 | 
| 22 | 14 | _CHECK_MODE = os.getenv("PYTHAINLP_READ_MODE") | 
| 23 | 15 | 
 | 
| 24 | 16 | 
 | 
| 25 |  | -def get_corpus_db(url: str) -> requests.Response: | 
|  | 17 | +def get_corpus_db(url: str): | 
| 26 | 18 |     """ | 
| 27 | 19 |     Get corpus catalog from server. | 
| 28 | 20 | 
 | 
| 29 | 21 |     :param str url: URL corpus catalog | 
| 30 | 22 |     """ | 
|  | 23 | +    import requests | 
| 31 | 24 |     corpus_db = None | 
| 32 | 25 |     try: | 
| 33 | 26 |         corpus_db = requests.get(url) | 
| 34 |  | -    except HTTPError as http_err: | 
|  | 27 | +    except requests.exceptions.HTTPError as http_err: | 
| 35 | 28 |         print(f"HTTP error occurred: {http_err}") | 
| 36 | 29 |     except Exception as err: | 
| 37 | 30 |         print(f"Non-HTTP error occurred: {err}") | 
| @@ -231,6 +224,8 @@ def _download(url: str, dst: str) -> int: | 
| 231 | 224 |     """ | 
| 232 | 225 |     _CHUNK_SIZE = 64 * 1024  # 64 KiB | 
| 233 | 226 | 
 | 
|  | 227 | +    import requests | 
|  | 228 | +    from urllib.request import urlopen | 
| 234 | 229 |     file_size = int(urlopen(url).info().get("Content-Length", -1)) | 
| 235 | 230 |     r = requests.get(url, stream=True) | 
| 236 | 231 |     with open(get_full_data_path(dst), "wb") as f: | 
| @@ -262,6 +257,7 @@ def _check_hash(dst: str, md5: str) -> None: | 
| 262 | 257 |     @param: md5 place to hash the file (MD5) | 
| 263 | 258 |     """ | 
| 264 | 259 |     if md5 and md5 != "-": | 
|  | 260 | +        import hashlib | 
| 265 | 261 |         with open(get_full_data_path(dst), "rb") as f: | 
| 266 | 262 |             content = f.read() | 
| 267 | 263 |             file_md5 = hashlib.md5(content).hexdigest() | 
| @@ -423,13 +419,15 @@ def download( | 
| 423 | 419 |             foldername = None | 
| 424 | 420 | 
 | 
| 425 | 421 |             if corpus_versions["is_tar_gz"] == "True": | 
|  | 422 | +                import tarfile | 
| 426 | 423 |                 is_folder = True | 
| 427 | 424 |                 foldername = name+"_"+str(version) | 
| 428 | 425 |                 if not os.path.exists(get_full_data_path(foldername)): | 
| 429 | 426 |                     os.mkdir(get_full_data_path(foldername)) | 
| 430 | 427 |                 with tarfile.open(get_full_data_path(file_name)) as tar: | 
| 431 | 428 |                     tar.extractall(path=get_full_data_path(foldername)) | 
| 432 | 429 |             elif corpus_versions["is_zip"] == "True": | 
|  | 430 | +                import zipfile | 
| 433 | 431 |                 is_folder = True | 
| 434 | 432 |                 foldername = name+"_"+str(version) | 
| 435 | 433 |                 if not os.path.exists(get_full_data_path(foldername)): | 
| @@ -520,6 +518,7 @@ def remove(name: str) -> bool: | 
| 520 | 518 |     if data: | 
| 521 | 519 |         path = get_corpus_path(name) | 
| 522 | 520 |         if data[0].get("is_folder"): | 
|  | 521 | +            import shutil | 
| 523 | 522 |             os.remove(get_full_data_path(data[0].get("filename"))) | 
| 524 | 523 |             shutil.rmtree(path, ignore_errors=True) | 
| 525 | 524 |         else: | 
|  | 
0 commit comments