|
14 | 14 | from pythainlp.tools import get_full_data_path |
15 | 15 | from requests.exceptions import HTTPError |
16 | 16 | from tinydb import Query, TinyDB |
| 17 | +import tarfile |
| 18 | +import zipfile |
| 19 | +import shutil |
17 | 20 | from pythainlp import __version__ |
18 | 21 |
|
19 | 22 |
|
@@ -207,7 +210,10 @@ def get_corpus_path(name: str, version: str = None) -> Union[str, None]: |
207 | 210 |
|
208 | 211 | if corpus_db_detail and corpus_db_detail.get("filename"): |
209 | 212 | # corpus is in the local catalog, get full path to the file |
210 | | - path = get_full_data_path(corpus_db_detail.get("filename")) |
| 213 | + if corpus_db_detail.get("is_folder"): |
| 214 | + path = get_full_data_path(corpus_db_detail.get("foldername")) |
| 215 | + else: |
| 216 | + path = get_full_data_path(corpus_db_detail.get("filename")) |
211 | 217 | # check if the corpus file actually exists, download if not |
212 | 218 | if not os.path.exists(path): |
213 | 219 | download(name) |
@@ -411,11 +417,45 @@ def download( |
411 | 417 | file_name, corpus_versions["md5"], |
412 | 418 | ) |
413 | 419 |
|
| 420 | + is_folder = False |
| 421 | + foldername = None |
| 422 | + |
| 423 | + if corpus_versions["is_tar_gz"] == "True": |
| 424 | + is_folder = True |
| 425 | + foldername = name+"_"+str(version) |
| 426 | + if not os.path.exists(get_full_data_path(foldername)): |
| 427 | + os.mkdir(get_full_data_path(foldername)) |
| 428 | + with tarfile.open(get_full_data_path(file_name)) as tar: |
| 429 | + tar.extractall(path=get_full_data_path(foldername)) |
| 430 | + elif corpus_versions["is_zip"] == "True": |
| 431 | + is_folder = True |
| 432 | + foldername = name+"_"+str(version) |
| 433 | + if not os.path.exists(get_full_data_path(foldername)): |
| 434 | + os.mkdir(get_full_data_path(foldername)) |
| 435 | + with zipfile.ZipFile( |
| 436 | + get_full_data_path(file_name), 'r' |
| 437 | + ) as zip: |
| 438 | + zip.extractall(path=get_full_data_path(foldername)) |
| 439 | + |
414 | 440 | if found: |
415 | | - local_db.update({"version": version}, query.name == name) |
| 441 | + local_db.update( |
| 442 | + { |
| 443 | + "version": version, |
| 444 | + "filename": file_name, |
| 445 | + "is_folder": is_folder, |
| 446 | + "foldername": foldername |
| 447 | + }, |
| 448 | + query.name == name |
| 449 | + ) |
416 | 450 | else: |
417 | 451 | local_db.insert( |
418 | | - {"name": name, "version": version, "filename": file_name} |
| 452 | + { |
| 453 | + "name": name, |
| 454 | + "version": version, |
| 455 | + "filename": file_name, |
| 456 | + "is_folder": is_folder, |
| 457 | + "foldername": foldername |
| 458 | + } |
419 | 459 | ) |
420 | 460 | else: |
421 | 461 | if local_db.search( |
@@ -471,7 +511,11 @@ def remove(name: str) -> bool: |
471 | 511 |
|
472 | 512 | if data: |
473 | 513 | path = get_corpus_path(name) |
474 | | - os.remove(path) |
| 514 | + if data[0].get("is_folder"): |
| 515 | + os.remove(get_full_data_path(data[0].get("filename"))) |
| 516 | + shutil.rmtree(path, ignore_errors=True) |
| 517 | + else: |
| 518 | + os.remove(path) |
475 | 519 | db.remove(query.name == name) |
476 | 520 | db.close() |
477 | 521 | return True |
|
0 commit comments