Skip to content

Commit be186a6

Browse files
authored
Merge pull request #657 from PyThaiNLP/dev
Update from dev
2 parents 51c4659 + 647c31b commit be186a6

File tree

3 files changed: 55 additions and 12 deletions.

pythainlp/corpus/core.py

Lines changed: 48 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,9 @@
1414
from pythainlp.tools import get_full_data_path
1515
from requests.exceptions import HTTPError
1616
from tinydb import Query, TinyDB
17+
import tarfile
18+
import zipfile
19+
import shutil
1720
from pythainlp import __version__
1821

1922

@@ -207,7 +210,10 @@ def get_corpus_path(name: str, version: str = None) -> Union[str, None]:
207210

208211
if corpus_db_detail and corpus_db_detail.get("filename"):
209212
# corpus is in the local catalog, get full path to the file
210-
path = get_full_data_path(corpus_db_detail.get("filename"))
213+
if corpus_db_detail.get("is_folder"):
214+
path = get_full_data_path(corpus_db_detail.get("foldername"))
215+
else:
216+
path = get_full_data_path(corpus_db_detail.get("filename"))
211217
# check if the corpus file actually exists, download if not
212218
if not os.path.exists(path):
213219
download(name)
@@ -411,11 +417,45 @@ def download(
411417
file_name, corpus_versions["md5"],
412418
)
413419

420+
is_folder = False
421+
foldername = None
422+
423+
if corpus_versions["is_tar_gz"] == "True":
424+
is_folder = True
425+
foldername = name+"_"+str(version)
426+
if not os.path.exists(get_full_data_path(foldername)):
427+
os.mkdir(get_full_data_path(foldername))
428+
with tarfile.open(get_full_data_path(file_name)) as tar:
429+
tar.extractall(path=get_full_data_path(foldername))
430+
elif corpus_versions["is_zip"] == "True":
431+
is_folder = True
432+
foldername = name+"_"+str(version)
433+
if not os.path.exists(get_full_data_path(foldername)):
434+
os.mkdir(get_full_data_path(foldername))
435+
with zipfile.ZipFile(
436+
get_full_data_path(file_name), 'r'
437+
) as zip:
438+
zip.extractall(path=get_full_data_path(foldername))
439+
414440
if found:
415-
local_db.update({"version": version}, query.name == name)
441+
local_db.update(
442+
{
443+
"version": version,
444+
"filename": file_name,
445+
"is_folder": is_folder,
446+
"foldername": foldername
447+
},
448+
query.name == name
449+
)
416450
else:
417451
local_db.insert(
418-
{"name": name, "version": version, "filename": file_name}
452+
{
453+
"name": name,
454+
"version": version,
455+
"filename": file_name,
456+
"is_folder": is_folder,
457+
"foldername": foldername
458+
}
419459
)
420460
else:
421461
if local_db.search(
@@ -471,7 +511,11 @@ def remove(name: str) -> bool:
471511

472512
if data:
473513
path = get_corpus_path(name)
474-
os.remove(path)
514+
if data[0].get("is_folder"):
515+
os.remove(get_full_data_path(data[0].get("filename")))
516+
shutil.rmtree(path, ignore_errors=True)
517+
else:
518+
os.remove(path)
475519
db.remove(query.name == name)
476520
db.close()
477521
return True

pythainlp/translate/en_th.py

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -27,19 +27,12 @@
2727

2828

2929
def _get_translate_path(model: str, *path: str) -> str:
30-
return os.path.join(get_full_data_path(model), *path)
30+
return os.path.join(get_corpus_path(model, version="1.0"), *path)
3131

3232

3333
def _download_install(name: str) -> None:
3434
if get_corpus_path(name) is None:
3535
download(name, force=True, version="1.0")
36-
tar = tarfile.open(get_corpus_path(name, version="1.0"), "r:gz")
37-
tar.extractall()
38-
tar.close()
39-
if not os.path.exists(get_full_data_path(name)):
40-
os.mkdir(get_full_data_path(name))
41-
with tarfile.open(get_corpus_path(name)) as tar:
42-
tar.extractall(path=get_full_data_path(name))
4336

4437

4538
def download_model_all() -> None:

tests/test_corpus.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828
from pythainlp.corpus.util import revise_newmm_default_wordset
2929
from requests import Response
3030
import nltk
31+
import os
3132

3233

3334
class TestCorpusPackage(unittest.TestCase):
@@ -161,3 +162,8 @@ def test_revise_wordset(self):
161162
["ที่", "ถูก", "สังหาร", "เมื่อ", "ปี", " ", "พ.ศ.", " ", "2492"],
162163
]
163164
self.assertIsInstance(revise_newmm_default_wordset(training_data), set)
165+
166+
def test_zip(self):
167+
_p = get_corpus_path("test_zip")
168+
self.assertEqual(os.path.isdir(_p), True)
169+
self.assertEqual(remove("test_zip"), True)

0 commit comments

Comments
 (0)