Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Modules
.. autofunction:: get_corpus
.. autofunction:: get_corpus_db
.. autofunction:: get_corpus_db_detail
.. autofunction:: get_corpus_default_db
.. autofunction:: get_corpus_path
.. autofunction:: download
.. autofunction:: remove
Expand Down
2 changes: 2 additions & 0 deletions pythainlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"get_corpus",
"get_corpus_db",
"get_corpus_db_detail",
"get_corpus_default_db",
"get_corpus_path",
"provinces",
"remove",
Expand Down Expand Up @@ -80,6 +81,7 @@ def corpus_db_path() -> str:
get_corpus,
get_corpus_db,
get_corpus_db_detail,
get_corpus_default_db,
get_corpus_path,
remove,
path_pythainlp_corpus,
Expand Down
37 changes: 35 additions & 2 deletions pythainlp/corpus/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import os
from typing import Union
from urllib.request import urlopen
import json

import requests
from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path
Expand Down Expand Up @@ -101,7 +102,7 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
# 'หยิบยื่น\\t3',
# ...})
"""
path = os.path.join(corpus_path(), filename)
path = path_pythainlp_corpus(filename)
lines = []
with open(path, "r", encoding="utf-8-sig") as fh:
lines = fh.read().splitlines()
Expand All @@ -113,7 +114,35 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
return frozenset(filter(None, lines))


def get_corpus_path(name: str, version : str = None) -> Union[str, None]:
def get_corpus_default_db(name: str, version: str = None) -> Union[str, None]:
    """
    Get the path of a corpus/model listed in default_db.json.

    The catalog file default_db.json is located with
    ``path_pythainlp_corpus`` and read on every call.

    :param str name: corpus name
    :param str version: corpus version; if **None**, the version recorded
                        as ``latest_version`` for the corpus is used
    :return: path to the corpus file, or **None** if the corpus name or
             the requested version is not listed in default_db.json
    :rtype: str

    If you want to edit default_db.json,
    you can edit it in pythainlp/corpus/default_db.json
    """
    default_db_path = path_pythainlp_corpus("default_db.json")
    with open(default_db_path, encoding="utf-8-sig") as fh:
        corpus_db = json.load(fh)

    # Unknown corpus name: nothing to resolve.
    if name not in corpus_db:
        return None

    entry = corpus_db[name]
    if version is None:  # load latest version
        version = entry["latest_version"]

    # A version that is not listed (including a dangling "latest_version")
    # yields None instead of raising KeyError.
    version_info = entry["versions"].get(version)
    if version_info is None:
        return None

    return path_pythainlp_corpus(version_info["filename"])


def get_corpus_path(name: str, version: str = None) -> Union[str, None]:
"""
Get corpus path.

Expand Down Expand Up @@ -159,6 +188,10 @@ def get_corpus_path(name: str, version : str = None) -> Union[str, None]:
if name in list(_CUSTOMIZE.keys()):
return _CUSTOMIZE[name]

default_path = get_corpus_default_db(name=name, version=version)
if default_path is not None:
return default_path

# check if the corpus is in local catalog, download if not
corpus_db_detail = get_corpus_db_detail(name)

Expand Down
22 changes: 22 additions & 0 deletions pythainlp/corpus/default_db.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"thainer": {
"name": "thainer",
"latest_version": "1.5",
"description": "Thai Named Entity Recognition",
"long_description": "Thai Named Entity Recognition",
"url": "https://github.com/wannaphong/thai-ner/",
"authors": [
"Wannaphong Phatthiyaphaibun"
],
"author_email": "[email protected]",
"license": "cc-by-4.0",
"versions": {
"1.5": {
"filename": "thainer_crf_1_5.model",
"download_url": "https://github.com/wannaphong/thai-ner/releases/download/1.5/thai-ner-1-5-newmm-lst20.crfsuite",
"md5": "-",
"pythainlp_version": ">=2.2.7"
}
}
}
}
Binary file added pythainlp/corpus/thainer_crf_1_5.model
Binary file not shown.
5 changes: 5 additions & 0 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
download,
get_corpus_db,
get_corpus_db_detail,
get_corpus_default_db,
get_corpus_path,
provinces,
remove,
Expand Down Expand Up @@ -72,6 +73,10 @@ def test_corpus(self):
) # corpus name not exist
self.assertIsNotNone(get_corpus_db_detail("test")) # corpus exists
self.assertIsNotNone(get_corpus_path("test")) # corpus exists
self.assertIsNone(get_corpus_default_db("test"))
self.assertIsNotNone(get_corpus_default_db("thainer", "1.5"))
self.assertIsNotNone(get_corpus_default_db("thainer"))
self.assertIsNone(get_corpus_default_db("thainer", "1.2"))
self.assertTrue(remove("test")) # remove existing
self.assertFalse(remove("test")) # remove non-existing
self.assertIsNone(get_corpus_path("XXXkdjfBzc")) # query non-existing
Expand Down