Skip to content

Commit 9bd951b

Browse files
authored
Merge pull request #882 from bact/rename-corpus-volubilis-wikipedia
Rename corpus function names for consistency / Fix types
2 parents 297aadc + 94ec1fc commit 9bd951b

File tree

10 files changed

+148
-106
lines changed

10 files changed

+148
-106
lines changed

docs/api/corpus.rst

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ get_corpus
1717
.. autofunction:: get_corpus
1818
:noindex:
1919

20+
get_corpus_as_is
21+
~~~~~~~~~~~~~~~~
22+
.. autofunction:: get_corpus_as_is
23+
:noindex:
24+
2025
get_corpus_db
2126
~~~~~~~~~~~~~~
2227
.. autofunction:: get_corpus_db
@@ -77,9 +82,9 @@ thai_orst_words
7782
.. autofunction:: thai_orst_words
7883
:noindex:
7984

80-
thai_synonym
85+
thai_synonyms
8186
~~~~~~~~~~~~~~
82-
.. autofunction:: thai_synonym
87+
.. autofunction:: thai_synonyms
8388
:noindex:
8489

8590
thai_syllables

pythainlp/corpus/__init__.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"countries",
1616
"download",
1717
"get_corpus",
18+
"get_corpus_as_is",
1819
"get_corpus_db",
1920
"get_corpus_db_detail",
2021
"get_corpus_default_db",
@@ -33,10 +34,11 @@
3334
"thai_stopwords",
3435
"thai_syllables",
3536
"thai_synonym",
37+
"thai_synonyms",
38+
"thai_volubilis_words",
39+
"thai_wikipedia_titles",
3640
"thai_words",
3741
"thai_wsd_dict",
38-
"volubilis",
39-
"wikipedia_titles",
4042
]
4143

4244
import os
@@ -88,6 +90,7 @@ def corpus_db_path() -> str:
8890
from pythainlp.corpus.core import (
8991
download,
9092
get_corpus,
93+
get_corpus_as_is,
9194
get_corpus_db,
9295
get_corpus_db_detail,
9396
get_corpus_default_db,
@@ -108,9 +111,10 @@ def corpus_db_path() -> str:
108111
thai_stopwords,
109112
thai_syllables,
110113
thai_synonym,
114+
thai_synonyms,
111115
thai_words,
112116
thai_wsd_dict,
113117
)
114118
from pythainlp.corpus.icu import thai_icu_words
115-
from pythainlp.corpus.volubilis import volubilis
116-
from pythainlp.corpus.wikipedia_titles import wikipedia_titles
119+
from pythainlp.corpus.volubilis import thai_volubilis_words
120+
from pythainlp.corpus.wikipedia import thai_wikipedia_titles

pythainlp/corpus/common.py

Lines changed: 48 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -12,49 +12,51 @@
1212
"thai_female_names",
1313
"thai_male_names",
1414
"thai_negations",
15+
"thai_dict",
1516
"thai_stopwords",
1617
"thai_syllables",
18+
"thai_synonym",
19+
"thai_synonyms",
1720
"thai_words",
18-
"thai_dict",
1921
"thai_wsd_dict",
20-
"thai_synonym",
2122
]
2223

2324
from typing import FrozenSet, List, Union
25+
import warnings
2426

25-
from pythainlp.corpus import get_corpus, get_corpus_path
27+
from pythainlp.corpus import get_corpus, get_corpus_as_is, get_corpus_path
2628

27-
_THAI_COUNTRIES = set()
29+
_THAI_COUNTRIES: FrozenSet[str] = frozenset()
2830
_THAI_COUNTRIES_FILENAME = "countries_th.txt"
2931

30-
_THAI_THAILAND_PROVINCES = set()
31-
_THAI_THAILAND_PROVINCES_DETAILS = []
32+
_THAI_THAILAND_PROVINCES: FrozenSet[str] = frozenset()
33+
_THAI_THAILAND_PROVINCES_DETAILS: List[dict] = []
3234
_THAI_THAILAND_PROVINCES_FILENAME = "thailand_provinces_th.csv"
3335

34-
_THAI_SYLLABLES = set()
36+
_THAI_SYLLABLES: FrozenSet[str] = frozenset()
3537
_THAI_SYLLABLES_FILENAME = "syllables_th.txt"
3638

37-
_THAI_WORDS = set()
39+
_THAI_WORDS: FrozenSet[str] = frozenset()
3840
_THAI_WORDS_FILENAME = "words_th.txt"
3941

40-
_THAI_STOPWORDS = set()
42+
_THAI_STOPWORDS: FrozenSet[str] = frozenset()
4143
_THAI_STOPWORDS_FILENAME = "stopwords_th.txt"
4244

43-
_THAI_NEGATIONS = set()
45+
_THAI_NEGATIONS: FrozenSet[str] = frozenset()
4446
_THAI_NEGATIONS_FILENAME = "negations_th.txt"
4547

46-
_THAI_FAMLIY_NAMES = set()
48+
_THAI_FAMLIY_NAMES: FrozenSet[str] = frozenset()
4749
_THAI_FAMLIY_NAMES_FILENAME = "family_names_th.txt"
48-
_THAI_FEMALE_NAMES = set()
50+
_THAI_FEMALE_NAMES: FrozenSet[str] = frozenset()
4951
_THAI_FEMALE_NAMES_FILENAME = "person_names_female_th.txt"
50-
_THAI_MALE_NAMES = set()
52+
_THAI_MALE_NAMES: FrozenSet[str] = frozenset()
5153
_THAI_MALE_NAMES_FILENAME = "person_names_male_th.txt"
5254

53-
_THAI_ORST_WORDS = set()
55+
_THAI_ORST_WORDS: FrozenSet[str] = frozenset()
5456

5557
_THAI_DICT = {}
5658
_THAI_WSD_DICT = {}
57-
_THAI_SYNONYM = None
59+
_THAI_SYNONYMS = {}
5860

5961

6062
def countries() -> FrozenSet[str]:
@@ -74,7 +76,7 @@ def countries() -> FrozenSet[str]:
7476
return _THAI_COUNTRIES
7577

7678

77-
def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]:
79+
def provinces(details: bool = False) -> Union[FrozenSet[str], List[dict]]:
7880
"""
7981
Return a frozenset of Thailand province names in Thai such as "กระบี่",
8082
"กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี".
@@ -96,7 +98,7 @@ def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]:
9698
provs = set()
9799
prov_details = []
98100

99-
for line in get_corpus(_THAI_THAILAND_PROVINCES_FILENAME, as_is=True):
101+
for line in get_corpus_as_is(_THAI_THAILAND_PROVINCES_FILENAME):
100102
p = line.split(",")
101103

102104
prov = {}
@@ -155,14 +157,14 @@ def thai_orst_words() -> FrozenSet[str]:
155157
"""
156158
Return a frozenset of Thai words from Royal Society of Thailand
157159
\n(See: `dev/pythainlp/corpus/orst_words_th.txt\
158-
<https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/thai_orst_words>`_)
160+
<https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/orst_words_th.txt>`_)
159161
160162
:return: :class:`frozenset` containing words in the Thai language.
161163
:rtype: :class:`frozenset`
162164
"""
163165
global _THAI_ORST_WORDS
164166
if not _THAI_ORST_WORDS:
165-
_THAI_ORST_WORDS = get_corpus("thai_orst_words.txt")
167+
_THAI_ORST_WORDS = get_corpus("orst_words_th.txt")
166168

167169
return _THAI_ORST_WORDS
168170

@@ -266,8 +268,11 @@ def thai_dict() -> dict:
266268
global _THAI_DICT
267269
if not _THAI_DICT:
268270
import csv
269-
_THAI_DICT = {"word":[], "meaning":[]}
270-
with open(get_corpus_path("thai_dict"), newline="\n", encoding="utf-8") as csvfile:
271+
272+
_THAI_DICT = {"word": [], "meaning": []}
273+
with open(
274+
get_corpus_path("thai_dict"), newline="\n", encoding="utf-8"
275+
) as csvfile:
271276
reader = csv.DictReader(csvfile, delimiter=",")
272277
for row in reader:
273278
_THAI_DICT["word"].append(row["word"])
@@ -288,38 +293,46 @@ def thai_wsd_dict() -> dict:
288293
global _THAI_WSD_DICT
289294
if not _THAI_WSD_DICT:
290295
_thai_wsd = thai_dict()
291-
_THAI_WSD_DICT = {"word":[],"meaning":[]}
292-
for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]):
296+
_THAI_WSD_DICT = {"word": [], "meaning": []}
297+
for i, j in zip(_thai_wsd["word"], _thai_wsd["meaning"]):
293298
_all_value = list(eval(j).values())
294299
_use = []
295300
for k in _all_value:
296301
_use.extend(k)
297-
_use=list(set(_use))
298-
if len(_use)>1:
302+
_use = list(set(_use))
303+
if len(_use) > 1:
299304
_THAI_WSD_DICT["word"].append(i)
300305
_THAI_WSD_DICT["meaning"].append(_use)
301306

302307
return _THAI_WSD_DICT
303308

304309

305-
def thai_synonym() -> dict:
310+
def thai_synonyms() -> dict:
306311
"""
307-
Return Thai synonym.
312+
Return Thai synonyms.
308313
\n(See: `thai_synonym\
309314
<https://pythainlp.github.io/pythainlp-corpus/thai_synonym.html>`_)
310315
311316
:return: Thai words with their part-of-speech types and synonyms
312317
:rtype: dict
313318
"""
314-
global _THAI_SYNONYM
315-
if _THAI_SYNONYM is None:
319+
global _THAI_SYNONYMS
320+
if not _THAI_SYNONYMS:
316321
import csv
317-
_THAI_SYNONYM = {"word":[], "pos":[], "synonym":[]}
318-
with open(get_corpus_path("thai_synonym"), newline="\n", encoding="utf-8") as csvfile:
322+
323+
_THAI_SYNONYMS = {"word": [], "pos": [], "synonym": []}
324+
with open(
325+
get_corpus_path("thai_synonym"), newline="\n", encoding="utf-8"
326+
) as csvfile:
319327
reader = csv.DictReader(csvfile, delimiter=",")
320328
for row in reader:
321-
_THAI_SYNONYM["word"].append(row["word"])
322-
_THAI_SYNONYM["pos"].append(row["pos"])
323-
_THAI_SYNONYM["synonym"].append(row["synonym"].split("|"))
329+
_THAI_SYNONYMS["word"].append(row["word"])
330+
_THAI_SYNONYMS["pos"].append(row["pos"])
331+
_THAI_SYNONYMS["synonym"].append(row["synonym"].split("|"))
332+
333+
return _THAI_SYNONYMS
324334

325-
return _THAI_SYNONYM
335+
336+
def thai_synonym() -> dict:
337+
warnings.warn("Deprecated: Use thai_synonyms() instead.", DeprecationWarning)
338+
return thai_synonyms()

0 commit comments

Comments
 (0)