Skip to content

Commit 2ccae92

Browse files
authored
Merge pull request #767 from PyThaiNLP/Add-thai-text-strptime
Add thai_strptime and convert_years
2 parents 4ab518d + b6e8ec9 commit 2ccae92

File tree

6 files changed

+363
-84
lines changed

6 files changed

+363
-84
lines changed

docs/api/util.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ Modules
99

1010
.. autofunction:: arabic_digit_to_thai_digit
1111
.. autofunction:: bahttext
12+
.. autofunction:: convert_years
1213
.. autofunction:: collate
1314
.. autofunction:: dict_trie
1415
.. autofunction:: digit_to_text
@@ -40,6 +41,7 @@ Modules
4041
.. autofunction:: text_to_num
4142
.. autofunction:: text_to_thai_digit
4243
.. autofunction:: thai_strftime
44+
.. autofunction:: thai_strptime
4345
.. autofunction:: thai_to_eng
4446
.. autofunction:: thai_word_tone_detector
4547
.. autofunction:: thai_digit_to_arabic_digit

pythainlp/parse/ud_goeswith.py

Lines changed: 83 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -18,87 +18,87 @@
1818

1919

2020
class Parse:
21-
def __init__(
22-
self, model: str = "KoichiYasuoka/deberta-base-thai-ud-goeswith"
23-
) -> None:
24-
if model is None:
25-
model = "KoichiYasuoka/deberta-base-thai-ud-goeswith"
26-
self.tokenizer = AutoTokenizer.from_pretrained(model)
27-
self.model = AutoModelForTokenClassification.from_pretrained(model)
21+
def __init__(
22+
self, model: str = "KoichiYasuoka/deberta-base-thai-ud-goeswith"
23+
) -> None:
24+
if model is None:
25+
model = "KoichiYasuoka/deberta-base-thai-ud-goeswith"
26+
self.tokenizer = AutoTokenizer.from_pretrained(model)
27+
self.model = AutoModelForTokenClassification.from_pretrained(model)
2828

29-
def __call__(
30-
self,
31-
text: str, tag: str = "str"
32-
) -> Union[List[List[str]], str]:
33-
w = self.tokenizer(text, return_offsets_mapping=True)
34-
v = w["input_ids"]
35-
x = [
36-
v[0:i]+[self.tokenizer.mask_token_id]+v[i+1:]+[j]
37-
for i, j in enumerate(v[1:-1], 1)
38-
]
39-
with torch.no_grad():
40-
e = self.model(input_ids=torch.tensor(x)).logits.numpy()[:, 1:-2, :]
41-
r = [
42-
1 if i == 0 else -1
43-
if j.endswith("|root") else 0
44-
for i, j in sorted(self.model.config.id2label.items())
45-
]
46-
e += np.where(np.add.outer(np.identity(e.shape[0]), r) == 0, 0, np.nan)
47-
g = self.model.config.label2id["X|_|goeswith"]
48-
r = np.tri(e.shape[0])
49-
for i in range(e.shape[0]):
50-
for j in range(i+2, e.shape[1]):
51-
r[i, j] = r[i, j-1] if np.nanargmax(e[i, j-1]) == g else 1
52-
e[:, :, g] += np.where(r == 0, 0, np.nan)
53-
m = np.full((e.shape[0]+1, e.shape[1]+1), np.nan)
54-
m[1:, 1:] = np.nanmax(e, axis=2).transpose()
55-
p = np.zeros(m.shape)
56-
p[1:, 1:] = np.nanargmax(e, axis=2).transpose()
57-
for i in range(1, m.shape[0]):
58-
m[i, 0], m[i, i], p[i, 0] = m[i, i], np.nan, p[i, i]
59-
h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
60-
if [0 for i in h if i == 0] != [0]:
61-
m[:, 0] += np.where(
62-
m[:, 0] == np.nanmax(
63-
m[[i for i, j in enumerate(h) if j == 0], 0]), 0, np.nan
64-
)
65-
m[[i for i, j in enumerate(h) if j == 0]] += [
66-
0 if i == 0 or j == 0 else np.nan for i, j in enumerate(h)
67-
]
68-
h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
69-
u = ""
70-
v = [(s, e) for s, e in w["offset_mapping"] if s < e]
71-
if tag == "list":
72-
_tag_data = []
73-
for i, (s, e) in enumerate(v, 1):
74-
q = self.model.config.id2label[p[i, h[i]]].split("|")
75-
_tag_data.append(
76-
[
77-
str(i),
78-
text[s:e],
79-
"_",
80-
q[0],
81-
"_",
82-
"|".join(q[1:-1]),
83-
str(h[i]),
84-
q[-1],
85-
"_",
86-
"_" if i < len(v) and e < v[i][0] else "SpaceAfter=No"
87-
]
88-
)
89-
return _tag_data
90-
else:
91-
for i, (s, e) in enumerate(v, 1):
92-
q = self.model.config.id2label[p[i, h[i]]].split("|")
93-
u += "\t".join([
94-
str(i),
95-
text[s:e],
96-
"_",
97-
q[0],
98-
"_",
99-
"|".join(q[1:-1]),
100-
str(h[i]),
101-
q[-1],
102-
"_",
103-
"_" if i < len(v) and e < v[i][0] else "SpaceAfter=No"])+"\n"
104-
return u+"\n"
29+
def __call__(
30+
self,
31+
text: str, tag: str = "str"
32+
) -> Union[List[List[str]], str]:
33+
w = self.tokenizer(text, return_offsets_mapping=True)
34+
v = w["input_ids"]
35+
x = [
36+
v[0:i] + [self.tokenizer.mask_token_id] + v[i + 1:] + [j]
37+
for i, j in enumerate(v[1:-1], 1)
38+
]
39+
with torch.no_grad():
40+
e = self.model(input_ids=torch.tensor(x)
41+
).logits.numpy()[:, 1:-2, :]
42+
r = [
43+
1 if i == 0 else -1
44+
if j.endswith("|root") else 0
45+
for i, j in sorted(self.model.config.id2label.items())
46+
]
47+
e += np.where(np.add.outer(np.identity(e.shape[0]), r) == 0, 0, np.nan)
48+
g = self.model.config.label2id["X|_|goeswith"]
49+
r = np.tri(e.shape[0])
50+
for i in range(e.shape[0]):
51+
for j in range(i + 2, e.shape[1]):
52+
r[i, j] = r[i, j - 1] if np.nanargmax(e[i, j - 1]) == g else 1
53+
e[:, :, g] += np.where(r == 0, 0, np.nan)
54+
m = np.full((e.shape[0] + 1, e.shape[1] + 1), np.nan)
55+
m[1:, 1:] = np.nanmax(e, axis=2).transpose()
56+
p = np.zeros(m.shape)
57+
p[1:, 1:] = np.nanargmax(e, axis=2).transpose()
58+
for i in range(1, m.shape[0]):
59+
m[i, 0], m[i, i], p[i, 0] = m[i, i], np.nan, p[i, i]
60+
h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
61+
if [0 for i in h if i == 0] != [0]:
62+
m[:, 0] += np.where(
63+
m[:, 0] == np.nanmax(
64+
m[[i for i, j in enumerate(h) if j == 0], 0]), 0, np.nan
65+
)
66+
m[[i for i, j in enumerate(h) if j == 0]] += [
67+
0 if i == 0 or j == 0 else np.nan for i, j in enumerate(h)
68+
]
69+
h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
70+
u = ""
71+
v = [(s, e) for s, e in w["offset_mapping"] if s < e]
72+
if tag == "list":
73+
_tag_data = []
74+
for i, (s, e) in enumerate(v, 1):
75+
q = self.model.config.id2label[p[i, h[i]]].split("|")
76+
_tag_data.append(
77+
[
78+
str(i),
79+
text[s:e],
80+
"_",
81+
q[0],
82+
"_",
83+
"|".join(q[1:-1]),
84+
str(h[i]),
85+
q[-1],
86+
"_",
87+
"_" if i < len(v) and e < v[i][0] else "SpaceAfter=No"
88+
]
89+
)
90+
return _tag_data
91+
else:
92+
for i, (s, e) in enumerate(v, 1):
93+
q = self.model.config.id2label[p[i, h[i]]].split("|")
94+
u += "\t".join([str(i),
95+
text[s:e],
96+
"_",
97+
q[0],
98+
"_",
99+
"|".join(q[1:-1]),
100+
str(h[i]),
101+
q[-1],
102+
"_",
103+
"_" if i < len(v) and e < v[i][0] else "SpaceAfter=No"]) + "\n"
104+
return u + "\n"

pythainlp/util/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"Trie",
88
"arabic_digit_to_thai_digit",
99
"bahttext",
10+
"convert_years",
1011
"collate",
1112
"countthai",
1213
"count_thai_chars",
@@ -34,6 +35,7 @@
3435
"text_to_thai_digit",
3536
"thai_digit_to_arabic_digit",
3637
"thai_keyboard_dist",
38+
"thai_strptime",
3739
"thai_strftime",
3840
"thai_to_eng",
3941
"thai_word_tone_detector",
@@ -54,6 +56,8 @@
5456
now_reign_year,
5557
reign_year_to_ad,
5658
thaiword_to_date,
59+
convert_years,
60+
thai_strptime,
5761
)
5862
from pythainlp.util.digitconv import (
5963
arabic_digit_to_thai_digit,

0 commit comments

Comments (0)