PyThaiNLP · wannaphong · Jan 22, 2023 · Jan 15, 2023 · Jan 15, 2023 · Jan 15, 2023
diff --git a/docs/api/util.rst b/docs/api/util.rst
@@ -9,6 +9,7 @@ Modules
 
 .. autofunction:: arabic_digit_to_thai_digit
 .. autofunction:: bahttext
+.. autofunction:: convert_years
 .. autofunction:: collate
 .. autofunction:: dict_trie
 .. autofunction:: digit_to_text
@@ -40,6 +41,7 @@ Modules
 .. autofunction:: text_to_num
 .. autofunction:: text_to_thai_digit
 .. autofunction:: thai_strftime
+.. autofunction:: thai_strptime
 .. autofunction:: thai_to_eng
 .. autofunction:: thai_word_tone_detector
 .. autofunction:: thai_digit_to_arabic_digit

diff --git a/pythainlp/parse/ud_goeswith.py b/pythainlp/parse/ud_goeswith.py
@@ -18,87 +18,87 @@
 
 
 class Parse:
-  def __init__(
-    self, model: str = "KoichiYasuoka/deberta-base-thai-ud-goeswith"
-  ) -> None:
-    if model is None:
-      model = "KoichiYasuoka/deberta-base-thai-ud-goeswith"
-    self.tokenizer = AutoTokenizer.from_pretrained(model)
-    self.model = AutoModelForTokenClassification.from_pretrained(model)
+    def __init__(  # load the tokenizer and the token-classification model used by __call__
+        self, model: str = "KoichiYasuoka/deberta-base-thai-ud-goeswith"
+    ) -> None:
+        if model is None:  # an explicit None falls back to the same name as the default
+            model = "KoichiYasuoka/deberta-base-thai-ud-goeswith"
+        self.tokenizer = AutoTokenizer.from_pretrained(model)  # tokenizer and model are loaded from the same name
+        self.model = AutoModelForTokenClassification.from_pretrained(model)
 
-  def __call__(
-    self,
-    text: str, tag: str = "str"
-  ) -> Union[List[List[str]], str]:
-    w = self.tokenizer(text, return_offsets_mapping=True)
-    v = w["input_ids"]
-    x = [
-      v[0:i]+[self.tokenizer.mask_token_id]+v[i+1:]+[j]
-      for i, j in enumerate(v[1:-1], 1)
-    ]
-    with torch.no_grad():
-      e = self.model(input_ids=torch.tensor(x)).logits.numpy()[:, 1:-2, :]
-    r = [
-      1 if i == 0 else -1
-      if j.endswith("|root") else 0
-      for i, j in sorted(self.model.config.id2label.items())
-    ]
-    e += np.where(np.add.outer(np.identity(e.shape[0]), r) == 0, 0, np.nan)
-    g = self.model.config.label2id["X|_|goeswith"]
-    r = np.tri(e.shape[0])
-    for i in range(e.shape[0]):
-      for j in range(i+2, e.shape[1]):
-        r[i, j] = r[i, j-1] if np.nanargmax(e[i, j-1]) == g else 1
-    e[:, :, g] += np.where(r == 0, 0, np.nan)
-    m = np.full((e.shape[0]+1, e.shape[1]+1), np.nan)
-    m[1:, 1:] = np.nanmax(e, axis=2).transpose()
-    p = np.zeros(m.shape)
-    p[1:, 1:] = np.nanargmax(e, axis=2).transpose()
-    for i in range(1, m.shape[0]):
-      m[i, 0], m[i, i], p[i, 0] = m[i, i], np.nan, p[i, i]
-    h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
-    if [0 for i in h if i == 0] != [0]:
-      m[:, 0] += np.where(
-        m[:, 0] == np.nanmax(
-          m[[i for i, j in enumerate(h) if j == 0], 0]), 0, np.nan
-      )
-      m[[i for i, j in enumerate(h) if j == 0]] += [
-        0 if i == 0 or j == 0 else np.nan for i, j in enumerate(h)
-      ]
-      h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
-    u = ""
-    v = [(s, e) for s, e in w["offset_mapping"] if s < e]
-    if tag == "list":
-      _tag_data = []
-      for i, (s, e) in enumerate(v, 1):
-        q = self.model.config.id2label[p[i, h[i]]].split("|")
-        _tag_data.append(
-          [
-            str(i),
-            text[s:e],
-            "_",
-            q[0],
-            "_",
-            "|".join(q[1:-1]),
-            str(h[i]),
-            q[-1],
-            "_",
-            "_" if i < len(v) and e < v[i][0] else "SpaceAfter=No"
-          ]
-        )
-      return _tag_data
-    else:
-      for i, (s, e) in enumerate(v, 1):
-        q = self.model.config.id2label[p[i, h[i]]].split("|")
-        u += "\t".join([
-          str(i),
-          text[s:e],
-          "_",
-          q[0],
-          "_",
-          "|".join(q[1:-1]),
-          str(h[i]),
-          q[-1],
-          "_",
-          "_" if i < len(v) and e < v[i][0] else "SpaceAfter=No"])+"\n"
-      return u+"\n"
+    def __call__(  # parse `text`; returns a list of 10-field rows when tag == "list", otherwise one tab-separated string
+        self,
+        text: str, tag: str = "str"
+    ) -> Union[List[List[str]], str]:
+        w = self.tokenizer(text, return_offsets_mapping=True)  # encoding; "offset_mapping" is read further down
+        v = w["input_ids"]
+        x = [  # one input row per inner token id (v[1:-1] skips the first and last ids)
+            v[0:i] + [self.tokenizer.mask_token_id] + v[i + 1:] + [j]  # position i masked, its original id j appended at the end
+            for i, j in enumerate(v[1:-1], 1)
+        ]
+        with torch.no_grad():  # gradient tracking disabled for the forward pass
+            e = self.model(input_ids=torch.tensor(x)
+                           ).logits.numpy()[:, 1:-2, :]  # slice drops the first and the last two positions on axis 1
+        r = [  # per label id in sorted order: 1 for id 0, -1 for labels ending in "|root", 0 otherwise
+            1 if i == 0 else -1
+            if j.endswith("|root") else 0
+            for i, j in sorted(self.model.config.id2label.items())
+        ]
+        e += np.where(np.add.outer(np.identity(e.shape[0]), r) == 0, 0, np.nan)  # NaN unless identity + r == 0: diagonal keeps only "|root" labels, off-diagonal only labels with r == 0
+        g = self.model.config.label2id["X|_|goeswith"]  # id of the "X|_|goeswith" label
+        r = np.tri(e.shape[0])  # r is reused: ones on and below the diagonal, zeros above
+        for i in range(e.shape[0]):
+            for j in range(i + 2, e.shape[1]):
+                r[i, j] = r[i, j - 1] if np.nanargmax(e[i, j - 1]) == g else 1  # carry the previous flag only while the best label at j - 1 is goeswith
+        e[:, :, g] += np.where(r == 0, 0, np.nan)  # goeswith scores become NaN wherever r is non-zero
+        m = np.full((e.shape[0] + 1, e.shape[1] + 1), np.nan)  # score matrix with an extra row 0 and column 0, NaN-filled
+        m[1:, 1:] = np.nanmax(e, axis=2).transpose()  # best non-NaN score over the label axis
+        p = np.zeros(m.shape)  # label ids laid out like m
+        p[1:, 1:] = np.nanargmax(e, axis=2).transpose()
+        for i in range(1, m.shape[0]):
+            m[i, 0], m[i, i], p[i, 0] = m[i, i], np.nan, p[i, i]  # move each diagonal entry into column 0 and blank the diagonal
+        h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]  # presumably the head index chosen for each row -- confirm against the ufal docs
+        if [0 for i in h if i == 0] != [0]:  # true unless exactly one entry of h is 0
+            m[:, 0] += np.where(  # column 0 keeps only entries equal to the best score among rows whose h is 0
+                m[:, 0] == np.nanmax(
+                    m[[i for i, j in enumerate(h) if j == 0], 0]), 0, np.nan
+            )
+            m[[i for i, j in enumerate(h) if j == 0]] += [  # in those rows only column 0 and columns whose h is 0 can stay non-NaN
+                0 if i == 0 or j == 0 else np.nan for i, j in enumerate(h)
+            ]
+            h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]  # decode again on the restricted matrix
+        u = ""  # accumulates the string output
+        v = [(s, e) for s, e in w["offset_mapping"] if s < e]  # v is reused: only non-empty (start, end) offsets
+        if tag == "list":
+            _tag_data = []
+            for i, (s, e) in enumerate(v, 1):  # i is 1-based, so v[i] is the span after the current one
+                q = self.model.config.id2label[p[i, h[i]]].split("|")  # label string for entry (i, h[i]), split into parts
+                _tag_data.append(
+                    [
+                        str(i),  # 1-based row index
+                        text[s:e],  # slice of the input covered by this span
+                        "_",
+                        q[0],  # first label part
+                        "_",
+                        "|".join(q[1:-1]),  # middle label parts
+                        str(h[i]),  # value of h for this row
+                        q[-1],  # last label part
+                        "_",
+                        "_" if i < len(v) and e < v[i][0] else "SpaceAfter=No"  # "_" only when a gap precedes the next span
+                    ]
+                )
+            return _tag_data
+        else:
+            for i, (s, e) in enumerate(v, 1):  # same ten fields as the list branch, tab-joined, one line per span
+                q = self.model.config.id2label[p[i, h[i]]].split("|")
+                u += "\t".join([str(i),
+                                text[s:e],
+                                "_",
+                                q[0],
+                                "_",
+                                "|".join(q[1:-1]),
+                                str(h[i]),
+                                q[-1],
+                                "_",
+                                "_" if i < len(v) and e < v[i][0] else "SpaceAfter=No"]) + "\n"
+            return u + "\n"  # the output ends with an extra blank line
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
@@ -7,6 +7,7 @@
     "Trie",
     "arabic_digit_to_thai_digit",
     "bahttext",
+    "convert_years",
     "collate",
     "countthai",
     "count_thai_chars",
@@ -34,6 +35,7 @@
     "text_to_thai_digit",
     "thai_digit_to_arabic_digit",
     "thai_keyboard_dist",
+    "thai_strptime",
     "thai_strftime",
     "thai_to_eng",
     "thai_word_tone_detector",
@@ -54,6 +56,8 @@
     now_reign_year,
     reign_year_to_ad,
     thaiword_to_date,
+    convert_years,
+    thai_strptime,
 )
 from pythainlp.util.digitconv import (
     arabic_digit_to_thai_digit,