class Parse:
    """
    Dependency parser driven by a token-classification model whose labels
    have the form ``<tag>|<features...>|<relation>``.

    The label layout is taken from how ``__call__`` splits the label on
    ``"|"``; it presumably follows the Universal Dependencies convention
    (UPOS | FEATS | DEPREL) -- confirm against the model card.
    """

    def __init__(
        self, model: str = "KoichiYasuoka/deberta-base-thai-ud-goeswith"
    ) -> None:
        """
        Load the tokenizer and the token-classification model.

        :param str model: name or path handed to ``from_pretrained``;
            ``None`` selects the default model.
        """
        if model is None:
            model = "KoichiYasuoka/deberta-base-thai-ud-goeswith"
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForTokenClassification.from_pretrained(model)

    def __call__(
        self,
        text: str, tag: str = "str"
    ) -> Union[List[List[str]], str]:
        """
        Parse *text* and return one ten-column row per token.

        :param str text: text to parse
        :param str tag: ``"list"`` returns a list of rows (each a list of
            ten strings); any other value returns the rows as
            tab-separated lines followed by one empty line
        :return: the parsed rows
        :rtype: Union[List[List[str]], str]
        """
        w = self.tokenizer(text, return_offsets_mapping=True)
        v = w["input_ids"]
        # One model input per inner token: that token is replaced by the
        # mask id and its original id is appended at the end.
        x = [
            v[0:i] + [self.tokenizer.mask_token_id] + v[i + 1:] + [j]
            for i, j in enumerate(v[1:-1], 1)
        ]
        with torch.no_grad():
            # Drop the first position and the last two positions of every
            # row (presumably the special tokens and the appended id).
            e = self.model(
                input_ids=torch.tensor(x)
            ).logits.numpy()[:, 1:-2, :]
        # Per label, in id order: 1 for label id 0, -1 for "...|root",
        # 0 otherwise.
        r = [
            1 if i == 0 else -1
            if j.endswith("|root") else 0
            for i, j in sorted(self.model.config.id2label.items())
        ]
        # Diagonal cells keep only "|root" labels; off-diagonal cells keep
        # only labels that are neither root nor label id 0.  Everything
        # else becomes NaN.
        e += np.where(np.add.outer(np.identity(e.shape[0]), r) == 0, 0, np.nan)
        g = self.model.config.label2id["X|_|goeswith"]
        # r == 0 marks the cells where "goeswith" stays allowed: right of
        # the diagonal, and only while every cell before it in the same
        # row also has "goeswith" as its best label.
        r = np.tri(e.shape[0])
        for i in range(e.shape[0]):
            for j in range(i + 2, e.shape[1]):
                r[i, j] = r[i, j - 1] if np.nanargmax(e[i, j - 1]) == g else 1
        e[:, :, g] += np.where(r == 0, 0, np.nan)
        # m holds the best score per cell, p the matching label id; both
        # get an extra row and column 0.
        m = np.full((e.shape[0] + 1, e.shape[1] + 1), np.nan)
        m[1:, 1:] = np.nanmax(e, axis=2).transpose()
        p = np.zeros(m.shape)
        p[1:, 1:] = np.nanargmax(e, axis=2).transpose()
        # Move each diagonal entry (the root score) into column 0.
        for i in range(1, m.shape[0]):
            m[i, 0], m[i, i], p[i, 0] = m[i, i], np.nan, p[i, i]
        h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
        if [0 for i in h if i == 0] != [0]:
            # The result does not have exactly one head equal to 0: keep
            # only the best-scoring root candidate and decode again.
            m[:, 0] += np.where(
                m[:, 0] == np.nanmax(
                    m[[i for i, j in enumerate(h) if j == 0], 0]), 0, np.nan
            )
            m[[i for i, j in enumerate(h) if j == 0]] += [
                0 if i == 0 or j == 0 else np.nan for i, j in enumerate(h)
            ]
            h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
        # Character spans of the real tokens (empty spans are skipped).
        v = [(s, e) for s, e in w["offset_mapping"] if s < e]
        rows = []
        for i, (start, end) in enumerate(v, 1):
            # p is a float array; cast so id2label is indexed by an int.
            q = self.model.config.id2label[int(p[i, h[i]])].split("|")
            rows.append(
                [
                    str(i),
                    text[start:end],
                    "_",
                    q[0],
                    "_",
                    "|".join(q[1:-1]),
                    str(h[i]),
                    q[-1],
                    "_",
                    # v[i] is the next token because i is 1-based.
                    "_" if i < len(v) and end < v[i][0] else "SpaceAfter=No",
                ]
            )
        if tag == "list":
            return rows
        return "".join("\t".join(row) + "\n" for row in rows) + "\n"
import re
from datetime import datetime, timedelta
from typing import Optional, Union

try:
    from zoneinfo import ZoneInfo
except ImportError:  # Python < 3.9
    from backports.zoneinfo import ZoneInfo


# Accepted spellings of each month, January first: full name, short name,
# dotted abbreviation, zero-padded number, plain number.
thai_full_month_lists = [
    ["มกราคม", "มกรา", "ม.ค.", "01", "1"],
    ["กุมภาพันธ์", "กุมภา", "ก.พ.", "02", "2"],
    ["มีนาคม", "มีนา", "มี.ค.", "03", "3"],
    ["เมษายน", "เมษา", "เม.ย.", "04", "4"],
    ["พฤษภาคม", "พฤษภา", "พ.ค.", "05", "5"],
    ["มิถุนายน", "มิถุนา", "มิ.ย.", "06", "6"],
    ["กรกฎาคม", "กรกฎา", "ก.ค.", "07", "7"],
    ["สิงหาคม", "สิงหา", "ส.ค.", "08", "8"],
    ["กันยายน", "กันยา", "ก.ย.", "09", "9"],
    ["ตุลาคม", "ตุลา", "ต.ค.", "10"],
    ["พฤศจิกายน", "พฤศจิกา", "พ.ย.", "11"],
    ["ธันวาคม", "ธันวา", "ธ.ค.", "12"],
]
# spelling -> month number (1-12)
_MONTH_LOOKUP = {
    alias: number
    for number, aliases in enumerate(thai_full_month_lists, 1)
    for alias in aliases
}
# Longest spelling first, so "มกราคม" wins over "มกรา" and "12" over "1".
_MONTH_ALIASES = sorted(_MONTH_LOOKUP, key=len, reverse=True)
# Every spelling is escaped: the dots in "ม.ค." must match literally.
thai_full_month_lists_regex = (
    "(" + "|".join(re.escape(alias) for alias in _MONTH_ALIASES) + ")"
)
year_all_regex = r"(\d{4}|\d{2})"
# Day of month, 1-31, with or without a leading zero.
dates_list = r"(3[01]|[12]\d|0?[1-9])"

# Directive letter -> regex with exactly one capturing group.
_DIRECTIVE_PATTERNS = {
    "d": dates_list,
    "B": thai_full_month_lists_regex,
    "Y": year_all_regex,
    "H": r"(\d\d|\d)",
    "M": r"(\d\d|\d)",
    "S": r"(\d\d|\d)",
    "f": r"(\d+)",
}
# Directives that are parsed exactly like another one.
_DIRECTIVE_ALIASES = {"b": "B", "m": "B", "y": "Y"}

# Number of years to add to a year of each era to get the Buddhist Era year.
_ERA_OFFSET_TO_BE = {"be": 0, "ad": 543, "re": 2324, "ah": 1122}


def convert_years(year: Union[str, int], src="be", target="ad") -> str:
    """
    Convert a year from one era to another.

    :param year: the year to convert (numeric string or int)
    :param str src: era of the given year
    :param str target: era to convert to
    :return: the converted year
    :rtype: str
    :raises NotImplementedError: if *src* or *target* is not a known era

    **Options for src and target**
        * *be* - Buddhist Era
        * *ad* - Anno Domini
        * *re* - Rattanakosin Era
        * *ah* - Anno Hegirae

    When *src* equals *target* the year is returned unchanged.

    **Warning**: The conversion only adds or subtracts a fixed number of
    years. It is correct for dates after 1941, when Thailand moved the
    start of its calendar year; for earlier dates, check the calendar
    that was in use.
    """
    try:
        to_be = _ERA_OFFSET_TO_BE[src]
        from_be = _ERA_OFFSET_TO_BE[target]
    except KeyError:
        raise NotImplementedError(
            f"This function doesn't support {src} to {target}"
        ) from None
    return str(int(year) + to_be - from_be)


def _find_month(text):
    """Return the month number (1-12) named by *text*, or None."""
    month = _MONTH_LOOKUP.get(text.strip())
    if month is not None:
        return month
    # Not an exact spelling: fall back to the longest spelling contained
    # in the text, so "12" can never be read as month "1".
    for alias in _MONTH_ALIASES:
        if alias in text:
            return _MONTH_LOOKUP[alias]
    return None


def _compile_format(fmt):
    """Turn a strptime-style format into (compiled regex, directive keys)."""
    keys = []
    pieces = []
    position = 0
    for hit in re.finditer(r"%(-?)(.)", fmt):
        # Text between directives is literal, never regex syntax.
        pieces.append(re.escape(fmt[position:hit.start()]))
        position = hit.end()
        code = hit.group(2)
        if code == "%":
            pieces.append("%")
            continue
        code = _DIRECTIVE_ALIASES.get(code, code)
        if code not in _DIRECTIVE_PATTERNS:
            raise ValueError(
                f"unsupported directive {hit.group(0)!r} in format {fmt!r}"
            )
        keys.append(code)
        pieces.append(_DIRECTIVE_PATTERNS[code])
    pieces.append(re.escape(fmt[position:]))
    return re.compile("".join(pieces)), keys


def thai_strptime(
    text: str,
    fmt: str,
    year: str = "be",
    add_year: Optional[int] = None,
    tzinfo=ZoneInfo("Asia/Bangkok")
) -> datetime:
    """
    Parse a Thai date/time string into a ``datetime``.

    :param str text: text that contains the date
    :param str fmt: string containing date and time directives
    :param str year: era of the year in the text
        (*ad* is Anno Domini and *be* is Buddhist Era)
    :param int add_year: century base added to a two-digit year
        (default: 2500 for *be*, 2000 for *ad*)
    :param object tzinfo: tzinfo of the result (default is Asia/Bangkok)
    :return: the parsed date and time
    :rtype: datetime.datetime
    :raises ValueError: if the format contains an unsupported directive,
        lacks the day, month or year directive, or does not match the text

    Supported directives (``%-d`` style flags are accepted and ignored):
        * *%d* - Day (1 - 31)
        * *%B*, *%b*, *%m* - Thai month (03, 3, มี.ค., or มีนาคม)
        * *%Y*, *%y* - Year (66, 2566, or 2023)
        * *%H* - Hour (0 - 23)
        * *%M* - Minute (0 - 59)
        * *%S* - Second (0 - 59)
        * *%f* - Fraction of a second (".5" is 500000 microseconds)

    The format is searched for anywhere in the text, so the text may have
    other content around the date.

    :Example:
    ::

        from pythainlp.util import thai_strptime

        thai_strptime("15 ก.ค. 2565 09:00:01","%d %B %Y %H:%M:%S")
        # output:
        # datetime.datetime(
        #   2022,
        #   7,
        #   15,
        #   9,
        #   0,
        #   1,
        #   tzinfo=backports.zoneinfo.ZoneInfo(key='Asia/Bangkok')
        # )
    """
    pattern, keys = _compile_format(fmt)
    found = pattern.search(text)
    if found is None:
        raise ValueError(f"text {text!r} does not match format {fmt!r}")
    data = dict(zip(keys, found.groups()))
    missing = [code for code in ("d", "B", "Y") if code not in data]
    if missing:
        raise ValueError(
            "format must contain day, month and year directives; missing: "
            + ", ".join("%" + code for code in missing)
        )

    year_number = int(data["Y"])
    if year_number < 100 and year in ("be", "ad"):
        # Two-digit year: add the century base.
        if add_year is not None:
            year_number += int(add_year)
        elif year == "be":
            year_number += 2500
        else:
            year_number += 2000
    if year == "be":
        year_number = int(convert_years(year_number, src="be", target="ad"))

    # "%f" is a fraction of a second: keep at most six digits and pad on
    # the right, as datetime.strptime does.
    microsecond = int(data["f"][:6].ljust(6, "0")) if "f" in data else 0

    return datetime(
        year=year_number,
        month=_find_month(data["B"]),
        day=int(data["d"]),
        hour=int(data.get("H", 0)),
        minute=int(data.get("M", 0)),
        second=int(data.get("S", 0)),
        microsecond=microsecond,
        tzinfo=tzinfo
    )
diff --git a/setup.py b/setup.py index 681e6036c..a72ad8c9c 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ requirements = [ "requests>=2.22.0", + "backports.zoneinfo; python_version<'3.9'" ] extras = { diff --git a/tests/test_util.py b/tests/test_util.py index 342ba0cca..ead1b3a14 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -52,6 +52,8 @@ syllable_open_close_detector, tone_detector, thai_word_tone_detector, + convert_years, + thai_strptime, ) @@ -600,7 +602,7 @@ def test_count_thai_chars(self): self.assertEquals( count_thai_chars("ทดสอบภาษาไทย"), { - 'vowels': 3, + 'vowels': 3, 'lead_vowels': 1, 'follow_vowels': 2, 'above_vowels': 0, @@ -776,3 +778,51 @@ def test_thai_word_tone_detector(self): thai_word_tone_detector("ราคา"), [('รา', 'm'), ('คา', 'm')] ) + + def test_thai_strptime(self): + self.assertIsNotNone( + thai_strptime( + "05-7-65 09:00:01.10600", + "%d-%B-%Y %H:%M:%S.%f", + year="be" + ) + ) + self.assertIsNotNone( + thai_strptime( + "24-6-75 09:00:00", + "%d-%B-%Y %H:%M:%S", + year="be", + add_year="2400" + ) + ) + self.assertIsNotNone( + thai_strptime( + "05-7-22 09:00:01.10600", + "%d-%B-%Y %H:%M:%S.%f", + year="ad" + ) + ) + self.assertIsNotNone( + thai_strptime( + "05-7-99 09:00:01.10600", + "%d-%B-%Y %H:%M:%S.%f", + year="ad", + add_year="1900" + ) + ) + + def test_convert_years(self): + self.assertEqual(convert_years("2566", src="be", target="ad"), "2023") + self.assertEqual(convert_years("2566", src="be", target="re"), "242") + self.assertEqual(convert_years("2566", src="be", target="ah"), "1444") + self.assertEqual(convert_years("2023", src="ad", target="be"), "2566") + self.assertEqual(convert_years("2023", src="ad", target="ah"), "1444") + self.assertEqual(convert_years("2023", src="ad", target="re"), "242") + self.assertEqual(convert_years("1444", src="ah", target="be"), "2566") + self.assertEqual(convert_years("1444", src="ah", target="ad"), "2023") + self.assertEqual(convert_years("1444", src="ah", target="re"), "242") 
+ self.assertEqual(convert_years("242", src="re", target="be"), "2566") + self.assertEqual(convert_years("242", src="re", target="ad"), "2023") + self.assertEqual(convert_years("242", src="re", target="ah"), "1444") + with self.assertRaises(NotImplementedError): + self.assertIsNotNone(convert_years("2023", src="cat", target="dog"))