44"""
55Generic functions of tokenizers
66"""
7+
78import re
8- import warnings
99from typing import Iterable , List , Union
1010
1111from pythainlp .tokenize import (
2121 rejoin_formatted_num ,
2222 strip_whitespace ,
2323)
24+ from pythainlp .tools import warn_deprecation
2425from pythainlp .util .trie import Trie , dict_trie
2526
2627
@@ -45,13 +46,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
         # ['และ', 'คุณ', 'เล่น', 'มือถือ'],
         # ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
     """
+    warn_deprecation("pythainlp.util.clause_tokenize", "", "5.0.5", "5.1")
     from pythainlp.tokenize.crfcls import segment
 
-    warnings.warn(
-        """
-        clause_tokenize is no longer supported \
-        and will be removed in version 5.1.
-        """, DeprecationWarning)
     return segment(doc)
 
 
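(Aside, not part of the commit: the hunk above replaces the inline warnings.warn() call with the warn_deprecation() helper imported in the first hunk. A minimal sketch of what a caller now observes, assuming warn_deprecation() ultimately issues a standard DeprecationWarning, could look like this:)

    # Sketch only; assumes warn_deprecation() emits a DeprecationWarning.
    import warnings

    from pythainlp.tokenize import clause_tokenize

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        clauses = clause_tokenize(["และ", "คุณ", "เล่น", "มือถือ"])

    print(clauses)                           # clause segmentation is still returned
    print([str(w.message) for w in caught])  # deprecation notice for clause_tokenize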
@@ -71,6 +68,7 @@ def word_detokenize(
     ::
 
         from pythainlp.tokenize import word_detokenize
+
         print(word_detokenize(["เรา", "เล่น"]))
         # output: เราเล่น
     """
@@ -299,18 +297,19 @@ def word_tokenize(
         segments = segment(text)
     elif engine == "nlpo3":
         from pythainlp.tokenize.nlpo3 import segment
+
         # Currently cannot handle custom_dict from inside word_tokenize(),
         # due to difference in type.
-        #if isinstance(custom_dict, str):
+        # if isinstance(custom_dict, str):
         #     segments = segment(text, custom_dict=custom_dict)
-        #elif not isinstance(custom_dict, str) and not custom_dict:
+        # elif not isinstance(custom_dict, str) and not custom_dict:
         #     raise ValueError(
         #         f"""Tokenizer \"{engine}\":
         #         custom_dict must be a str.
         #         It is a dictionary name as assigned with load_dict().
         #         See pythainlp.tokenize.nlpo3.load_dict()"""
         #     )
-        #else:
+        # else:
         #     segments = segment(text)
         segments = segment(text)
     else:
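(Aside, not part of the commit: the commented-out block above records that word_tokenize() cannot forward custom_dict to the "nlpo3" engine, because that backend expects a dictionary name registered through pythainlp.tokenize.nlpo3.load_dict() rather than a Trie. A hedged sketch of the workaround is below; the load_dict() argument names are assumptions, not confirmed by this diff.)

    # Sketch only: using the nlpo3 backend directly when a custom dictionary
    # is needed. The load_dict() argument names below are assumed, not verified.
    from pythainlp import word_tokenize
    from pythainlp.tokenize.nlpo3 import load_dict, segment

    text = "ฉันรักภาษาไทยมาก"

    # Default path: word_tokenize() simply calls segment(text), as in the hunk above.
    print(word_tokenize(text, engine="nlpo3"))

    # Custom dictionary: register it under a name first, then call segment()
    # directly with that name, since word_tokenize() cannot pass it through.
    load_dict("my_words.txt", "my_dict")  # assumed: (file path, dictionary name)
    print(segment(text, custom_dict="my_dict"))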