From 61f8f5ffae93789a7e31a102f8d1d87df9283324 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 8 Jan 2025 23:56:00 +0700 Subject: [PATCH 1/6] Add longest common subsequence algorithm Fixes #211 Add the longest common subsequence algorithm to the `pythainlp.util` module. * Create a new file `pythainlp/util/lcs.py` to implement the longest common subsequence algorithm. * Define a function `longest_common_subsequence` that takes two strings as input and returns their longest common subsequence. * Update `pythainlp/util/__init__.py` to import the `longest_common_subsequence` function from `pythainlp.util.lcs` and add it to the `__all__` list. * Add unit tests for the `longest_common_subsequence` function in `tests/core/test_util.py` to ensure the correctness of the implementation. --- For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/PyThaiNLP/pythainlp/issues/211?shareId=XXXX-XXXX-XXXX-XXXX). --- pythainlp/util/__init__.py | 5 ++- pythainlp/util/lcs.py | 67 ++++++++++++++++++++++++++++++++++++++ tests/core/test_util.py | 30 +++++++---------- 3 files changed, 82 insertions(+), 20 deletions(-) create mode 100644 pythainlp/util/lcs.py diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 9939ac089..bb1742396 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -1,6 +1,7 @@ -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- # SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project # SPDX-FileType: SOURCE +# SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 """ Utility functions, like date conversion and digit conversion @@ -26,6 +27,7 @@ "is_native_thai", "isthai", "isthaichar", + "longest_common_subsequence", "nectec_to_ipa", "normalize", "now_reign_year", @@ -92,6 +94,7 @@ thai_to_eng, ) from pythainlp.util.keywords import find_keyword, rank +from pythainlp.util.lcs import longest_common_subsequence from pythainlp.util.normalize import ( maiyamok, 
normalize, diff --git a/pythainlp/util/lcs.py b/pythainlp/util/lcs.py new file mode 100644 index 000000000..781c26950 --- /dev/null +++ b/pythainlp/util/lcs.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 + +def longest_common_subsequence(str1: str, str2: str) -> str: + """ + Find the longest common subsequence between two strings. + + :param str str1: The first string. + :param str str2: The second string. + :return: The longest common subsequence. + :rtype: str + + :Example: + :: + + from pythainlp.util.lcs import longest_common_subsequence + + print(longest_common_subsequence("ABCBDAB", "BDCAB")) + # output: "BCAB" + """ + m = len(str1) + n = len(str2) + + # Create a 2D array to store lengths of longest common subsequence. + dp = [[0] * (n + 1) for _ in range(m + 1)] + + # Build the dp array from bottom up. + for i in range(m + 1): + for j in range(n + 1): + if i == 0 or j == 0: + dp[i][j] = 0 + elif str1[i - 1] == str2[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + else: + dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) + + # Following code is used to print LCS + index = dp[m][n] + + # Create a character array to store the lcs string + lcs = [""] * (index + 1) + lcs[index] = "" + + # Start from the right-most-bottom-most corner and + # one by one store characters in lcs[] + i = m + j = n + while i > 0 and j > 0: + + # If current character in str1 and str2 are same, then + # current character is part of LCS + if str1[i - 1] == str2[j - 1]: + lcs[index - 1] = str1[i - 1] + i -= 1 + j -= 1 + index -= 1 + + # If not same, then find the larger of two and + # go in the direction of larger value + elif dp[i - 1][j] > dp[i][j - 1]: + i -= 1 + else: + j -= 1 + + return "".join(lcs) diff --git a/tests/core/test_util.py b/tests/core/test_util.py index f0a8e31ba..ca6b2dd68 100644 --- a/tests/core/test_util.py +++ b/tests/core/test_util.py @@ -32,6 +32,7 @@ 
ipa_to_rtgs, isthai, isthaichar, + longest_common_subsequence, nectec_to_ipa, normalize, now_reign_year, @@ -505,25 +506,6 @@ def test_normalize(self): self.assertEqual(normalize("กา าาะา"), "กาะา") # remove repeating tone marks - self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48") - - # remove repeating different tone marks - self.assertEqual(normalize("\u0e01\u0e48\u0e49"), "\u0e01\u0e49") - self.assertEqual( - normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"), "\u0e01\u0e49" - ) - - # remove tone mark at the beginning of text - self.assertEqual(remove_dangling("\u0e48\u0e01"), "\u0e01") - self.assertEqual(remove_dangling("\u0e48\u0e48\u0e01"), "\u0e01") - self.assertEqual(remove_dangling("\u0e48\u0e49\u0e01"), "\u0e01") - self.assertEqual(remove_dangling("\u0e48\u0e01\u0e48"), "\u0e01\u0e48") - - # remove duplicate spaces - self.assertEqual(remove_dup_spaces(" ab c d "), "ab c d") - self.assertEqual(remove_dup_spaces("\nab c \n d \n"), "ab c\nd") - - # remove tone marks self.assertEqual(remove_tonemark("จิ้น"), "จิน") self.assertEqual(remove_tonemark("เก๋า"), "เกา") @@ -842,3 +824,13 @@ def test_th_zodiac(self): # def test_abbreviation_to_full_text(self): # self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list)) + + def test_longest_common_subsequence(self): + self.assertEqual(longest_common_subsequence("ABCBDAB", "BDCAB"), "BCAB") + self.assertEqual(longest_common_subsequence("AGGTAB", "GXTXAYB"), "GTAB") + self.assertEqual(longest_common_subsequence("ABCDGH", "AEDFHR"), "ADH") + self.assertEqual(longest_common_subsequence("ABC", "AC"), "AC") + self.assertEqual(longest_common_subsequence("ABC", "DEF"), "") + self.assertEqual(longest_common_subsequence("", "ABC"), "") + self.assertEqual(longest_common_subsequence("ABC", ""), "") + self.assertEqual(longest_common_subsequence("", ""), "") From 0cb401251927e7bfc7aaf1e16c41ce6c6fdb2818 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 8 Jan 2025 23:57:21 +0700 Subject: 
[PATCH 2/6] Update __init__.py --- pythainlp/util/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index bb1742396..f101ffe36 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project # SPDX-FileType: SOURCE -# SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 """ Utility functions, like date conversion and digit conversion From d281b789aac5ccc6a7269d950b97b473ab589d7a Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 9 Jan 2025 00:00:05 +0700 Subject: [PATCH 3/6] Update test_util.py --- tests/core/test_util.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/core/test_util.py b/tests/core/test_util.py index ca6b2dd68..1b83f967d 100644 --- a/tests/core/test_util.py +++ b/tests/core/test_util.py @@ -506,6 +506,25 @@ def test_normalize(self): self.assertEqual(normalize("กา าาะา"), "กาะา") # remove repeating tone marks + self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48") + + # remove repeating different tone marks + self.assertEqual(normalize("\u0e01\u0e48\u0e49"), "\u0e01\u0e49") + self.assertEqual( + normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"), "\u0e01\u0e49" + ) + + # remove tone mark at the beginning of text + self.assertEqual(remove_dangling("\u0e48\u0e01"), "\u0e01") + self.assertEqual(remove_dangling("\u0e48\u0e48\u0e01"), "\u0e01") + self.assertEqual(remove_dangling("\u0e48\u0e49\u0e01"), "\u0e01") + self.assertEqual(remove_dangling("\u0e48\u0e01\u0e48"), "\u0e01\u0e48") + + # remove duplicate spaces + self.assertEqual(remove_dup_spaces(" ab c d "), "ab c d") + self.assertEqual(remove_dup_spaces("\nab c \n d \n"), "ab c\nd") + + # remove tone marks self.assertEqual(remove_tonemark("จิ้น"), "จิน") self.assertEqual(remove_tonemark("เก๋า"), "เกา") From 16404936c054c9574033de8a946bb7d189406652 Mon Sep 17 00:00:00 2001 
From: Wannaphong Phatthiyaphaibun Date: Thu, 9 Jan 2025 00:09:54 +0700 Subject: [PATCH 4/6] Update test_util.py --- tests/core/test_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/test_util.py b/tests/core/test_util.py index 1b83f967d..5b8b12a91 100644 --- a/tests/core/test_util.py +++ b/tests/core/test_util.py @@ -845,7 +845,7 @@ def test_th_zodiac(self): # self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list)) def test_longest_common_subsequence(self): - self.assertEqual(longest_common_subsequence("ABCBDAB", "BDCAB"), "BCAB") + self.assertEqual(longest_common_subsequence("ABCBDAB", "BDCAB"), "BDAB") self.assertEqual(longest_common_subsequence("AGGTAB", "GXTXAYB"), "GTAB") self.assertEqual(longest_common_subsequence("ABCDGH", "AEDFHR"), "ADH") self.assertEqual(longest_common_subsequence("ABC", "AC"), "AC") From 9e51190175b9793e616c5cc0b9947c2d0e8e0edd Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 9 Jan 2025 00:10:15 +0700 Subject: [PATCH 5/6] Update lcs.py --- pythainlp/util/lcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/util/lcs.py b/pythainlp/util/lcs.py index 781c26950..104f5674c 100644 --- a/pythainlp/util/lcs.py +++ b/pythainlp/util/lcs.py @@ -18,7 +18,7 @@ def longest_common_subsequence(str1: str, str2: str) -> str: from pythainlp.util.lcs import longest_common_subsequence print(longest_common_subsequence("ABCBDAB", "BDCAB")) - # output: "BCAB" + # output: "BDAB" """ m = len(str1) n = len(str2) From 4c5c9484f32bafc2d573fabe4c36d10198ddbb75 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Thu, 9 Jan 2025 00:15:00 +0700 Subject: [PATCH 6/6] Add longest_common_subsequence docs --- docs/api/util.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/api/util.rst b/docs/api/util.rst index 76452e12c..1979e44ef 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -283,6 +283,11 @@ Modules The `Trie` class is a data structure for 
efficient dictionary operations. It's a valuable resource for managing and searching word lists and dictionaries in a structured and efficient manner. +.. autofunction:: longest_common_subsequence + :noindex: + + The `longest_common_subsequence` function finds the longest common subsequence between two strings. + .. autofunction:: pythainlp.util.morse.morse_encode :noindex: