def longest_common_subsequence(str1: str, str2: str) -> str:
    """
    Find the longest common subsequence between two strings.

    A subsequence keeps the relative order of characters but does not
    need to be contiguous. Comparison is done per Python character
    (code point). When several subsequences share the maximum length,
    exactly one of them is returned; which one is decided by the
    backtracking rule documented in the code below.

    The function uses dynamic programming and needs time and memory
    proportional to ``len(str1) * len(str2)``.

    :param str str1: The first string.
    :param str str2: The second string.
    :return: The longest common subsequence, or an empty string if the
        two strings share no characters (or either one is empty).
    :rtype: str

    :Example:
    ::

        from pythainlp.util.lcs import longest_common_subsequence

        print(longest_common_subsequence("ABCBDAB", "BDCAB"))
        # output: "BDAB"
    """
    m = len(str1)
    n = len(str2)

    # Nothing can be shared with an empty string; skip building the table.
    if m == 0 or n == 0:
        return ""

    # dp[i][j] holds the LCS length of str1[:i] and str2[:j].
    # Row 0 and column 0 stay at 0 (empty prefix), so the loops below
    # start at 1 and never need to re-initialise them.
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        char1 = str1[i - 1]
        row = dp[i]
        prev_row = dp[i - 1]
        for j in range(1, n + 1):
            if char1 == str2[j - 1]:
                row[j] = prev_row[j - 1] + 1
            else:
                row[j] = max(prev_row[j], row[j - 1])

    # Walk back from the bottom-right corner of the table to recover the
    # characters. They are discovered last-to-first, so they are
    # collected in reverse order and flipped once at the end.
    reversed_chars = []
    i = m
    j = n
    while i > 0 and j > 0:
        if str1[i - 1] == str2[j - 1]:
            # Matching characters are part of the subsequence.
            reversed_chars.append(str1[i - 1])
            i -= 1
            j -= 1
        elif dp[i - 1][j] > dp[i][j - 1]:
            # Strictly longer result above: drop a character of str1.
            i -= 1
        else:
            # Otherwise (including ties): drop a character of str2.
            j -= 1

    reversed_chars.reverse()
    return "".join(reversed_chars)
self.assertEqual(longest_common_subsequence("ABC", "AC"), "AC") + self.assertEqual(longest_common_subsequence("ABC", "DEF"), "") + self.assertEqual(longest_common_subsequence("", "ABC"), "") + self.assertEqual(longest_common_subsequence("ABC", ""), "") + self.assertEqual(longest_common_subsequence("", ""), "")