Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/api/util.rst
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,11 @@ Modules

The `Trie` class is a data structure for efficient dictionary operations. It's a valuable resource for managing and searching word lists and dictionaries in a structured and efficient manner.

.. autofunction:: longest_common_subsequence
:noindex:

The `longest_common_subsequence` function finds the longest common subsequence between two strings.

.. autofunction:: pythainlp.util.morse.morse_encode
:noindex:

Expand Down
4 changes: 3 additions & 1 deletion pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
Expand Down Expand Up @@ -26,6 +26,7 @@
"is_native_thai",
"isthai",
"isthaichar",
"longest_common_subsequence",
"nectec_to_ipa",
"normalize",
"now_reign_year",
Expand Down Expand Up @@ -92,6 +93,7 @@
thai_to_eng,
)
from pythainlp.util.keywords import find_keyword, rank
from pythainlp.util.lcs import longest_common_subsequence
from pythainlp.util.normalize import (
maiyamok,
normalize,
Expand Down
67 changes: 67 additions & 0 deletions pythainlp/util/lcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0

def longest_common_subsequence(str1: str, str2: str) -> str:
    """
    Find the longest common subsequence between two strings.

    A subsequence keeps the relative order of characters but does not
    need to be contiguous. When more than one common subsequence has the
    maximum length, only one of them is returned.

    The implementation fills a ``(len(str1) + 1) x (len(str2) + 1)`` table,
    so both time and memory grow with ``len(str1) * len(str2)``.

    :param str str1: The first string.
    :param str str2: The second string.
    :return: The longest common subsequence, or an empty string if
        the inputs share no characters or either input is empty.
    :rtype: str

    :Example:
    ::

        from pythainlp.util.lcs import longest_common_subsequence

        print(longest_common_subsequence("ABCBDAB", "BDCAB"))
        # output: "BDAB"
    """
    m = len(str1)
    n = len(str2)

    # An empty input shares nothing; skip building the table entirely.
    if m == 0 or n == 0:
        return ""

    # dp[i][j] holds the LCS length of str1[:i] and str2[:j].
    # Row 0 and column 0 stay at their initial 0 (empty prefix).
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i, char1 in enumerate(str1, start=1):
        previous_row = dp[i - 1]
        current_row = dp[i]
        for j, char2 in enumerate(str2, start=1):
            if char1 == char2:
                current_row[j] = previous_row[j - 1] + 1
            else:
                current_row[j] = max(previous_row[j], current_row[j - 1])

    # Walk back from the bottom-right corner, collecting matched
    # characters. They are found last-to-first, so reverse at the end.
    collected = []
    i, j = m, n
    while i > 0 and j > 0:
        if str1[i - 1] == str2[j - 1]:
            # A matching character is part of the subsequence.
            collected.append(str1[i - 1])
            i -= 1
            j -= 1
        elif dp[i - 1][j] > dp[i][j - 1]:
            # Dropping the last character of str1 keeps the longer result.
            i -= 1
        else:
            # Otherwise (including ties) drop the last character of str2.
            j -= 1

    collected.reverse()
    return "".join(collected)
11 changes: 11 additions & 0 deletions tests/core/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
ipa_to_rtgs,
isthai,
isthaichar,
longest_common_subsequence,
nectec_to_ipa,
normalize,
now_reign_year,
Expand Down Expand Up @@ -842,3 +843,13 @@ def test_th_zodiac(self):

# def test_abbreviation_to_full_text(self):
# self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list))

def test_longest_common_subsequence(self):
    # Each entry is (first string, second string, expected result).
    cases = (
        ("ABCBDAB", "BDCAB", "BDAB"),
        ("AGGTAB", "GXTXAYB", "GTAB"),
        ("ABCDGH", "AEDFHR", "ADH"),
        ("ABC", "AC", "AC"),
        # No characters in common.
        ("ABC", "DEF", ""),
        # Empty input on either side, or both.
        ("", "ABC", ""),
        ("ABC", "", ""),
        ("", "", ""),
    )
    for first, second, expected in cases:
        self.assertEqual(longest_common_subsequence(first, second), expected)
Loading