From 6e8d81987f44d3b1c83374303149bd57be316123 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 8 Jul 2024 12:00:04 -0400 Subject: [PATCH] Improve bitap algorithm with obvious shortcuts This patches python3 diff_match_patch.py to take two shortcuts in bitap score calculation. First, it does an integer comparison before forcing a cast to float, and second, if it gets a score of 0.0, it returns it. All relevant tests pass, as I would expect, because one can demonstrate that this works just by reading the code carefully. I know that this patch could be transferred to the C# version as well; I suspect that all versions have this inefficiency. --- python3/diff_match_patch.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python3/diff_match_patch.py b/python3/diff_match_patch.py index 3bf825c5..70425ade 100644 --- a/python3/diff_match_patch.py +++ b/python3/diff_match_patch.py @@ -1274,6 +1274,8 @@ def match_bitapScore(e, x): Returns: Overall score for match (0.0 = good, 1.0 = bad). """ + if (e == 0 and x == loc): + return 0.0 accuracy = float(e) / len(pattern) proximity = abs(loc - x) if not self.Match_Distance: @@ -1287,10 +1289,14 @@ def match_bitapScore(e, x): best_loc = text.find(pattern, loc) if best_loc != -1: score_threshold = min(match_bitapScore(0, best_loc), score_threshold) + if score_threshold == 0.0: # Can't improve this + return best_loc # What about in the other direction? (speedup) best_loc = text.rfind(pattern, loc + len(pattern)) if best_loc != -1: score_threshold = min(match_bitapScore(0, best_loc), score_threshold) + if score_threshold == 0.0: + return best_loc # Initialise the bit arrays. matchmask = 1 << (len(pattern) - 1)