From d223837bde56347146ae942588ad5a7703ef511e Mon Sep 17 00:00:00 2001
From: kaijie-qin
Date: Wed, 11 Dec 2024 16:17:01 +0800
Subject: [PATCH] multi_column: match bboxes by overlap ratio in in_bbox

in_bbox() no longer requires bb to lie completely inside a bbox. A bbox
now matches when at least `threshold` (default 0.95) of the area of bb
overlaps it. The ratio test lives in the new helper almost_in_bbox().

---
Review notes: docstrings were added to in_bbox() and almost_in_bbox() by
hand, and the diffstat and hunk header were adjusted to match. The
post-image blob id on the "index" line was not regenerated.

 pymupdf4llm/pymupdf4llm/helpers/multi_column.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py
index 3c96bcb0..6a87b08c 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py
@@ -93,13 +93,26 @@ def is_white(text):
         """Check for relevant text."""
         return WHITE.issuperset(text)
 
-    def in_bbox(bb, bboxes):
-        """Return 1-based number if a bbox contains bb, else return 0."""
+    def in_bbox(bb, bboxes, threshold=0.95):
+        """Return 1-based number of the first bbox that almost contains bb, else 0.
+
+        A bbox matches when at least `threshold` (a fraction between 0 and 1)
+        of the area of bb lies inside it.
+        """
         for i, bbox in enumerate(bboxes, start=1):
-            if bb in bbox:
+            if almost_in_bbox(bb, bbox, threshold):
                 return i
         return 0
 
+    def almost_in_bbox(bb, box, threshold):
+        """Return True if at least `threshold` of the area of bb lies inside box."""
+        intersect = bb & box
+        if intersect.is_empty:  # no overlapping area
+            return False
+
+        ratio = intersect.get_area() / bb.get_area()
+        return ratio >= threshold
+
     def intersects_bboxes(bb, bboxes):
         """Return True if a bbox touches bb, else return False."""
         for bbox in bboxes: