From 5a0c712a8818ac7b258aa3ed5df9c81933c65c6f Mon Sep 17 00:00:00 2001 From: soelderer Date: Sat, 27 Sep 2025 19:57:03 +0200 Subject: [PATCH 1/2] TocHeaders: strip byte order mark --- pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index ad6ca0c1..88d678d2 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -206,6 +206,7 @@ def get_header_id(self, span: dict, page=None) -> str: text = span["text"].strip() # remove leading and trailing whitespace for t in my_toc: title = t[1].strip() # title of TOC entry + title = title.lstrip("\ufeff") # remove byte order mark if any lvl = t[0] # level of TOC entry if text.startswith(title) or title.startswith(text): # found a match: return the header tag From e8fa583720a85360dfed556840086abbe4bbf22d Mon Sep 17 00:00:00 2001 From: soelderer Date: Thu, 2 Oct 2025 08:27:37 +0200 Subject: [PATCH 2/2] TocHeaders: replace non-breaking spaces --- pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index 88d678d2..6de8685f 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -207,6 +207,7 @@ def get_header_id(self, span: dict, page=None) -> str: for t in my_toc: title = t[1].strip() # title of TOC entry title = title.lstrip("\ufeff") # remove byte order mark if any + title = title.replace("\xa0", " ") # replace non-breaking spaces lvl = t[0] # level of TOC entry if text.startswith(title) or title.startswith(text): # found a match: return the header tag