From 5a0c712a8818ac7b258aa3ed5df9c81933c65c6f Mon Sep 17 00:00:00 2001
From: soelderer
Date: Sat, 27 Sep 2025 19:57:03 +0200
Subject: [PATCH 1/2] TocHeaders: strip byte order mark
---
pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
index ad6ca0c1..88d678d2 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -206,6 +206,7 @@ def get_header_id(self, span: dict, page=None) -> str:
text = span["text"].strip() # remove leading and trailing whitespace
for t in my_toc:
title = t[1].strip() # title of TOC entry
+ title = title.lstrip("\ufeff") # remove byte order mark if any
lvl = t[0] # level of TOC entry
if text.startswith(title) or title.startswith(text):
# found a match: return the header tag
From e8fa583720a85360dfed556840086abbe4bbf22d Mon Sep 17 00:00:00 2001
From: soelderer
Date: Thu, 2 Oct 2025 08:27:37 +0200
Subject: [PATCH 2/2] TocHeaders: replace non-breaking spaces
---
pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
index 88d678d2..6de8685f 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -207,6 +207,7 @@ def get_header_id(self, span: dict, page=None) -> str:
for t in my_toc:
title = t[1].strip() # title of TOC entry
title = title.lstrip("\ufeff") # remove byte order mark if any
+ title = title.replace("\xa0", " ") # replace non-breaking spaces
lvl = t[0] # level of TOC entry
if text.startswith(title) or title.startswith(text):
# found a match: return the header tag