diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index ad6ca0c1..6de8685f 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -206,6 +206,8 @@ def get_header_id(self, span: dict, page=None) -> str: text = span["text"].strip() # remove leading and trailing whitespace for t in my_toc: title = t[1].strip() # title of TOC entry + title = title.lstrip("\ufeff") # remove byte order mark if any + title = title.replace("\xa0", " ") # replace non-breaking spaces lvl = t[0] # level of TOC entry if text.startswith(title) or title.startswith(text): # found a match: return the header tag