Skip to content

Commit 6446b25

Browse files
committed
Add parsing of PDF titles
This commit attempts to parse the title metadata in a PDF file if it exists; otherwise it falls back to printing only the media type and size. We use the `pdf` crate to parse all of the response body (for some reason metadata and table of contents are kept at the end of PDFs) and then ask the PDF for the title that may be defined in the "[info dictionary]". [info dictionary]: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
1 parent b4175be commit 6446b25

File tree

4 files changed

+244
-9
lines changed

4 files changed

+244
-9
lines changed

Cargo.lock

Lines changed: 217 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ stderrlog = "0.5.1"
5151
atty = "0.2.14"
5252
scraper = { version = "0.12.0", default-features = false, features = [] }
5353
phf = "0.7.24"
54+
pdf = "0.7.1"
5455

5556
[dependencies.image]
5657
version = "0.22.5"

src/http.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,18 @@ pub fn get_title(resp: &mut Response, rtd: &Rtd, dump: bool) -> Result<String, E
199199
trace!("[{}] {}", k, v.to_str().unwrap());
200200
});
201201

202+
if let Some(mime) = content_type.clone() {
203+
if mime == mime::APPLICATION_PDF {
204+
let mut body = Vec::new();
205+
if let Err(err) = resp.read_to_end(&mut body) {
206+
return Err(format_err!("failed to read pdf body: {}", err));
207+
}
208+
return get_pdf_metadata(&body)
209+
.or_else(|| get_mime(&rtd, &mime, &size))
210+
.ok_or(format_err!("{}: failed to parse title", resp.url()));
211+
}
212+
}
213+
202214
// vector to hold page content, which is progressively built from chunks of
203215
// downloaded data until a title is found (up to CHUNKS_MAX chunks)
204216
let mut body = Vec::new();

0 commit comments

Comments
 (0)