Skip to content

Commit 9682a6e

Browse files
committed
Improve DOIResolution
- Remove the link-similarity heuristic
- Add support for the citation meta tag <meta name="citation_pdf_url">
1 parent ca98634 commit 9682a6e

File tree

2 files changed

+34
-40
lines changed

2 files changed

+34
-40
lines changed

src/main/java/org/jabref/logic/importer/fetcher/DoiResolution.java

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
package org.jabref.logic.importer.fetcher;
22

33
import java.io.IOException;
4+
import java.net.MalformedURLException;
45
import java.net.URL;
56
import java.util.ArrayList;
6-
import java.util.Arrays;
77
import java.util.List;
88
import java.util.Locale;
99
import java.util.Objects;
@@ -12,7 +12,6 @@
1212

1313
import org.jabref.logic.importer.FulltextFetcher;
1414
import org.jabref.logic.net.URLDownload;
15-
import org.jabref.logic.util.strings.StringSimilarity;
1615
import org.jabref.model.entry.BibEntry;
1716
import org.jabref.model.entry.field.StandardField;
1817
import org.jabref.model.entry.identifier.DOI;
@@ -32,14 +31,6 @@
3231
public class DoiResolution implements FulltextFetcher {
3332
private static final Logger LOGGER = LoggerFactory.getLogger(DoiResolution.class);
3433

35-
/**
36-
* A DOI leads to the page of the paper. We assume that there is the PDF of the paper.
37-
* Thus, the DOI fetcher is ranked highest (see {@link TrustLevel}).
38-
* Some publishers do not publish the PDF of the paper, but the full proceedings on the page
39-
* of the paper. For these publishers, we need to skip the DOI search.
40-
*/
41-
private final List<String> excludedHosts = Arrays.asList("link.springer.com", "ieeexplore.ieee.org");
42-
4334
@Override
4435
public Optional<URL> findFullText(BibEntry entry) throws IOException {
4536
Objects.requireNonNull(entry);
@@ -67,11 +58,14 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
6758
connection.timeout(10000);
6859

6960
Connection.Response response = connection.execute();
70-
if (excludedHosts.contains(response.url().getHost())) {
71-
return Optional.empty();
72-
}
7361

7462
Document html = response.parse();
63+
// prefer the PDF URL announced by the page's citation_pdf_url meta tag, if present
64+
Optional<URL> citationMetaTag = citationMetaTag(html);
65+
if (citationMetaTag.isPresent()) {
66+
return citationMetaTag;
67+
}
68+
7569
// scan for PDF
7670
Elements hrefElements = html.body().select("a[href]");
7771

@@ -94,11 +88,11 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
9488

9589
// return if only one link was found (high accuracy)
9690
if (links.size() == 1) {
97-
LOGGER.info("Fulltext PDF found @ " + doiLink);
91+
LOGGER.info("Fulltext PDF found @ {}", doiLink);
9892
return Optional.of(links.get(0));
9993
}
100-
// return if links are similar or multiple links are similar
101-
return findSimilarLinks(links);
94+
// return a link only if all found links are identical
95+
return findDistinctLinks(links);
10296
} catch (UnsupportedMimeTypeException type) {
10397
// this might be the PDF already as we follow redirects
10498
if (type.getMimeType().startsWith("application/pdf")) {
@@ -112,7 +106,25 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
112106
return Optional.empty();
113107
}
114108

115-
private Optional<URL> findSimilarLinks(List<URL> urls) {
109+
/**
110+
* Scans the document head for the {@code <meta name="citation_pdf_url">} tag and returns its content as a URL if it is well-formed.
111+
* See https://scholar.google.com/intl/de/scholar/inclusion.html#indexing
112+
*/
113+
private Optional<URL> citationMetaTag(Document html) {
114+
Elements citationPdfUrlElement = html.head().select("meta[name='citation_pdf_url']");
115+
Optional<String> citationPdfUrl = citationPdfUrlElement.stream().map(e -> e.attr("content")).findFirst();
116+
117+
if (citationPdfUrl.isPresent()) {
118+
try {
119+
return Optional.of(new URL(citationPdfUrl.get()));
120+
} catch (MalformedURLException e) {
121+
return Optional.empty();
122+
}
123+
}
124+
return Optional.empty();
125+
}
126+
127+
private Optional<URL> findDistinctLinks(List<URL> urls) {
116128
List<URL> distinctLinks = urls.stream().distinct().collect(Collectors.toList());
117129

118130
if (distinctLinks.isEmpty()) {
@@ -122,13 +134,6 @@ private Optional<URL> findSimilarLinks(List<URL> urls) {
122134
if (distinctLinks.size() == 1) {
123135
return Optional.of(distinctLinks.get(0));
124136
}
125-
// similar
126-
final String firstElement = distinctLinks.get(0).toString();
127-
StringSimilarity similarity = new StringSimilarity();
128-
List<URL> similarLinks = distinctLinks.stream().filter(elem -> similarity.isSimilar(firstElement, elem.toString())).collect(Collectors.toList());
129-
if (similarLinks.size() == distinctLinks.size()) {
130-
return Optional.of(similarLinks.get(0));
131-
}
132137

133138
return Optional.empty();
134139
}

src/test/java/org/jabref/logic/importer/fetcher/DoiResolutionTest.java

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,14 @@ void linkWithPdfInTitleTag() throws IOException {
3838
@Test
3939
void linkWithPdfStringLeadsToFulltext() throws IOException {
4040
entry.setField(StandardField.DOI, "10.1002/acr2.11101");
41-
assertEquals(Optional.of(new URL("https://onlinelibrary.wiley.com/doi/epdf/10.1002/acr2.11101")), finder.findFullText(entry));
41+
assertEquals(Optional.of(new URL("https://onlinelibrary.wiley.com/doi/pdf/10.1002/acr2.11101")), finder.findFullText(entry));
4242
}
4343

44+
4445
@Test
45-
void multipleLinksWithSmallEditDistanceLeadToFulltext() throws IOException {
46-
entry.setField(StandardField.DOI, "10.1002/acr2.11101");
47-
assertEquals(Optional.of(new URL("https://onlinelibrary.wiley.com/doi/epdf/10.1002/acr2.11101")), finder.findFullText(entry));
46+
void citationMetaTagLeadsToFulltext() throws IOException {
47+
entry.setField(StandardField.DOI, "10.1007/978-3-319-89963-3_28");
48+
assertEquals(Optional.of(new URL("https://link.springer.com/content/pdf/10.1007%2F978-3-319-89963-3_28.pdf")), finder.findFullText(entry));
4849
}
4950

5051
@Test
@@ -53,18 +54,6 @@ void notReturnAnythingWhenMultipleLinksAreFound() throws IOException {
5354
assertEquals(Optional.empty(), finder.findFullText(entry));
5455
}
5556

56-
@Test
57-
void notReturnAnythingWhenDOILeadsToSpringerLink() throws IOException {
58-
entry.setField(StandardField.DOI, "https://doi.org/10.1007/978-3-319-89963-3_28");
59-
assertEquals(Optional.empty(), finder.findFullText(entry));
60-
}
61-
62-
@Test
63-
void notReturnAnythingWhenDOILeadsToIEEE() throws IOException {
64-
entry.setField(StandardField.DOI, "https://doi.org/10.1109/TTS.2020.2992669");
65-
assertEquals(Optional.empty(), finder.findFullText(entry));
66-
}
67-
6857
@Test
6958
void notFoundByDOI() throws IOException {
7059
entry.setField(StandardField.DOI, "10.1186/unknown-doi");

0 commit comments

Comments
 (0)