Improve DOIResolution

stefan-kolb · stefan-kolb · commit 9682a6e98bb8 · 2021-01-10T01:18:23.000+01:00
- remove similarity-based matching of links
- add citation meta tag <meta name="citation_pdf_url">
diff --git a/src/main/java/org/jabref/logic/importer/fetcher/DoiResolution.java b/src/main/java/org/jabref/logic/importer/fetcher/DoiResolution.java
@@ -1,9 +1,9 @@
 package org.jabref.logic.importer.fetcher;
 
 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 import java.util.Locale;
 import java.util.Objects;
@@ -12,7 +12,6 @@
 
 import org.jabref.logic.importer.FulltextFetcher;
 import org.jabref.logic.net.URLDownload;
-import org.jabref.logic.util.strings.StringSimilarity;
 import org.jabref.model.entry.BibEntry;
 import org.jabref.model.entry.field.StandardField;
 import org.jabref.model.entry.identifier.DOI;
@@ -32,14 +31,6 @@
 public class DoiResolution implements FulltextFetcher {
     private static final Logger LOGGER = LoggerFactory.getLogger(DoiResolution.class);
 
-    /**
-     * A DOI leads to the page of the paper. We assume that there is the PDF of the paper.
-     * Thus, the DOI fetcher is ranked highest (see {@link TrustLevel}).
-     * Some publishers do not publish the PDF of the paper, but the full proceedings on the page
-     * of the paper. For these publishers, we need to skip the DOI search.
-     */
-    private final List<String> excludedHosts = Arrays.asList("link.springer.com", "ieeexplore.ieee.org");
-
     @Override
     public Optional<URL> findFullText(BibEntry entry) throws IOException {
         Objects.requireNonNull(entry);
@@ -67,11 +58,14 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
             connection.timeout(10000);
 
             Connection.Response response = connection.execute();
-            if (excludedHosts.contains(response.url().getHost())) {
-                return Optional.empty();
-            }
 
             Document html = response.parse();
+            // citation pdf meta tag
+            Optional<URL> citationMetaTag = citationMetaTag(html);
+            if (citationMetaTag.isPresent()) {
+                return citationMetaTag;
+            }
+
             // scan for PDF
             Elements hrefElements = html.body().select("a[href]");
 
@@ -94,11 +88,11 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
 
             // return if only one link was found (high accuracy)
             if (links.size() == 1) {
-                LOGGER.info("Fulltext PDF found @ " + doiLink);
+                LOGGER.info("Fulltext PDF found @ {}", doiLink);
                 return Optional.of(links.get(0));
             }
-            // return if links are similar or multiple links are similar
-            return findSimilarLinks(links);
+            // return if links are equal
+            return findDistinctLinks(links);
         } catch (UnsupportedMimeTypeException type) {
             // this might be the PDF already as we follow redirects
             if (type.getMimeType().startsWith("application/pdf")) {
@@ -112,7 +106,25 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
         return Optional.empty();
     }
 
-    private Optional<URL> findSimilarLinks(List<URL> urls) {
+    /**
+     * Looks for {@code <meta name="citation_pdf_url">} tags in the head of the given document and
+     * returns the first content value that parses as a URL, or an empty Optional if none does.
+     * See https://scholar.google.com/intl/de/scholar/inclusion.html#indexing
+     */
+    private Optional<URL> citationMetaTag(Document html) {
+        // several tags may be present: try each one instead of giving up on the first bad value
+        for (String content : html.head().select("meta[name='citation_pdf_url']").eachAttr("content")) {
+            try {
+                return Optional.of(new URL(content));
+            } catch (MalformedURLException e) {
+                // empty or non-absolute values end up here; log and continue with the next tag
+                LOGGER.debug("Ignoring malformed citation_pdf_url '{}'", content, e);
+            }
+        }
+        return Optional.empty();
+    }
+
+    private Optional<URL> findDistinctLinks(List<URL> urls) {
         List<URL> distinctLinks = urls.stream().distinct().collect(Collectors.toList());
 
         if (distinctLinks.isEmpty()) {
@@ -122,13 +134,6 @@ private Optional<URL> findSimilarLinks(List<URL> urls) {
         if (distinctLinks.size() == 1) {
             return Optional.of(distinctLinks.get(0));
         }
-        // similar
-        final String firstElement = distinctLinks.get(0).toString();
-        StringSimilarity similarity = new StringSimilarity();
-        List<URL> similarLinks = distinctLinks.stream().filter(elem -> similarity.isSimilar(firstElement, elem.toString())).collect(Collectors.toList());
-        if (similarLinks.size() == distinctLinks.size()) {
-            return Optional.of(similarLinks.get(0));
-        }
 
         return Optional.empty();
     }
diff --git a/src/test/java/org/jabref/logic/importer/fetcher/DoiResolutionTest.java b/src/test/java/org/jabref/logic/importer/fetcher/DoiResolutionTest.java
@@ -38,13 +38,14 @@ void linkWithPdfInTitleTag() throws IOException {
     @Test
     void linkWithPdfStringLeadsToFulltext() throws IOException {
         entry.setField(StandardField.DOI, "10.1002/acr2.11101");
-        assertEquals(Optional.of(new URL("https://onlinelibrary.wiley.com/doi/epdf/10.1002/acr2.11101")), finder.findFullText(entry));
+        assertEquals(Optional.of(new URL("https://onlinelibrary.wiley.com/doi/pdf/10.1002/acr2.11101")), finder.findFullText(entry));
     }
 
+
     @Test
-    void multipleLinksWithSmallEditDistanceLeadToFulltext() throws IOException {
-        entry.setField(StandardField.DOI, "10.1002/acr2.11101");
-        assertEquals(Optional.of(new URL("https://onlinelibrary.wiley.com/doi/epdf/10.1002/acr2.11101")), finder.findFullText(entry));
+    void citationMetaTagLeadsToFulltext() throws IOException {
+        entry.setField(StandardField.DOI, "10.1007/978-3-319-89963-3_28");
+        assertEquals(Optional.of(new URL("https://link.springer.com/content/pdf/10.1007%2F978-3-319-89963-3_28.pdf")), finder.findFullText(entry));
     }
 
     @Test
@@ -53,18 +54,6 @@ void notReturnAnythingWhenMultipleLinksAreFound() throws IOException {
         assertEquals(Optional.empty(), finder.findFullText(entry));
     }
 
-    @Test
-    void notReturnAnythingWhenDOILeadsToSpringerLink() throws IOException {
-        entry.setField(StandardField.DOI, "https://doi.org/10.1007/978-3-319-89963-3_28");
-        assertEquals(Optional.empty(), finder.findFullText(entry));
-    }
-
-    @Test
-    void notReturnAnythingWhenDOILeadsToIEEE() throws IOException {
-        entry.setField(StandardField.DOI, "https://doi.org/10.1109/TTS.2020.2992669");
-        assertEquals(Optional.empty(), finder.findFullText(entry));
-    }
-
     @Test
     void notFoundByDOI() throws IOException {
         entry.setField(StandardField.DOI, "10.1186/unknown-doi");