11package org .jabref .logic .importer .fetcher ;
22
33import java .io .IOException ;
4+ import java .net .MalformedURLException ;
45import java .net .URL ;
56import java .util .ArrayList ;
6- import java .util .Arrays ;
77import java .util .List ;
88import java .util .Locale ;
99import java .util .Objects ;
1212
1313import org .jabref .logic .importer .FulltextFetcher ;
1414import org .jabref .logic .net .URLDownload ;
15- import org .jabref .logic .util .strings .StringSimilarity ;
1615import org .jabref .model .entry .BibEntry ;
1716import org .jabref .model .entry .field .StandardField ;
1817import org .jabref .model .entry .identifier .DOI ;
3231public class DoiResolution implements FulltextFetcher {
3332 private static final Logger LOGGER = LoggerFactory .getLogger (DoiResolution .class );
3433
35- /**
36- * A DOI leads to the page of the paper. We assume that there is the PDF of the paper.
37- * Thus, the DOI fetcher is ranked highest (see {@link TrustLevel}).
38- * Some publishers do not publish the PDF of the paper, but the full proceedings on the page
39- * of the paper. For these publishers, we need to skip the DOI search.
40- */
41- private final List <String > excludedHosts = Arrays .asList ("link.springer.com" , "ieeexplore.ieee.org" );
42-
4334 @ Override
4435 public Optional <URL > findFullText (BibEntry entry ) throws IOException {
4536 Objects .requireNonNull (entry );
@@ -67,11 +58,14 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
6758 connection .timeout (10000 );
6859
6960 Connection .Response response = connection .execute ();
70- if (excludedHosts .contains (response .url ().getHost ())) {
71- return Optional .empty ();
72- }
7361
7462 Document html = response .parse ();
63+ // citation pdf meta tag
64+ Optional <URL > citationMetaTag = citationMetaTag (html );
65+ if (citationMetaTag .isPresent ()) {
66+ return citationMetaTag ;
67+ }
68+
7569 // scan for PDF
7670 Elements hrefElements = html .body ().select ("a[href]" );
7771
@@ -94,11 +88,11 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
9488
9589 // return if only one link was found (high accuracy)
9690 if (links .size () == 1 ) {
97- LOGGER .info ("Fulltext PDF found @ " + doiLink );
91+ LOGGER .info ("Fulltext PDF found @ {}" , doiLink );
9892 return Optional .of (links .get (0 ));
9993 }
100- // return if links are similar or multiple links are similar
101- return findSimilarLinks (links );
94+ // return if links are equal
95+ return findDistinctLinks (links );
10296 } catch (UnsupportedMimeTypeException type ) {
10397 // this might be the PDF already as we follow redirects
10498 if (type .getMimeType ().startsWith ("application/pdf" )) {
@@ -112,7 +106,25 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
112106 return Optional .empty ();
113107 }
114108
115- private Optional <URL > findSimilarLinks (List <URL > urls ) {
109+ /**
110+ * Scan for <meta name="citation_pdf_url">
111+ * See https://scholar.google.com/intl/de/scholar/inclusion.html#indexing
112+ */
113+ private Optional <URL > citationMetaTag (Document html ) {
114+ Elements citationPdfUrlElement = html .head ().select ("meta[name='citation_pdf_url']" );
115+ Optional <String > citationPdfUrl = citationPdfUrlElement .stream ().map (e -> e .attr ("content" )).findFirst ();
116+
117+ if (citationPdfUrl .isPresent ()) {
118+ try {
119+ return Optional .of (new URL (citationPdfUrl .get ()));
120+ } catch (MalformedURLException e ) {
121+ return Optional .empty ();
122+ }
123+ }
124+ return Optional .empty ();
125+ }
126+
127+ private Optional <URL > findDistinctLinks (List <URL > urls ) {
116128 List <URL > distinctLinks = urls .stream ().distinct ().collect (Collectors .toList ());
117129
118130 if (distinctLinks .isEmpty ()) {
@@ -122,13 +134,6 @@ private Optional<URL> findSimilarLinks(List<URL> urls) {
122134 if (distinctLinks .size () == 1 ) {
123135 return Optional .of (distinctLinks .get (0 ));
124136 }
125- // similar
126- final String firstElement = distinctLinks .get (0 ).toString ();
127- StringSimilarity similarity = new StringSimilarity ();
128- List <URL > similarLinks = distinctLinks .stream ().filter (elem -> similarity .isSimilar (firstElement , elem .toString ())).collect (Collectors .toList ());
129- if (similarLinks .size () == distinctLinks .size ()) {
130- return Optional .of (similarLinks .get (0 ));
131- }
132137
133138 return Optional .empty ();
134139 }
0 commit comments