// Patch summary (text placed before the first "diff --git" header; patch tools are expected to skip it).
//
// Files touched: ScienceDirect.java (fetcher) and ScienceDirectTest.java (tests).
//
// ScienceDirect.findFullText, as rewritten by the "+" lines:
//   * returns Optional.empty() early when the entry has no parsable DOI, or when getUrlByDoi
//     returns an empty string (guard clauses replace the former nested if/try block);
//   * scrapes the article page with Jsoup and returns the "citation_pdf_url" meta link when present;
//   * otherwise looks for the "article" -> "pdfDownload" object in embedded application/json script
//     data and builds the PDF link either from "linkToPdf" (prefixed with protocol and authority of
//     the article URL) or from "urlMetadata" ("path", "pii", "pdfExtension" plus "queryParams"
//     joined as key=value pairs with "&");
//   * catches MalformedURLException for the final link, logs it and returns Optional.empty().
// Other changes: API_URL and the referrer switch from http to https, the "link" lookup chain in
// getUrlByDoi is re-wrapped, and four test methods are renamed from ...DOI... to ...Doi....
//
// NOTE(review): the removed lines caught UnirestException around the lookup and logged a warning;
//   the added lines contain no such catch while getUrlByDoi is still declared
//   "throws UnirestException". Presumably that exception is unchecked and will now propagate out
//   of findFullText -- confirm this is intended.
// NOTE(review): query parameter values are inserted into the URL without URL-encoding -- confirm
//   the values delivered by the page are always URL-safe.
// NOTE(review): element.childNode(0) and JSONObject::new run for every matching script element
//   before any filter; behaviour for an empty or non-JSON script is not visible here -- TODO confirm.
// NOTE(review): line breaks inside the hunks appear to be lost and generic type arguments seem to
//   be missing (e.g. "Optional findFullText"); presumably an extraction artefact -- verify against
//   the original commit before trying to apply this patch.
diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java b/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java index dd8868c3082..3ebda1c0bbd 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java @@ -1,9 +1,11 @@ package org.jabref.logic.importer.fetcher; import java.io.IOException; +import java.net.MalformedURLException; import java.net.URL; import java.util.Objects; import java.util.Optional; +import java.util.stream.Collectors; import org.jabref.logic.importer.FulltextFetcher; import org.jabref.logic.net.URLDownload; @@ -21,78 +23,102 @@ import kong.unirest.json.JSONObject; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Node; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * FulltextFetcher implementation that attempts to find a PDF URL at ScienceDirect. See https://dev.elsevier.com/ + * FulltextFetcher implementation that attempts to find a PDF URL at ScienceDirect. + * See https://dev.elsevier.com/. */ public class ScienceDirect implements FulltextFetcher { private static final Logger LOGGER = LoggerFactory.getLogger(ScienceDirect.class); - private static final String API_URL = "http://api.elsevier.com/content/article/doi/"; + private static final String API_URL = "https://api.elsevier.com/content/article/doi/"; private static final String API_KEY = new BuildInfo().scienceDirectApiKey; @Override public Optional findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); - // Try unique DOI first Optional doi = entry.getField(StandardField.DOI).flatMap(DOI::parse); + if (doi.isEmpty()) { + // Full text fetching works only if a DOI is present + return Optional.empty(); + } - if (doi.isPresent()) { - // Available in catalog? 
- try { - String sciLink = getUrlByDoi(doi.get().getDOI()); - - // scrape the web page not as mobile client! - if (!sciLink.isEmpty()) { - Document html = Jsoup.connect(sciLink) - .userAgent(URLDownload.USER_AGENT) - .referrer("http://www.google.com") - .ignoreHttpErrors(true).get(); - - // Retrieve PDF link from meta data (most recent) - Elements metaLinks = html.getElementsByAttributeValue("name", "citation_pdf_url"); - - if (!metaLinks.isEmpty()) { - String link = metaLinks.first().attr("content"); - return Optional.of(new URL(link)); - } - - URL url = new URL(sciLink); - String protocol = url.getProtocol(); - String authority = url.getAuthority(); - - Optional fullLinkToPdf = html - .getElementsByAttributeValue("type", "application/json") - .stream() - .flatMap(element -> element.getElementsByTag("script").stream()) - // get the text element - .map(element -> element.childNode(0)) - .map(element -> element.toString()) - .map(text -> new JSONObject(text)) - .filter(json -> json.has("article")) - .map(json -> json.getJSONObject("article")) - .filter(json -> json.has("pdfDownload")) - .map(json -> json.getJSONObject("pdfDownload")) - .filter(json -> json.has("linkToPdf")) - .map(json -> json.getString("linkToPdf")) - .map(linkToPdf -> String.format("%s://%s%s", protocol, authority, linkToPdf)) - .findAny(); - if (fullLinkToPdf.isPresent()) { - LOGGER.info("Fulltext PDF found at ScienceDirect."); - // new URL may through "MalformedURLException", thus using "isPresent()" above and ".get()" - Optional pdfLink = Optional.of(new URL(fullLinkToPdf.get())); - return pdfLink; - } - } - } catch (UnirestException e) { - LOGGER.warn("ScienceDirect API request failed", e); - } + String urlFromDoi = getUrlByDoi(doi.get().getDOI()); + if (urlFromDoi.isEmpty()) { + return Optional.empty(); + } + // Scrape the web page as desktop client (not as mobile client!) 
+ Document html = Jsoup.connect(urlFromDoi) + .userAgent(URLDownload.USER_AGENT) + .referrer("https://www.google.com") + .ignoreHttpErrors(true) + .get(); + + // Retrieve PDF link from meta data (most recent) + Elements metaLinks = html.getElementsByAttributeValue("name", "citation_pdf_url"); + if (!metaLinks.isEmpty()) { + String link = metaLinks.first().attr("content"); + return Optional.of(new URL(link)); + } + + // We use the ScienceDirect web page which contains the article (presented using HTML). + // This page contains the link to the PDF in some JavaScript code embedded in the web page. + // Example page: https://www.sciencedirect.com/science/article/pii/S1674775515001079 + + Optional pdfDownloadOptional = html + .getElementsByAttributeValue("type", "application/json") + .stream() + .flatMap(element -> element.getElementsByTag("script").stream()) + // The first DOM child of the script element is the script itself (represented as HTML text) + .map(element -> element.childNode(0)) + .map(Node::toString) + .map(JSONObject::new) + .filter(json -> json.has("article")) + .map(json -> json.getJSONObject("article")) + .filter(json -> json.has("pdfDownload")) + .map(json -> json.getJSONObject("pdfDownload")) + .findAny(); + + if (pdfDownloadOptional.isEmpty()) { + LOGGER.debug("No 'pdfDownload' key found in JSON information"); + return Optional.empty(); + } + + JSONObject pdfDownload = pdfDownloadOptional.get(); + + String fullLinkToPdf; + if (pdfDownload.has("linkToPdf")) { + String linkToPdf = pdfDownload.getString("linkToPdf"); + URL url = new URL(urlFromDoi); + fullLinkToPdf = String.format("%s://%s%s", url.getProtocol(), url.getAuthority(), linkToPdf); + } else if (pdfDownload.has("urlMetadata")) { + JSONObject urlMetadata = pdfDownload.getJSONObject("urlMetadata"); + JSONObject queryParamsObject = urlMetadata.getJSONObject("queryParams"); + String queryParameters = queryParamsObject.keySet().stream() + .map(key -> String.format("%s=%s", key, 
queryParamsObject.getString(key))) + .collect(Collectors.joining("&")); + fullLinkToPdf = String.format("https://www.sciencedirect.com/%s/%s%s?%s", + urlMetadata.getString("path"), + urlMetadata.getString("pii"), + urlMetadata.getString("pdfExtension"), + queryParameters); + } else { + LOGGER.debug("No suitable data in JSON information"); + return Optional.empty(); + } + + LOGGER.info("Fulltext PDF found at ScienceDirect at {}.", fullLinkToPdf); + try { + return Optional.of(new URL(fullLinkToPdf)); + } catch (MalformedURLException e) { + LOGGER.error("malformed URL", e); + return Optional.empty(); } - return Optional.empty(); } @Override @@ -110,7 +136,9 @@ private String getUrlByDoi(String doi) throws UnirestException { .asJson(); JSONObject json = jsonResponse.getBody().getObject(); - JSONArray links = json.getJSONObject("full-text-retrieval-response").getJSONObject("coredata").getJSONArray("link"); + JSONArray links = json.getJSONObject("full-text-retrieval-response") + .getJSONObject("coredata") + .getJSONArray("link"); for (int i = 0; i < links.length(); i++) { JSONObject link = links.getJSONObject(i); diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java index 95bcf8b6358..7c955863f24 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java @@ -28,7 +28,7 @@ void setUp() { @Test @DisabledOnCIServer("CI server is blocked") - void findByDOIOldPage() throws IOException { + void findByDoiOldPage() throws IOException { entry.setField(StandardField.DOI, "10.1016/j.jrmge.2015.08.004"); assertEquals( @@ -39,7 +39,7 @@ void findByDOIOldPage() throws IOException { @Test @DisabledOnCIServer("CI server is blocked") - void findByDOINewPage() throws IOException { + void findByDoiNewPage() throws IOException { entry.setField(StandardField.DOI, 
"10.1016/j.aasri.2014.09.002"); assertEquals( @@ -62,7 +62,7 @@ void findByDoiWorksForBoneArticle() throws IOException { @Test @DisabledOnCIServer("CI server is blocked") - void notFoundByDOI() throws IOException { + void notFoundByDoi() throws IOException { entry.setField(StandardField.DOI, "10.1016/j.aasri.2014.0559.002"); assertEquals(Optional.empty(), finder.findFullText(entry));