Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 83 additions & 55 deletions src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package org.jabref.logic.importer.fetcher;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;

import org.jabref.logic.importer.FulltextFetcher;
import org.jabref.logic.net.URLDownload;
Expand All @@ -21,78 +23,102 @@
import kong.unirest.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* FulltextFetcher implementation that attempts to find a PDF URL at ScienceDirect. See <a href="https://dev.elsevier.com/">https://dev.elsevier.com/</a>
* FulltextFetcher implementation that attempts to find a PDF URL at <a href="https://www.sciencedirect.com/">ScienceDirect</a>.
* See <a href="https://dev.elsevier.com/">https://dev.elsevier.com/</a>.
*/
public class ScienceDirect implements FulltextFetcher {
private static final Logger LOGGER = LoggerFactory.getLogger(ScienceDirect.class);

private static final String API_URL = "http://api.elsevier.com/content/article/doi/";
private static final String API_URL = "https://api.elsevier.com/content/article/doi/";
private static final String API_KEY = new BuildInfo().scienceDirectApiKey;

@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException {
Objects.requireNonNull(entry);

// Try unique DOI first
Optional<DOI> doi = entry.getField(StandardField.DOI).flatMap(DOI::parse);
if (doi.isEmpty()) {
// Full text fetching works only if a DOI is present
return Optional.empty();
}

if (doi.isPresent()) {
// Available in catalog?
try {
String sciLink = getUrlByDoi(doi.get().getDOI());

// scrape the web page not as mobile client!
if (!sciLink.isEmpty()) {
Document html = Jsoup.connect(sciLink)
.userAgent(URLDownload.USER_AGENT)
.referrer("http://www.google.com")
.ignoreHttpErrors(true).get();

// Retrieve PDF link from meta data (most recent)
Elements metaLinks = html.getElementsByAttributeValue("name", "citation_pdf_url");

if (!metaLinks.isEmpty()) {
String link = metaLinks.first().attr("content");
return Optional.of(new URL(link));
}

URL url = new URL(sciLink);
String protocol = url.getProtocol();
String authority = url.getAuthority();

Optional<String> fullLinkToPdf = html
.getElementsByAttributeValue("type", "application/json")
.stream()
.flatMap(element -> element.getElementsByTag("script").stream())
// get the text element
.map(element -> element.childNode(0))
.map(element -> element.toString())
.map(text -> new JSONObject(text))
.filter(json -> json.has("article"))
.map(json -> json.getJSONObject("article"))
.filter(json -> json.has("pdfDownload"))
.map(json -> json.getJSONObject("pdfDownload"))
.filter(json -> json.has("linkToPdf"))
.map(json -> json.getString("linkToPdf"))
.map(linkToPdf -> String.format("%s://%s%s", protocol, authority, linkToPdf))
.findAny();
if (fullLinkToPdf.isPresent()) {
LOGGER.info("Fulltext PDF found at ScienceDirect.");
// new URL may throw "MalformedURLException", thus using "isPresent()" above and ".get()"
Optional<URL> pdfLink = Optional.of(new URL(fullLinkToPdf.get()));
return pdfLink;
}
}
} catch (UnirestException e) {
LOGGER.warn("ScienceDirect API request failed", e);
}
String urlFromDoi = getUrlByDoi(doi.get().getDOI());
if (urlFromDoi.isEmpty()) {
return Optional.empty();
}
// Scrape the web page as a desktop client (not as a mobile client!)
Document html = Jsoup.connect(urlFromDoi)
.userAgent(URLDownload.USER_AGENT)
.referrer("https://www.google.com")
.ignoreHttpErrors(true)
.get();

// Retrieve PDF link from meta data (most recent)
Elements metaLinks = html.getElementsByAttributeValue("name", "citation_pdf_url");
if (!metaLinks.isEmpty()) {
String link = metaLinks.first().attr("content");
return Optional.of(new URL(link));
}

// We use the ScienceDirect web page which contains the article (presented using HTML).
// This page contains the link to the PDF in some JavaScript code embedded in the web page.
// Example page: https://www.sciencedirect.com/science/article/pii/S1674775515001079

Optional<JSONObject> pdfDownloadOptional = html
.getElementsByAttributeValue("type", "application/json")
.stream()
.flatMap(element -> element.getElementsByTag("script").stream())
// The first DOM child of the script element is the script itself (represented as HTML text)
.map(element -> element.childNode(0))
.map(Node::toString)
.map(JSONObject::new)
.filter(json -> json.has("article"))
.map(json -> json.getJSONObject("article"))
.filter(json -> json.has("pdfDownload"))
.map(json -> json.getJSONObject("pdfDownload"))
.findAny();

if (pdfDownloadOptional.isEmpty()) {
LOGGER.debug("No 'pdfDownload' key found in JSON information");
return Optional.empty();
}

JSONObject pdfDownload = pdfDownloadOptional.get();

String fullLinkToPdf;
if (pdfDownload.has("linkToPdf")) {
String linkToPdf = pdfDownload.getString("linkToPdf");
URL url = new URL(urlFromDoi);
fullLinkToPdf = String.format("%s://%s%s", url.getProtocol(), url.getAuthority(), linkToPdf);
} else if (pdfDownload.has("urlMetadata")) {
JSONObject urlMetadata = pdfDownload.getJSONObject("urlMetadata");
JSONObject queryParamsObject = urlMetadata.getJSONObject("queryParams");
String queryParameters = queryParamsObject.keySet().stream()
.map(key -> String.format("%s=%s", key, queryParamsObject.getString(key)))
.collect(Collectors.joining("&"));
fullLinkToPdf = String.format("https://www.sciencedirect.com/%s/%s%s?%s",
urlMetadata.getString("path"),
urlMetadata.getString("pii"),
urlMetadata.getString("pdfExtension"),
queryParameters);
} else {
LOGGER.debug("No suitable data in JSON information");
return Optional.empty();
}

LOGGER.info("Fulltext PDF found at ScienceDirect at {}.", fullLinkToPdf);
try {
return Optional.of(new URL(fullLinkToPdf));
} catch (MalformedURLException e) {
LOGGER.error("malformed URL", e);
return Optional.empty();
}
return Optional.empty();
}

@Override
Expand All @@ -110,7 +136,9 @@ private String getUrlByDoi(String doi) throws UnirestException {
.asJson();

JSONObject json = jsonResponse.getBody().getObject();
JSONArray links = json.getJSONObject("full-text-retrieval-response").getJSONObject("coredata").getJSONArray("link");
JSONArray links = json.getJSONObject("full-text-retrieval-response")
.getJSONObject("coredata")
.getJSONArray("link");

for (int i = 0; i < links.length(); i++) {
JSONObject link = links.getJSONObject(i);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ void setUp() {

@Test
@DisabledOnCIServer("CI server is blocked")
void findByDOIOldPage() throws IOException {
void findByDoiOldPage() throws IOException {
entry.setField(StandardField.DOI, "10.1016/j.jrmge.2015.08.004");

assertEquals(
Expand All @@ -39,7 +39,7 @@ void findByDOIOldPage() throws IOException {

@Test
@DisabledOnCIServer("CI server is blocked")
void findByDOINewPage() throws IOException {
void findByDoiNewPage() throws IOException {
entry.setField(StandardField.DOI, "10.1016/j.aasri.2014.09.002");

assertEquals(
Expand All @@ -62,7 +62,7 @@ void findByDoiWorksForBoneArticle() throws IOException {

@Test
@DisabledOnCIServer("CI server is blocked")
void notFoundByDOI() throws IOException {
void notFoundByDoi() throws IOException {
entry.setField(StandardField.DOI, "10.1016/j.aasri.2014.0559.002");

assertEquals(Optional.empty(), finder.findFullText(entry));
Expand Down