Skip to content

Commit 5ac788c

Browse files
Siedlerchrsubhramitkoppor
authored
fix arxiv html download redirect (#11797)
* fix arxiv html download redirect Fixes #4913 * Fix catch indents * Add redirect test case * Use Optionals * Class-global Unirest config * Manual handling of redirect * Fix conditions * Simplyfiy code * Use Wiremock instead of real endpoint * Improve test names * Fix condition * fix condition * wiremock with head request as well * use path insteadd of file * try with static unirest config * max retries * fix head * no body for head --------- Co-authored-by: Subhramit Basu Bhowmick <[email protected]> Co-authored-by: Oliver Kopp <[email protected]>
1 parent 9eed666 commit 5ac788c

File tree

6 files changed

+135
-51
lines changed

6 files changed

+135
-51
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv
7373
- We fixed an exception when searching for unlinked files. [#11731](https://github.com/JabRef/jabref/issues/11731)
7474
- We fixed an issue where two contradicting notifications were shown when cutting an entry in the main table. [#11724](https://github.com/JabRef/jabref/pull/11724)
7575
- We fixed an issue where unescaped braces in the arXiv fetcher were not treated. [#11704](https://github.com/JabRef/jabref/issues/11704)
76+
- We fixed an issue where HTML instead of the fulltext pdf was downloaded when importing arXiv entries. [#4913](https://github.com/JabRef/jabref/issues/4913)
7677
- We fixed an issue where the keywords and crossref fields were not properly focused. [#11177](https://github.com/JabRef/jabref/issues/11177)
7778

7879
### Removed

build.gradle

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,9 @@ dependencies {
368368
testImplementation "org.testfx:testfx-junit5:4.0.16-alpha"
369369
testImplementation "org.hamcrest:hamcrest-library:3.0"
370370

371+
// recommended by https://github.com/wiremock/wiremock/issues/2149#issuecomment-1835775954
372+
testImplementation 'org.wiremock:wiremock-standalone:3.3.1'
373+
371374
checkstyle 'com.puppycrawl.tools:checkstyle:10.18.1'
372375
// xjc needs the runtime as well for the ant task, otherwise it fails
373376
xjc group: 'org.glassfish.jaxb', name: 'jaxb-xjc', version: '3.0.2'

src/main/java/org/jabref/gui/linkedfile/DownloadLinkedFileAction.java

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -280,14 +280,11 @@ private Optional<ExternalFileType> inferFileType(URLDownload urlDownload) {
280280
}
281281

282282
private Optional<ExternalFileType> inferFileTypeFromMimeType(URLDownload urlDownload) {
283-
String mimeType = urlDownload.getMimeType();
284-
285-
if (mimeType != null) {
286-
LOGGER.debug("MIME Type suggested: {}", mimeType);
287-
return ExternalFileTypes.getExternalFileTypeByMimeType(mimeType, externalApplicationsPreferences);
288-
} else {
289-
return Optional.empty();
290-
}
283+
return urlDownload.getMimeType()
284+
.flatMap(mimeType -> {
285+
LOGGER.debug("MIME Type suggested: {}", mimeType);
286+
return ExternalFileTypes.getExternalFileTypeByMimeType(mimeType, externalApplicationsPreferences);
287+
});
291288
}
292289

293290
private Optional<ExternalFileType> inferFileTypeFromURL(String url) {

src/main/java/org/jabref/logic/net/URLDownload.java

Lines changed: 42 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import java.util.List;
3030
import java.util.Map;
3131
import java.util.Map.Entry;
32+
import java.util.Optional;
3233

3334
import javax.net.ssl.HostnameVerifier;
3435
import javax.net.ssl.HttpsURLConnection;
@@ -39,7 +40,9 @@
3940
import org.jabref.logic.importer.FetcherException;
4041
import org.jabref.logic.importer.FetcherServerException;
4142
import org.jabref.logic.util.io.FileUtil;
43+
import org.jabref.model.strings.StringUtil;
4244

45+
import kong.unirest.core.HttpResponse;
4346
import kong.unirest.core.Unirest;
4447
import kong.unirest.core.UnirestException;
4548
import org.slf4j.Logger;
@@ -64,12 +67,20 @@ public class URLDownload {
6467
public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36";
6568
private static final Logger LOGGER = LoggerFactory.getLogger(URLDownload.class);
6669
private static final Duration DEFAULT_CONNECT_TIMEOUT = Duration.ofSeconds(30);
70+
private static final int MAX_RETRIES = 3;
6771

6872
private final URL source;
6973
private final Map<String, String> parameters = new HashMap<>();
7074
private String postData = "";
7175
private Duration connectTimeout = DEFAULT_CONNECT_TIMEOUT;
7276

77+
static {
78+
Unirest.config()
79+
.followRedirects(true)
80+
.enableCookieManagement(true)
81+
.setDefaultHeader("User-Agent", USER_AGENT);
82+
}
83+
7384
/**
7485
* @param source the URL to download from
7586
* @throws MalformedURLException if no protocol is specified in the source, or an unknown protocol is found
@@ -103,15 +114,28 @@ public URL getSource() {
103114
return source;
104115
}
105116

106-
public String getMimeType() {
107-
Unirest.config().setDefaultHeader("User-Agent", "Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6");
108-
117+
public Optional<String> getMimeType() {
109118
String contentType;
119+
120+
int retries = 0;
110121
// Try to use HEAD request to avoid downloading the whole file
111122
try {
112-
contentType = Unirest.head(source.toString()).asString().getHeaders().get("Content-Type").getFirst();
123+
String urlToCheck = source.toString();
124+
String locationHeader;
125+
do {
126+
retries++;
127+
HttpResponse<String> response = Unirest.head(urlToCheck).asString();
128+
// Check if we have redirects, e.g. arxiv will give otherwise content type html for the original url
129+
// We need to do it "manually", because ".followRedirects(true)" only works for GET not for HEAD
130+
locationHeader = response.getHeaders().getFirst("location");
131+
if (!StringUtil.isNullOrEmpty(locationHeader)) {
132+
urlToCheck = locationHeader;
133+
}
134+
// while loop, because there could be multiple redirects
135+
} while (!StringUtil.isNullOrEmpty(locationHeader) && retries <= MAX_RETRIES);
136+
contentType = Unirest.head(urlToCheck).asString().getHeaders().getFirst("Content-Type");
113137
if ((contentType != null) && !contentType.isEmpty()) {
114-
return contentType;
138+
return Optional.of(contentType);
115139
}
116140
} catch (Exception e) {
117141
LOGGER.debug("Error getting MIME type of URL via HEAD request", e);
@@ -120,8 +144,8 @@ public String getMimeType() {
120144
// Use GET request as alternative if no HEAD request is available
121145
try {
122146
contentType = Unirest.get(source.toString()).asString().getHeaders().get("Content-Type").getFirst();
123-
if ((contentType != null) && !contentType.isEmpty()) {
124-
return contentType;
147+
if (!StringUtil.isNullOrEmpty(contentType)) {
148+
return Optional.of(contentType);
125149
}
126150
} catch (Exception e) {
127151
LOGGER.debug("Error getting MIME type of URL via GET request", e);
@@ -130,16 +154,15 @@ public String getMimeType() {
130154
// Try to resolve local URIs
131155
try {
132156
URLConnection connection = new URL(source.toString()).openConnection();
133-
134157
contentType = connection.getContentType();
135-
if ((contentType != null) && !contentType.isEmpty()) {
136-
return contentType;
158+
if (!StringUtil.isNullOrEmpty(contentType)) {
159+
return Optional.of(contentType);
137160
}
138161
} catch (IOException e) {
139162
LOGGER.debug("Error trying to get MIME type of local URI", e);
140163
}
141164

142-
return "";
165+
return Optional.empty();
143166
}
144167

145168
/**
@@ -149,24 +172,13 @@ public String getMimeType() {
149172
* @return the status code of the response
150173
*/
151174
public boolean canBeReached() throws UnirestException {
152-
// new unirest version does not support apache http client any longer
153-
Unirest.config().reset()
154-
.followRedirects(true)
155-
.enableCookieManagement(true)
156-
.setDefaultHeader("User-Agent", USER_AGENT);
157175

158176
int statusCode = Unirest.head(source.toString()).asString().getStatus();
159177
return (statusCode >= 200) && (statusCode < 300);
160178
}
161179

162180
public boolean isMimeType(String type) {
163-
String mime = getMimeType();
164-
165-
if (mime.isEmpty()) {
166-
return false;
167-
}
168-
169-
return mime.startsWith(type);
181+
return getMimeType().map(mimeType -> mimeType.startsWith(type)).orElse(false);
170182
}
171183

172184
public boolean isPdf() {
@@ -333,7 +345,7 @@ private static void copy(InputStream in, Writer out, Charset encoding) throws IO
333345
/**
334346
* Open a connection to this object's URL (with specified settings).
335347
* <p>
336-
* If accessing an HTTP URL, remeber to close the resulting connection after usage.
348+
* If accessing an HTTP URL, remember to close the resulting connection after usage.
337349
*
338350
* @return an open connection
339351
*/
@@ -356,12 +368,14 @@ public URLConnection openConnection() throws FetcherException {
356368
}
357369

358370
if ((status == HttpURLConnection.HTTP_MOVED_TEMP)
359-
|| (status == HttpURLConnection.HTTP_MOVED_PERM)
360-
|| (status == HttpURLConnection.HTTP_SEE_OTHER)) {
371+
|| (status == HttpURLConnection.HTTP_MOVED_PERM)
372+
|| (status == HttpURLConnection.HTTP_SEE_OTHER)) {
361373
// get redirect url from "location" header field
362374
String newUrl = connection.getHeaderField("location");
363375
// open the new connection again
364376
try {
377+
httpURLConnection.disconnect();
378+
// multiple redirects are implemented by this recursion
365379
connection = new URLDownload(newUrl).openConnection();
366380
} catch (MalformedURLException e) {
367381
throw new FetcherException("Could not open URL Download", e);
@@ -370,9 +384,9 @@ public URLConnection openConnection() throws FetcherException {
370384
// in case of an error, propagate the error message
371385
SimpleHttpResponse httpResponse = new SimpleHttpResponse(httpURLConnection);
372386
LOGGER.info("{}", httpResponse);
373-
if ((status >= 400) && (status < 500)) {
387+
if (status < 500) {
374388
throw new FetcherClientException(this.source, httpResponse);
375-
} else if (status >= 500) {
389+
} else {
376390
throw new FetcherServerException(this.source, httpResponse);
377391
}
378392
}

src/test/java/org/jabref/gui/linkedfile/DownloadLinkedFileActionTest.java

Lines changed: 48 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
package org.jabref.gui.linkedfile;
22

3-
import java.io.File;
43
import java.net.CookieHandler;
54
import java.net.CookieManager;
65
import java.net.CookiePolicy;
@@ -24,22 +23,29 @@
2423
import org.jabref.model.entry.BibEntry;
2524
import org.jabref.model.entry.LinkedFile;
2625

26+
import com.github.tomakehurst.wiremock.WireMockServer;
27+
import org.junit.jupiter.api.AfterEach;
2728
import org.junit.jupiter.api.BeforeEach;
2829
import org.junit.jupiter.api.Test;
2930
import org.junit.jupiter.api.io.TempDir;
3031
import org.junit.jupiter.params.ParameterizedTest;
3132
import org.junit.jupiter.params.provider.ValueSource;
3233

34+
import static com.github.tomakehurst.wiremock.client.WireMock.aResponse;
35+
import static com.github.tomakehurst.wiremock.client.WireMock.configureFor;
36+
import static com.github.tomakehurst.wiremock.client.WireMock.get;
37+
import static com.github.tomakehurst.wiremock.client.WireMock.head;
38+
import static com.github.tomakehurst.wiremock.client.WireMock.stubFor;
39+
import static com.github.tomakehurst.wiremock.client.WireMock.urlEqualTo;
3340
import static org.junit.jupiter.api.Assertions.assertEquals;
3441
import static org.mockito.ArgumentMatchers.any;
3542
import static org.mockito.Mockito.mock;
3643
import static org.mockito.Mockito.when;
3744

3845
class DownloadLinkedFileActionTest {
3946

40-
// Required for keepsHtmlEntry
4147
@TempDir
42-
Path tempFolder;
48+
private Path tempFolder;
4349

4450
private BibEntry entry;
4551

@@ -49,6 +55,8 @@ class DownloadLinkedFileActionTest {
4955
private final FilePreferences filePreferences = mock(FilePreferences.class);
5056
private final GuiPreferences preferences = mock(GuiPreferences.class);
5157

58+
private WireMockServer wireMockServer;
59+
5260
@BeforeEach
5361
void setUp(@TempDir Path tempFolder) throws Exception {
5462
entry = new BibEntry()
@@ -70,6 +78,15 @@ void setUp(@TempDir Path tempFolder) throws Exception {
7078
cookieManager = (CookieManager) CookieHandler.getDefault();
7179
}
7280
cookieManager.setCookiePolicy(CookiePolicy.ACCEPT_ALL);
81+
82+
wireMockServer = new WireMockServer(2331);
83+
wireMockServer.start();
84+
configureFor("localhost", 2331);
85+
}
86+
87+
@AfterEach
88+
void tearDown() {
89+
wireMockServer.stop();
7390
}
7491

7592
@Test
@@ -122,10 +139,10 @@ void doesntReplaceSourceURL(boolean keepHtml) throws Exception {
122139

123140
linkedFile = entry.getFiles().getFirst();
124141

125-
File downloadedFile = new File(linkedFile.getLink());
142+
Path downloadedFile = Path.of(linkedFile.getLink());
126143

127144
// Verify that re-downloading the file after the first download doesn't modify the entry
128-
downloadedFile.delete();
145+
Files.delete(downloadedFile);
129146

130147
DownloadLinkedFileAction downloadLinkedFileAction2 = new DownloadLinkedFileAction(
131148
databaseContext,
@@ -144,10 +161,19 @@ void doesntReplaceSourceURL(boolean keepHtml) throws Exception {
144161
}
145162

146163
@Test
147-
void keepsHtmlEntry(@TempDir Path tempFolder) throws Exception {
148-
String url = "https://blog.fefe.de/?ts=98e04151";
149-
150-
LinkedFile linkedFile = new LinkedFile(new URL(url), "");
164+
void keepsHtmlFileLink(@TempDir Path tempFolder) throws Exception {
165+
stubFor(get(urlEqualTo("/html"))
166+
.willReturn(aResponse()
167+
.withStatus(200)
168+
.withHeader("Content-Type", "text/html; charset=utf-8")
169+
.withBody("<html><body><h1>Hi</h1></body></html>")));
170+
171+
stubFor(head(urlEqualTo("/html"))
172+
.willReturn(aResponse()
173+
.withStatus(200)
174+
.withHeader("Content-Type", "text/html; charset=utf-8")));
175+
176+
LinkedFile linkedFile = new LinkedFile(new URL("http://localhost:2331/html"), "");
151177
when(databaseContext.getFirstExistingFileDir(any())).thenReturn(Optional.of(tempFolder));
152178
when(filePreferences.getFileNamePattern()).thenReturn("[citationkey]");
153179
when(filePreferences.getFileDirectoryPattern()).thenReturn("");
@@ -171,10 +197,19 @@ void keepsHtmlEntry(@TempDir Path tempFolder) throws Exception {
171197
}
172198

173199
@Test
174-
void removesHtmlEntry(@TempDir Path tempFolder) throws Exception {
175-
String url = "https://blog.fefe.de/?ts=98e04151";
176-
177-
LinkedFile linkedFile = new LinkedFile(new URL(url), "");
200+
void removesHtmlFileLink(@TempDir Path tempFolder) throws Exception {
201+
stubFor(get(urlEqualTo("/html"))
202+
.willReturn(aResponse()
203+
.withStatus(200)
204+
.withHeader("Content-Type", "text/html; charset=utf-8")
205+
.withBody("<html><body><h1>Hi</h1></body></html>")));
206+
207+
stubFor(head(urlEqualTo("/html"))
208+
.willReturn(aResponse()
209+
.withStatus(200)
210+
.withHeader("Content-Type", "text/html; charset=utf-8")));
211+
212+
LinkedFile linkedFile = new LinkedFile(new URL("http://localhost:2331/html"), "");
178213
when(databaseContext.getFirstExistingFileDir(any())).thenReturn(Optional.of(tempFolder));
179214
when(filePreferences.getFileNamePattern()).thenReturn("[citationkey]");
180215
when(filePreferences.getFileDirectoryPattern()).thenReturn("");

0 commit comments

Comments
 (0)