Skip to content

Commit a1027de

Browse files
committed
Change format for study definition to yamlm and adapt tests
Make long queries safe Signed-off-by: Dominik Voigt <[email protected]>
1 parent b19c3e4 commit a1027de

File tree

18 files changed

+487
-369
lines changed

18 files changed

+487
-369
lines changed

build.gradle

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,9 @@ dependencies {
141141

142142
implementation group: 'org.eclipse.jgit', name: 'org.eclipse.jgit', version: '5.9.0.202009080501-r'
143143

144+
implementation group: 'com.fasterxml.jackson.dataformat', name: 'jackson-dataformat-yaml', version: '2.12.0-rc2'
145+
implementation group: 'com.fasterxml.jackson.datatype', name: 'jackson-datatype-jsr310', version: '2.12.0-rc2'
146+
144147
implementation group: 'org.mariadb.jdbc', name: 'mariadb-java-client', version: '2.7.0'
145148

146149
implementation 'org.postgresql:postgresql:42.2.18'
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Use Jackson to parse study.yml
2+
3+
## Context and Problem Statement
4+
5+
The study definition file is formulated as a YAML document.
6+
To accessed the definition within JabRef this document has to be parsed.
7+
What parser should be used to parse YAML files?
8+
9+
## Considered Options
10+
11+
* Jackson
12+
* SnakeYAML Engine
13+
* yamlbeans
14+
* eo-yaml
15+
* Own parser
16+
17+
## Decision Outcome
18+
19+
Chosen option: Jackson, because as it is a dedicated library for parsing YAML. yamlbeans also seem to be viable. They all offer similar functionality
20+
21+
## Pros and Cons of the Options
22+
23+
### Jackson
24+
25+
* Good, because established YAML parser library
26+
* Good, because supports YAML 1.2
27+
* Good, because it can parse LocalDate
28+
29+
### SnakeYAML Engine
30+
31+
* Good, because established YAML parser library
32+
* Good, because supports YAML 1.2
33+
* Bad, because cannot parse YAML into Java DTOs
34+
35+
### yamlbeans
36+
37+
* Good, because established YAML parser library
38+
* Good, because [nice getting started page](https://github.com/EsotericSoftware/yamlbeans)
39+
* Bad, because objects need to be annotated in the yaml file to be parsed into Java objects
40+
41+
### eo-yaml
42+
43+
* Good, because established YAML parser library
44+
* Good, because supports YAML 1.2
45+
* Bad, because cannot parse YAML into Java DTOs?
46+
47+
### Own parser
48+
49+
* Good, because easily customizable
50+
* Bad, because high effort
51+
* Bad, because has to be tested extensively

src/main/java/module-info.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,4 +92,7 @@
9292
requires lucene.queryparser;
9393
requires lucene.core;
9494
requires org.eclipse.jgit;
95+
requires com.fasterxml.jackson.databind;
96+
requires com.fasterxml.jackson.dataformat.yaml;
97+
requires com.fasterxml.jackson.datatype.jsr310;
9598
}

src/main/java/org/jabref/logic/crawler/Crawler.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import org.jabref.logic.importer.ParseException;
1111
import org.jabref.model.entry.BibEntryTypesManager;
1212
import org.jabref.model.study.QueryResult;
13-
import org.jabref.model.study.Study;
1413
import org.jabref.model.util.FileUpdateMonitor;
1514

1615
import org.eclipse.jgit.api.errors.GitAPIException;
@@ -34,9 +33,8 @@ public class Crawler {
3433
public Crawler(Path studyDefinitionFile, FileUpdateMonitor fileUpdateMonitor, ImportFormatPreferences importFormatPreferences, SavePreferences savePreferences, BibEntryTypesManager bibEntryTypesManager) throws IllegalArgumentException, IOException, ParseException, GitAPIException {
3534
Path studyRepositoryRoot = studyDefinitionFile.getParent();
3635
studyRepository = new StudyRepository(studyRepositoryRoot, new GitHandler(studyRepositoryRoot), importFormatPreferences, fileUpdateMonitor, savePreferences, bibEntryTypesManager);
37-
Study study = studyRepository.getStudy();
38-
LibraryEntryToFetcherConverter libraryEntryToFetcherConverter = new LibraryEntryToFetcherConverter(study.getActiveLibraryEntries(), importFormatPreferences);
39-
this.studyFetcher = new StudyFetcher(libraryEntryToFetcherConverter.getActiveFetchers(), study.getSearchQueryStrings());
36+
LibraryEntryToFetcherConverter libraryEntryToFetcherConverter = new LibraryEntryToFetcherConverter(studyRepository.getActiveLibraryEntries(), importFormatPreferences);
37+
this.studyFetcher = new StudyFetcher(libraryEntryToFetcherConverter.getActiveFetchers(), studyRepository.getSearchQueryStrings());
4038
}
4139

4240
/**

src/main/java/org/jabref/logic/crawler/LibraryEntryToFetcherConverter.java

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,16 @@
88
import org.jabref.logic.importer.ImportFormatPreferences;
99
import org.jabref.logic.importer.SearchBasedFetcher;
1010
import org.jabref.logic.importer.WebFetchers;
11-
import org.jabref.model.entry.BibEntry;
12-
import org.jabref.model.entry.field.UnknownField;
13-
14-
import static org.jabref.model.entry.types.SystematicLiteratureReviewStudyEntryType.LIBRARY_ENTRY;
11+
import org.jabref.model.study.LibraryEntry;
1512

1613
/**
1714
* Converts library entries from the given study into their corresponding fetchers.
1815
*/
1916
class LibraryEntryToFetcherConverter {
20-
private final List<BibEntry> libraryEntries;
17+
private final List<LibraryEntry> libraryEntries;
2118
private final ImportFormatPreferences importFormatPreferences;
2219

23-
public LibraryEntryToFetcherConverter(List<BibEntry> libraryEntries, ImportFormatPreferences importFormatPreferences) {
20+
public LibraryEntryToFetcherConverter(List<LibraryEntry> libraryEntries, ImportFormatPreferences importFormatPreferences) {
2421
this.libraryEntries = libraryEntries;
2522
this.importFormatPreferences = importFormatPreferences;
2623
}
@@ -42,9 +39,8 @@ public List<SearchBasedFetcher> getActiveFetchers() {
4239
* @param libraryEntries List of entries
4340
* @return List of fetcher instances
4441
*/
45-
private List<SearchBasedFetcher> getFetchersFromLibraryEntries(List<BibEntry> libraryEntries) {
42+
private List<SearchBasedFetcher> getFetchersFromLibraryEntries(List<LibraryEntry> libraryEntries) {
4643
return libraryEntries.parallelStream()
47-
.filter(bibEntry -> bibEntry.getType().getName().equals(LIBRARY_ENTRY.getName()))
4844
.map(this::createFetcherFromLibraryEntry)
4945
.filter(Objects::nonNull)
5046
.collect(Collectors.toList());
@@ -56,9 +52,9 @@ private List<SearchBasedFetcher> getFetchersFromLibraryEntries(List<BibEntry> li
5652
* @param libraryEntry the entry that will be converted
5753
* @return An instance of the fetcher defined by the library entry.
5854
*/
59-
private SearchBasedFetcher createFetcherFromLibraryEntry(BibEntry libraryEntry) {
55+
private SearchBasedFetcher createFetcherFromLibraryEntry(LibraryEntry libraryEntry) {
6056
Set<SearchBasedFetcher> searchBasedFetchers = WebFetchers.getSearchBasedFetchers(importFormatPreferences);
61-
String libraryNameFromFetcher = libraryEntry.getField(new UnknownField("name")).orElse("");
57+
String libraryNameFromFetcher = libraryEntry.getName();
6258
return searchBasedFetchers.stream()
6359
.filter(searchBasedFetcher -> searchBasedFetcher.getName().toLowerCase().equals(libraryNameFromFetcher.toLowerCase()))
6460
.findAny()

src/main/java/org/jabref/logic/crawler/StudyRepository.java

Lines changed: 50 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,11 @@
22

33
import java.io.FileWriter;
44
import java.io.IOException;
5-
import java.io.InputStream;
65
import java.io.Writer;
76
import java.nio.file.Files;
87
import java.nio.file.Path;
98
import java.time.LocalDate;
10-
import java.util.ArrayList;
119
import java.util.List;
12-
import java.util.Optional;
1310
import java.util.regex.Pattern;
1411
import java.util.stream.Collectors;
1512

@@ -22,14 +19,12 @@
2219
import org.jabref.logic.importer.OpenDatabase;
2320
import org.jabref.logic.importer.ParseException;
2421
import org.jabref.logic.importer.SearchBasedFetcher;
25-
import org.jabref.logic.importer.fileformat.BibtexParser;
2622
import org.jabref.model.database.BibDatabase;
2723
import org.jabref.model.database.BibDatabaseContext;
28-
import org.jabref.model.entry.BibEntry;
2924
import org.jabref.model.entry.BibEntryTypesManager;
30-
import org.jabref.model.entry.field.UnknownField;
31-
import org.jabref.model.entry.types.SystematicLiteratureReviewStudyEntryType;
3225
import org.jabref.model.study.FetchResult;
26+
import org.jabref.model.study.LibraryEntry;
27+
import org.jabref.model.study.QueryEntry;
3328
import org.jabref.model.study.QueryResult;
3429
import org.jabref.model.study.Study;
3530
import org.jabref.model.util.FileUpdateMonitor;
@@ -48,13 +43,13 @@
4843
*/
4944
class StudyRepository {
5045
// Tests work with study.bib
51-
private static final String STUDY_DEFINITION_FILE_NAME = "study.bib";
46+
private static final String STUDY_DEFINITION_FILE_NAME = "study.yml";
5247
private static final Logger LOGGER = LoggerFactory.getLogger(StudyRepository.class);
5348
private static final Pattern MATCHCOLON = Pattern.compile(":");
5449
private static final Pattern MATCHILLEGALCHARACTERS = Pattern.compile("[^A-Za-z0-9_.\\s=-]");
5550

5651
private final Path repositoryPath;
57-
private final Path studyDefinitionBib;
52+
private final Path studyDefinitionFile;
5853
private final GitHandler gitHandler;
5954
private final Study study;
6055
private final ImportFormatPreferences importFormatPreferences;
@@ -81,13 +76,13 @@ public StudyRepository(Path pathToRepository, GitHandler gitHandler, ImportForma
8176
}
8277
this.importFormatPreferences = importFormatPreferences;
8378
this.fileUpdateMonitor = fileUpdateMonitor;
84-
this.studyDefinitionBib = Path.of(repositoryPath.toString(), STUDY_DEFINITION_FILE_NAME);
79+
this.studyDefinitionFile = Path.of(repositoryPath.toString(), STUDY_DEFINITION_FILE_NAME);
8580
this.savePreferences = savePreferences;
8681
this.bibEntryTypesManager = bibEntryTypesManager;
8782

8883
if (Files.notExists(repositoryPath)) {
8984
throw new IOException("The given repository does not exists.");
90-
} else if (Files.notExists(studyDefinitionBib)) {
85+
} else if (Files.notExists(studyDefinitionFile)) {
9186
throw new IOException("The study definition file does not exist in the given repository.");
9287
}
9388
study = parseStudyFile();
@@ -116,30 +111,39 @@ public BibDatabaseContext getStudyResultEntries() throws IOException {
116111
}
117112

118113
/**
119-
* The study definition file contains all the definitions of a study. This method extracts the BibEntries from the study BiB file.
114+
* The study definition file contains all the definitions of a study. This method extracts this study from the yaml study definition file
120115
*
121116
* @return Returns the BibEntries parsed from the study definition file.
122117
* @throws IOException Problem opening the input stream.
123118
* @throws ParseException Problem parsing the study definition file.
124119
*/
125-
private Study parseStudyFile() throws IOException, ParseException {
126-
BibtexParser parser = new BibtexParser(importFormatPreferences, fileUpdateMonitor);
127-
List<BibEntry> parsedEntries = new ArrayList<>();
128-
try (InputStream inputStream = Files.newInputStream(studyDefinitionBib)) {
129-
parsedEntries.addAll(parser.parseEntries(inputStream));
130-
}
120+
private Study parseStudyFile() throws IOException {
121+
return new StudyYAMLParser().parseStudyYAMLFile(studyDefinitionFile);
122+
}
123+
124+
/**
125+
* Returns all query strings of the study definition
126+
*
127+
* @return List of all queries as Strings.
128+
*/
129+
public List<String> getSearchQueryStrings() {
130+
return study.getQueries()
131+
.parallelStream()
132+
.map(QueryEntry::getQuery)
133+
.collect(Collectors.toList());
134+
}
131135

132-
BibEntry studyEntry = parsedEntries.parallelStream()
133-
.filter(bibEntry -> bibEntry.getType().equals(SystematicLiteratureReviewStudyEntryType.STUDY_ENTRY)).findAny()
134-
.orElseThrow(() -> new ParseException("Study definition file does not contain a study entry"));
135-
List<BibEntry> queryEntries = parsedEntries.parallelStream()
136-
.filter(bibEntry -> bibEntry.getType().equals(SystematicLiteratureReviewStudyEntryType.SEARCH_QUERY_ENTRY))
137-
.collect(Collectors.toList());
138-
List<BibEntry> libraryEntries = parsedEntries.parallelStream()
139-
.filter(bibEntry -> bibEntry.getType().equals(SystematicLiteratureReviewStudyEntryType.LIBRARY_ENTRY))
140-
.collect(Collectors.toList());
141-
142-
return new Study(studyEntry, queryEntries, libraryEntries);
136+
/**
137+
* Extracts all active fetchers from the library entries.
138+
*
139+
* @return List of BibEntries of type Library
140+
* @throws IllegalArgumentException If a transformation from Library entry to LibraryDefinition fails
141+
*/
142+
public List<LibraryEntry> getActiveLibraryEntries() throws IllegalArgumentException {
143+
return study.getLibraries()
144+
.parallelStream()
145+
.filter(LibraryEntry::isEnabled)
146+
.collect(Collectors.toList());
143147
}
144148

145149
public Study getStudy() {
@@ -163,16 +167,16 @@ public void persist(List<QueryResult> crawlResults) throws IOException, GitAPIEx
163167
}
164168

165169
private void persistStudy() throws IOException {
166-
writeResultToFile(studyDefinitionBib, new BibDatabase(study.getAllEntries()));
170+
new StudyYAMLParser().writeStudyYAMLFile(study, studyDefinitionFile);
167171
}
168172

169173
/**
170174
* Create for each query a folder, and for each fetcher a bib file in the query folder to store its results.
171175
*/
172176
private void setUpRepositoryStructure() throws IOException {
173177
// Cannot use stream here since IOException has to be thrown
174-
LibraryEntryToFetcherConverter converter = new LibraryEntryToFetcherConverter(study.getActiveLibraryEntries(), importFormatPreferences);
175-
for (String query : study.getSearchQueryStrings()) {
178+
LibraryEntryToFetcherConverter converter = new LibraryEntryToFetcherConverter(this.getActiveLibraryEntries(), importFormatPreferences);
179+
for (String query : this.getSearchQueryStrings()) {
176180
createQueryResultFolder(query);
177181
converter.getActiveFetchers()
178182
.forEach(searchBasedFetcher -> createFetcherResultFile(query, searchBasedFetcher));
@@ -229,14 +233,14 @@ private void createBibFile(Path file) {
229233
* Structure: ID-trimmed query
230234
*
231235
* Examples:
232-
* Input: '(title: test-title AND abstract: Test)' as a query entry with id 1
233-
* Output: '1 - title= test-title AND abstract= Test'
236+
* Input: '(title: test-title AND abstract: Test)' as a query entry with id 12345678
237+
* Output: '12345678 - title= test-title AND abstract= Test'
234238
*
235-
* Input: 'abstract: Test*' as a query entry with id 1
236-
* Output: '1 - abstract= Test'
239+
* Input: 'abstract: Test*' as a query entry with id 87654321
240+
* Output: '87654321 - abstract= Test'
237241
*
238-
* Input: '"test driven"' as a query entry with id 1
239-
* Output: '1 - test driven'
242+
* Input: '"test driven"' as a query entry with id 12348765
243+
* Output: '12348765 - test driven'
240244
*
241245
* @param query that is trimmed and combined with its query id
242246
* @return a unique folder name for any query.
@@ -245,31 +249,20 @@ private String trimNameAndAddID(String query) {
245249
// Replace all field: with field= for folder name
246250
String trimmedNamed = MATCHCOLON.matcher(query).replaceAll("=");
247251
trimmedNamed = MATCHILLEGALCHARACTERS.matcher(trimmedNamed).replaceAll("");
248-
if (query.length() > 240) {
249-
trimmedNamed = query.substring(0, 240);
252+
String id = computeIDForQuery(query);
253+
// Whole path has to be shorter than 260
254+
int remainingPathLength = 220 - studyDefinitionFile.toString().length() - id.length();
255+
if (query.length() > remainingPathLength) {
256+
trimmedNamed = query.substring(0, remainingPathLength);
250257
}
251-
String id = findQueryIDByQueryString(query);
252258
return id + " - " + trimmedNamed;
253259
}
254260

255261
/**
256-
* Helper to find the query id for folder name creation.
257-
* Returns the id of the first SearchQuery BibEntry with a query field that matches the given query.
258-
*
259-
* @param query The query whose ID is searched
260-
* @return ID of the query defined in the study definition.
262+
* Helper to compute the query id for folder name creation.
261263
*/
262-
private String findQueryIDByQueryString(String query) {
263-
String queryField = "query";
264-
return study.getSearchQueryEntries()
265-
.parallelStream()
266-
.filter(bibEntry -> bibEntry.getField(new UnknownField(queryField)).orElse("").equals(query))
267-
.map(BibEntry::getCitationKey)
268-
.filter(Optional::isPresent)
269-
.map(Optional::get)
270-
.findFirst()
271-
.orElseThrow()
272-
.replaceFirst(queryField, "");
264+
private String computeIDForQuery(String query) {
265+
return String.valueOf(query.hashCode());
273266
}
274267

275268
/**
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
package org.jabref.logic.crawler;
2+
3+
import java.io.FileInputStream;
4+
import java.io.IOException;
5+
import java.io.InputStream;
6+
import java.nio.file.Path;
7+
8+
import org.jabref.model.study.Study;
9+
10+
import com.fasterxml.jackson.databind.ObjectMapper;
11+
import com.fasterxml.jackson.databind.SerializationFeature;
12+
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
13+
import com.fasterxml.jackson.dataformat.yaml.YAMLGenerator;
14+
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
15+
16+
public class StudyYAMLParser {
17+
18+
/**
19+
* Parses the given yaml study definition file into a study instance
20+
*/
21+
public Study parseStudyYAMLFile(Path studyYAMLFile) throws IOException {
22+
ObjectMapper yamlMapper = new ObjectMapper(new YAMLFactory());
23+
yamlMapper.registerModule(new JavaTimeModule());
24+
try (InputStream fileInputStream = new FileInputStream(studyYAMLFile.toFile())) {
25+
return yamlMapper.readValue(fileInputStream, Study.class);
26+
}
27+
}
28+
29+
/**
30+
* Writes the given study instance into a yaml file to the given path
31+
*/
32+
public void writeStudyYAMLFile(Study study, Path studyYAMLFile) throws IOException {
33+
ObjectMapper yamlMapper = new ObjectMapper(new YAMLFactory().disable(YAMLGenerator.Feature.WRITE_DOC_START_MARKER).enable(YAMLGenerator.Feature.MINIMIZE_QUOTES));
34+
yamlMapper.registerModule(new JavaTimeModule());
35+
yamlMapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
36+
yamlMapper.writeValue(studyYAMLFile.toFile(), study);
37+
}
38+
}

0 commit comments

Comments
 (0)