-
-
Notifications
You must be signed in to change notification settings - Fork 3k
Add option to parse new references from plain text using GROBID service [solving #4826] #5614
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
724d499
d9f938d
9af46ab
28201e4
7d8ea0a
29d8c84
83106ff
f574395
23bb8ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| package org.jabref.logic.importer.fetcher; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.ArrayList; | ||
| import java.util.Arrays; | ||
| import java.util.Collections; | ||
| import java.util.List; | ||
| import java.util.Optional; | ||
| import java.util.stream.Collectors; | ||
|
|
||
| import org.jabref.logic.importer.ImportFormatPreferences; | ||
| import org.jabref.logic.importer.ParseException; | ||
| import org.jabref.logic.importer.SearchBasedFetcher; | ||
| import org.jabref.logic.importer.fileformat.BibtexParser; | ||
| import org.jabref.logic.importer.util.GrobidService; | ||
| import org.jabref.model.entry.BibEntry; | ||
| import org.jabref.model.util.DummyFileUpdateMonitor; | ||
|
|
||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| public class GrobidCitationFetcher implements SearchBasedFetcher { | ||
|
|
||
| private static final Logger LOGGER = LoggerFactory.getLogger(GrobidCitationFetcher.class); | ||
| private static final String GROBID_URL = "http://grobid.cm.in.tum.de:8070"; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please change to
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This address is not working for some reason... |
||
| private ImportFormatPreferences importFormatPreferences; | ||
| private GrobidService grobidService; | ||
|
|
||
| public GrobidCitationFetcher(ImportFormatPreferences importFormatPreferences) { | ||
| this.importFormatPreferences = importFormatPreferences; | ||
| this.grobidService = new GrobidService(GROBID_URL); | ||
| } | ||
|
|
||
| /** | ||
| * Passes request to grobid server, using consolidateCitations option to improve result. | ||
| * Takes a while, since the server has to look up the entry. | ||
| * @return A BibTeX-String if extraction is successful and an empty String otherwise. | ||
| */ | ||
NikodemKch marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| private String parseUsingGrobid(String plainText) { | ||
| try { | ||
| return grobidService.processCitation(plainText, GrobidService.ConsolidateCitations.WITH_METADATA); | ||
| } catch (IOException e) { | ||
| LOGGER.debug("Could not process citation", e); | ||
| return ""; | ||
| } | ||
| } | ||
|
|
||
| private Optional<BibEntry> parseBibToBibEntry(String bibtexString) { | ||
| try { | ||
| return BibtexParser.singleFromString(bibtexString, | ||
| importFormatPreferences, new DummyFileUpdateMonitor()); | ||
| } catch (ParseException e) { | ||
| return Optional.empty(); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public List<BibEntry> performSearch(String query) { | ||
| List<String> plainReferences = Arrays.stream( query.split( "\\r\\r+|\\n\\n+|\\r\\n(\\r\\n)+" ) ) | ||
| .map(String::trim) | ||
| .filter(str -> !str.isBlank()) | ||
| .collect(Collectors.toCollection(ArrayList::new)); | ||
| if (plainReferences.isEmpty()) { | ||
| return Collections.emptyList(); | ||
| } else { | ||
| return plainReferences.stream() | ||
| .map(reference -> parseBibToBibEntry(parseUsingGrobid(reference))) | ||
| .flatMap(Optional::stream) | ||
| .collect(Collectors.toList()); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public String getName() { | ||
| return "GROBID"; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,57 @@ | ||
| package org.jabref.logic.importer.util; | ||
|
|
||
| import java.io.IOException; | ||
| import java.net.URLEncoder; | ||
| import java.nio.charset.StandardCharsets; | ||
|
|
||
| import org.jabref.logic.net.URLDownload; | ||
|
|
||
| /** | ||
| * Implements an API to a GROBID server, as described at | ||
| * https://grobid.readthedocs.io/en/latest/Grobid-service/#grobid-web-services | ||
| * <p> | ||
| * Note: Currently a custom GROBID server is used... | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess it would be a good idea to have the custom GROBID server also as a repository under the JabRef umbrella organization. What do you think? refs also @koppor
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure this makes sense. The Grobid server is here: https://github.com/kermitt2/grobid
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Once this merge request is accepted we can also use the default grobid server (Thank you @koppor ): https://github.com/kermitt2/grobid/pull/532/files There is also a default server hosted by grobid itself (http://cloud.science-miner.com/grobid/), but it never works.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please ensure that you send an
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have added the header, but commented out with a TODO comment, since GROBID does not work with this header right now |
||
| * https://github.com/NikodemKch/grobid | ||
| * <p> | ||
| * The methods are structured to match the GROBID server api. | ||
| * Each method corresponds to a GROBID service request. Only the requests that are currently used are implemented. | ||
| */ | ||
| public class GrobidService { | ||
|
|
||
/**
 * Consolidation modes that can be requested when processing a citation.
 * Each constant carries the integer code associated with that mode.
 */
public enum ConsolidateCitations {
    NO(0), WITH_METADATA(1), WITH_DOI_ONLY(2);

    // Fixed per constant, so it is final.
    private final int code;

    ConsolidateCitations(int code) {
        this.code = code;
    }

    /**
     * @return the integer code of this mode (0, 1 or 2)
     */
    public int getCode() {
        return this.code;
    }
}
|
|
||
| String grobidServerURL; | ||
|
|
||
| public GrobidService(String grobidServerURL) { | ||
| this.grobidServerURL = grobidServerURL; | ||
| } | ||
|
|
||
| /** | ||
| * @return A BibTeX-String if extraction is successful and an IOException otherwise. | ||
| */ | ||
| public String processCitation(String rawCitation, ConsolidateCitations consolidateCitations) throws IOException { | ||
| rawCitation = URLEncoder.encode(rawCitation, StandardCharsets.UTF_8); | ||
| URLDownload urlDownload = new URLDownload(grobidServerURL | ||
| + "/api/processCitation"); | ||
| //urlDownload.addHeader("Accept", "application/x-bibtex"); //TODO: Uncomment as soon as the default GROBID server is used. | ||
| urlDownload.setPostData("citations=" + rawCitation + "&consolidateCitations=" + consolidateCitations); | ||
tobiasdiez marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| String httpResponse = urlDownload.asString(); | ||
|
|
||
| if (httpResponse == null || httpResponse.equals("@misc{-1,\n\n}\n")) { //This filters empty BibTeX entries | ||
| throw new IOException("The GROBID server response does not contain anything."); | ||
| } | ||
|
|
||
| return httpResponse; | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.