Skip to content

Commit

Permalink
Use regexp to remove non-ASCII characters from DOI and inform user wh…
Browse files Browse the repository at this point in the history
…en data for valid DOI does not exist #8127 (#8228)
  • Loading branch information
mrcstan authored Nov 13, 2021
1 parent 079a2d2 commit 2929900
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
- When determining the URL of an ArXiV eprint, the URL now points to the version [#8149](https://github.com/JabRef/jabref/pull/8149)
- We Included all standard fields with citation key when exporting to Old OpenOffice/LibreOffice Calc Format [#8176](https://github.com/JabRef/jabref/pull/8176)
- We present options to manually enter an article or return to the New Entry menu when the fetcher DOI fails to find an entry for an ID [#7870](https://github.com/JabRef/jabref/issues/7870)
- We remove all white space and non-ASCII characters from a DOI before parsing it [#8127](https://github.com/JabRef/jabref/issues/8127)

### Fixed

Expand Down
10 changes: 8 additions & 2 deletions src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ public Optional<HelpFile> getHelpPage() {
@Override
public Optional<BibEntry> performSearchById(String identifier) throws FetcherException {
Optional<DOI> doi = DOI.parse(identifier);

try {
if (doi.isPresent()) {
Optional<BibEntry> fetchedEntry;
Expand All @@ -70,13 +71,18 @@ public Optional<BibEntry> performSearchById(String identifier) throws FetcherExc
if (getAgency(doi.get()).isPresent() && "medra".equalsIgnoreCase(getAgency(doi.get()).get())) {
return new Medra().performSearchById(identifier);
}

URL doiURL = new URL(doi.get().getURIAsASCIIString());

// BibTeX data
URLDownload download = getUrlDownload(doiURL);
download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX);
String bibtexString = download.asString();
String bibtexString;
try {
bibtexString = download.asString();
} catch (IOException e) {
// an IOException will be thrown if download is unable to download from the doiURL
throw new FetcherException(Localization.lang("No DOI data exists"), e);
}

// BibTeX entry
fetchedEntry = BibtexParser.singleFromString(bibtexString, preferences, new DummyFileUpdateMonitor());
Expand Down
13 changes: 11 additions & 2 deletions src/main/java/org/jabref/model/entry/identifier/DOI.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,14 @@ public class DOI implements Identifier {
private static final Pattern FIND_SHORT_DOI_SHORTCUT = Pattern.compile(IN_TEXT_SHORT_DOI_SHORTCUT, Pattern.CASE_INSENSITIVE); // eg doi.org/bfrhmx (no "10/")
private static final Pattern EXACT_SHORT_DOI_PATT = Pattern.compile(SHORT_DOI_EXP_PREFIX + SHORT_DOI_EXP, Pattern.CASE_INSENSITIVE);
private static final Pattern FIND_SHORT_DOI_PATT = Pattern.compile("(?:https?://[^\\s]+?)?" + FIND_SHORT_DOI_EXP, Pattern.CASE_INSENSITIVE);

// Characters that are stripped from raw input before it is interpreted as a DOI.
// The expression is a single character class that is the union of:
//   \s             any white space character, i.e., space, \t, \n, \x0B, \f, \r
//   [^\x00-\x7F]   any character outside the ASCII range
// No quantifier is needed: a replace-all operation already removes every matching character.
// Note that a '+' written inside the brackets is NOT a quantifier but a literal plus sign;
// including it would silently delete '+' from otherwise valid DOIs, so it is deliberately absent.
// See https://stackoverflow.com/questions/3203190/regex-any-ascii-character for the regexp that includes ASCII characters only
// Another reference for regexp for ASCII characters: https://howtodoinjava.com/java/regex/java-clean-ascii-text-non-printable-chars/
private static final String CHARS_TO_REMOVE = "[\\s" // remove white space characters, i.e., \t, \n, \x0B, \f, \r
        + "[^\\x00-\\x7F]" // strips off all non-ASCII characters
        + "]";

// DOI
private final String doi;
// Short DOI
Expand Down Expand Up @@ -151,8 +159,9 @@ public DOI(String doi) {
*/
public static Optional<DOI> parse(String doi) {
try {
String cleanedDOI = doi.trim();
cleanedDOI = doi.replaceAll(" ", "");
String cleanedDOI = doi;
cleanedDOI = cleanedDOI.replaceAll(CHARS_TO_REMOVE, "");

return Optional.of(new DOI(cleanedDOI));
} catch (IllegalArgumentException | NullPointerException e) {
return Optional.empty();
Expand Down
2 changes: 2 additions & 0 deletions src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,8 @@ No\ journal\ names\ could\ be\ abbreviated.=No journal names could be abbreviate

No\ journal\ names\ could\ be\ unabbreviated.=No journal names could be unabbreviated.

No\ DOI\ data\ exists=No DOI data exists

not=not

not\ found=not found
Expand Down
7 changes: 7 additions & 0 deletions src/test/java/org/jabref/model/entry/identifier/DOITest.java
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,13 @@ private static Stream<Arguments> testData() {
Arguments.of("https://doi.org/10.1109/VLHCC.2004.20", DOI.parse("https : / / doi.org / 10 .1109 /V LHCC.20 04.20").get().getURIAsASCIIString()),
// parse short DOI with whitespace
Arguments.of("https://doi.org/10/gf4gqc", DOI.parse("https : / / doi.org / 10 / gf4gqc").get().getURIAsASCIIString()),
// parse DOI with non-ASCII characters and whitespace
Arguments.of("https://doi.org/10/gf4gqc", DOI.parse("�https : \n ␛ / / doi.org / \t 10 / \r gf4gqc�␛").get().getURIAsASCIIString()),
Arguments.of("10/gf4gqc", DOI.parse("�https : \n ␛ / / doi.org / \t 10 / \r gf4gqc�␛").get().getDOI()),
Arguments.of("10/gf4gqc", DOI.parse(" 10 / gf4gqc ").get().getDOI()),
Arguments.of("10.3218/3846-0", DOI.parse(" �10.3218\n/384␛6-0�").get().getDOI()),
// parse already-cleaned DOI
Arguments.of("10.3218/3846-0", DOI.parse("10.3218/3846-0").get().getDOI()),

// correctlyEncodeDOIs
// See http://www.doi.org/doi_handbook/2_Numbering.html#2.5.2.4
Expand Down

0 comments on commit 2929900

Please sign in to comment.