Use regexp to remove non-ASCII characters from DOI and inform user when data for valid DOI does not exist #8127 (#8228)
JabRef · Nov 13, 2021 · 2929900 · 2929900
1 parent 079a2d2
commit 2929900
Show file tree

Hide file tree

Showing 5 changed files with 29 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -42,6 +42,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
 - When determining the URL of an ArXiV eprint, the URL now points to the version [#8149](https://github.com/JabRef/jabref/pull/8149)
 - We included all standard fields with citation key when exporting to Old OpenOffice/LibreOffice Calc Format [#8176](https://github.com/JabRef/jabref/pull/8176)
 - We present options to manually enter an article or return to the New Entry menu when the fetcher DOI fails to find an entry for an ID [#7870](https://github.com/JabRef/jabref/issues/7870)
+- We trim white space and non-ASCII characters from DOI [#8127](https://github.com/JabRef/jabref/issues/8127)
 
 ### Fixed
 

diff --git a/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java b/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java
@@ -62,6 +62,7 @@ public Optional<HelpFile> getHelpPage() {
     @Override
     public Optional<BibEntry> performSearchById(String identifier) throws FetcherException {
         Optional<DOI> doi = DOI.parse(identifier);
+
         try {
             if (doi.isPresent()) {
                 Optional<BibEntry> fetchedEntry;
@@ -70,13 +71,18 @@ public Optional<BibEntry> performSearchById(String identifier) throws FetcherExc
                 if (getAgency(doi.get()).isPresent() && "medra".equalsIgnoreCase(getAgency(doi.get()).get())) {
                     return new Medra().performSearchById(identifier);
                 }
-
                 URL doiURL = new URL(doi.get().getURIAsASCIIString());
 
                 // BibTeX data
                 URLDownload download = getUrlDownload(doiURL);
                 download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX);
-                String bibtexString = download.asString();
+                String bibtexString;
+                try {
+                    bibtexString = download.asString();
+                } catch (IOException e) {
+                    // an IOException will be thrown if download is unable to download from the doiURL
+                    throw new FetcherException(Localization.lang("No DOI data exists"), e);
+                }
 
                 // BibTeX entry
                 fetchedEntry = BibtexParser.singleFromString(bibtexString, preferences, new DummyFileUpdateMonitor());

diff --git a/src/main/java/org/jabref/model/entry/identifier/DOI.java b/src/main/java/org/jabref/model/entry/identifier/DOI.java
@@ -88,6 +88,14 @@ public class DOI implements Identifier {
     private static final Pattern FIND_SHORT_DOI_SHORTCUT = Pattern.compile(IN_TEXT_SHORT_DOI_SHORTCUT, Pattern.CASE_INSENSITIVE); // eg doi.org/bfrhmx (no "10/")
     private static final Pattern EXACT_SHORT_DOI_PATT = Pattern.compile(SHORT_DOI_EXP_PREFIX + SHORT_DOI_EXP, Pattern.CASE_INSENSITIVE);
     private static final Pattern FIND_SHORT_DOI_PATT = Pattern.compile("(?:https?://[^\\s]+?)?" + FIND_SHORT_DOI_EXP, Pattern.CASE_INSENSITIVE);
+
+    // Characters stripped from the raw input string before it is parsed as a DOI:
+    //   \s            any whitespace character (space, \t, \n, \x0B, \f, \r)
+    //   [^\x00-\x7F]  any character outside the ASCII range
+    // Both are combined as a union inside a single character class. No quantifier is needed because
+    // String#replaceAll already replaces every match. A '+' inside a character class is a literal plus
+    // sign (not a quantifier) and would strip '+' from otherwise valid DOIs, so it must not appear here.
+    // See https://stackoverflow.com/questions/3203190/regex-any-ascii-character for the ASCII-only range
+    private static final String CHARS_TO_REMOVE = "[\\s" // white space characters
+                                                + "[^\\x00-\\x7F]" // all non-ASCII characters
+                                                + "]";
+
     // DOI
     private final String doi;
     // Short DOI
@@ -151,8 +159,9 @@ public DOI(String doi) {
      */
     public static Optional<DOI> parse(String doi) {
         try {
-            String cleanedDOI = doi.trim();
-            cleanedDOI = doi.replaceAll(" ", "");
+            String cleanedDOI = doi;
+            cleanedDOI = cleanedDOI.replaceAll(CHARS_TO_REMOVE, "");
+
             return Optional.of(new DOI(cleanedDOI));
         } catch (IllegalArgumentException | NullPointerException e) {
             return Optional.empty();

diff --git a/src/main/resources/l10n/JabRef_en.properties b/src/main/resources/l10n/JabRef_en.properties
@@ -555,6 +555,8 @@ No\ journal\ names\ could\ be\ abbreviated.=No journal names could be abbreviate
 
 No\ journal\ names\ could\ be\ unabbreviated.=No journal names could be unabbreviated.
 
+No\ DOI\ data\ exists=No DOI data exists
+
 not=not
 
 not\ found=not found

diff --git a/src/test/java/org/jabref/model/entry/identifier/DOITest.java b/src/test/java/org/jabref/model/entry/identifier/DOITest.java
@@ -118,6 +118,13 @@ private static Stream<Arguments> testData() {
                 Arguments.of("https://doi.org/10.1109/VLHCC.2004.20", DOI.parse("https : / / doi.org / 10 .1109 /V LHCC.20 04.20").get().getURIAsASCIIString()),
                 // parse short DOI with whitespace
                 Arguments.of("https://doi.org/10/gf4gqc", DOI.parse("https : / / doi.org / 10 / gf4gqc").get().getURIAsASCIIString()),
+                // parse DOI with non-ASCII characters and whitespace
+                Arguments.of("https://doi.org/10/gf4gqc", DOI.parse("�https : \n  ␛ / / doi.org / \t 10 / \r gf4gqc�␛").get().getURIAsASCIIString()),
+                Arguments.of("10/gf4gqc", DOI.parse("�https : \n  ␛ / / doi.org / \t 10 / \r gf4gqc�␛").get().getDOI()),
+                Arguments.of("10/gf4gqc", DOI.parse(" 10 / gf4gqc ").get().getDOI()),
+                Arguments.of("10.3218/3846-0", DOI.parse(" �10.3218\n/384␛6-0�").get().getDOI()),
+                // parse already-cleaned DOI
+                Arguments.of("10.3218/3846-0", DOI.parse("10.3218/3846-0").get().getDOI()),
 
                 // correctlyEncodeDOIs
                 // See http://www.doi.org/doi_handbook/2_Numbering.html#2.5.2.4
-Original file line number
+Diff line change
@@ Expand Up @@
     No\ journal\ names\ could\ be\ unabbreviated.=No journal names could be unabbreviated.
+    No\ DOI\ data\ exists=No DOI data exists
     not=not
     not\ found=not found
@@ Expand Down @@