Skip to content

Commit

Permalink
Enhance DOI parser to deal with special characters (#10989)
Browse files Browse the repository at this point in the history
* Enhance DOI parser to deal with special characters

* add a comment for more clarification
  • Loading branch information
AbdAlRahmanGad authored Mar 6, 2024
1 parent 985b40b commit 79ba1f0
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 0 deletions.
5 changes: 5 additions & 0 deletions src/main/java/org/jabref/model/entry/identifier/DOI.java
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ public class DOI implements Identifier {
// See https://stackoverflow.com/questions/3203190/regex-any-ascii-character for the regexp that includes ASCII characters only
// Another reference for regexp for ASCII characters: https://howtodoinjava.com/java/regex/java-clean-ascii-text-non-printable-chars/
// Regex character class listing every character that is stripped from a DOI candidate via replaceAll(CHARS_TO_REMOVE, "").
// The whole expression is ONE character class: the leading "[" opens it and the final "]" closes it.
private static final String CHARS_TO_REMOVE = "[\\s+" // remove white space characters, i.e., \t, \n, \x0B, \f, \r. NOTE(review): inside a character class, + is a literal plus sign, not a quantifier, so '+' is stripped as well - confirm this is intended
+ "\\\\" // remove backslashes (four backslashes in Java source = one escaped backslash in the regex)
+ "{}" // remove curly brackets (literal inside a character class, no escaping needed)
+ "\\[\\]`|" // remove square brackets, backticks, and pipes
+ "[^\\x00-\\x7F]" // nested negated class, combined by union with the enclosing class: strips off all non-ASCII characters
+ "]";

Expand Down Expand Up @@ -167,6 +170,8 @@ public static Optional<DOI> parse(String doi) {
LatexToUnicodeFormatter formatter = new LatexToUnicodeFormatter();
String cleanedDOI = doi;
cleanedDOI = URLDecoder.decode(cleanedDOI, StandardCharsets.UTF_8);
// needs to be handled before LatexToUnicode, because otherwise `^` will be treated as conversion superscript
cleanedDOI = cleanedDOI.replaceAll("\\^", "");
cleanedDOI = formatter.format(cleanedDOI);
cleanedDOI = cleanedDOI.replaceAll(CHARS_TO_REMOVE, "");

Expand Down
18 changes: 18 additions & 0 deletions src/test/java/org/jabref/model/entry/identifier/DOITest.java
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,24 @@ private static Stream<Arguments> testData() {
Arguments.of("10/gf4gqc", DOI.parse("�https : \n ␛ / / doi.org / \t 10 / \r gf4gqc�␛").get().getDOI()),
Arguments.of("10/gf4gqc", DOI.parse(" 10 / gf4gqc ").get().getDOI()),
Arguments.of("10.3218/3846-0", DOI.parse(" �10.3218\n/384␛6-0�").get().getDOI()),
// parse DOI with backslashes
Arguments.of("10.1007/978-3-030-02671-4_7", DOI.parse("10.1007/978-3-030-02671-4\\_7").get().getDOI()),
Arguments.of("10.1007/978-3-030-02671-4_7", DOI.parse("10.1007/\\978-3-03\\0-02671-4\\_7").get().getDOI()),
Arguments.of("https://doi.org/10.1007/978-3-030-02671-4_7", DOI.parse("https://doi.org/10.\\\\1007/9\\\\78-3\\\\-030-026\\\\\\71-4_7").get().getURIAsASCIIString()),
// parse DOI with {}
Arguments.of("10.1007/978-3-030-02671-4_7", DOI.parse("10.1007/9{}78{-3{-03{0-0}}}26}{71-4}_7").get().getDOI()),
// parse DOI with `
Arguments.of("10.1007/978-3-030-02671-4_7", DOI.parse("10.1007/9`78`-3`-03`0-0``26````71-4}_7").get().getDOI()),
// parse DOI with |
Arguments.of("10.1007/978-3-030-02671-4_7", DOI.parse("10.1007/9||78|-3|-03|0-0|26|71-4|||_7").get().getDOI()),
// parse DOI with ~
Arguments.of("10.1007/978-3-030-02671-4_7", DOI.parse("10.1007/9~~~78~-3~-03~0-0~26~71-4~_7").get().getDOI()),
// parse DOI with []
Arguments.of("10.1007/978-3-030-02671-4_7", DOI.parse("10.1007/][9[][]78-3-03[[]0-02671-4_7").get().getDOI()),
// parse DOI with ^
Arguments.of("10.1007/978-3-030-02671-4_7", DOI.parse("^^^10.10^07/978-3^-0^30-02671-4_7").get().getDOI()),
// parse DOI with special characters
Arguments.of("10.1007/978-3-030-02671-4_7", DOI.parse("10.1^00^^7/9|~^]`7^8-3~[[[]]-0^3]~0-0~26``71-4~||_7").get().getDOI()),
// parse already-cleaned DOI
Arguments.of("10.3218/3846-0", DOI.parse("10.3218/3846-0").get().getDOI()),

Expand Down

0 comments on commit 79ba1f0

Please sign in to comment.