From eca13ddb173ba8f49f6324935c90d0db1ccc0001 Mon Sep 17 00:00:00 2001 From: Christoph Date: Mon, 4 Jan 2021 17:37:23 +0100 Subject: [PATCH] Fix Normalize pages formatter not replacing dashes (#7243) Co-authored-by: Oliver Kopp --- CHANGELOG.md | 1 + .../bibtexfields/NormalizePagesFormatter.java | 73 +++++---- .../casechanger/ProtectTermsFormatter.java | 5 + .../casechanger/UnprotectTermsFormatter.java | 63 +++++++ .../casechanger/UpperCaseFormatter.java | 6 +- src/main/resources/l10n/JabRef_en.properties | 2 + .../NormalizePagesFormatterTest.java | 154 +++++++++--------- .../UnprotectTermsFormatterTest.java | 39 +++++ 8 files changed, 225 insertions(+), 118 deletions(-) create mode 100644 src/main/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatter.java create mode 100644 src/test/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatterTest.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 32ce3adaeb0..9f1436c4c5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve ### Fixed +- We fixed an issue where the "Normalize page numbers" formatter did not replace en-dashes or em-dashes with a hyphen-minus sign. [#7239](https://github.com/JabRef/jabref/issues/7239) - We fixed an issue with the style of highlighted check boxes while searching in preferences. [#7226](https://github.com/JabRef/jabref/issues/7226) - We fixed an issue where the option "Move file to file directory" was disabled in the entry editor for all files [#7194](https://github.com/JabRef/jabref/issues/7194) - We fixed an issue where application dialogs were opening in the wrong display when using multiple screens [#7273](https://github.com/JabRef/jabref/pull/7273) diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatter.java index 8a9163b73f5..8bdb2aef9b4 100644 --- a/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatter.java +++ b/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatter.java @@ -5,27 +5,41 @@ import java.util.regex.Pattern; import org.jabref.logic.cleanup.Formatter; +import org.jabref.logic.formatter.casechanger.UnprotectTermsFormatter; import org.jabref.logic.l10n.Localization; -import com.google.common.base.Strings; - /** * This class includes sensible defaults for consistent formatting of BibTeX page numbers. - * + *

+ * Format page numbers, separated either by commas or double-hyphens. + * Converts the range number format of the pages field to page_number--page_number. + * Removes unwanted literals except letters, numbers and -+ signs. + * Keeps the existing String if the resulting field does not match the expected Regex. + *

* From BibTeX manual: * One or more page numbers or range of numbers, such as 42--111 or 7,41,73--97 or 43+ * (the '+' in this last example indicates pages following that don't form a simple range). * To make it easier to maintain Scribe-compatible databases, the standard styles convert * a single dash (as in 7-33) to the double dash used in TEX to denote number ranges (as in 7--33). + *

+ * Examples: + * + *

*/ public class NormalizePagesFormatter extends Formatter { - // "startpage" and "endpage" are named groups. See http://stackoverflow.com/a/415635/873282 for a documentation - private static final Pattern PAGES_DETECT_PATTERN = Pattern.compile("\\A(?(\\d+:)?\\d+)(?:-{1,2}(?(\\d+:)?\\d+))?\\Z"); + private static final Pattern EM_EN_DASH_PATTERN = Pattern.compile("\u2013|\u2014"); + private static final Pattern DASHES_DETECT_PATTERN = Pattern.compile("[ ]*-+[ ]*"); - private static final String REJECT_LITERALS = "[^a-zA-Z0-9,\\-\\+,:]"; - private static final String PAGES_REPLACE_PATTERN = "${startpage}--${endpage}"; - private static final String SINGLE_PAGE_REPLACE_PATTERN = "$1"; + private final Formatter unprotectTermsFormatter = new UnprotectTermsFormatter(); @Override public String getName() { @@ -37,44 +51,31 @@ public String getKey() { return "normalize_page_numbers"; } - /** - * Format page numbers, separated either by commas or double-hyphens. - * Converts the range number format of the pages field to page_number--page_number. - * Removes unwanted literals except letters, numbers and -+ signs. - * Keeps the existing String if the resulting field does not match the expected Regex. - * - * - * 1-2 -> 1--2 - * 1,2,3 -> 1,2,3 - * {1}-{2} -> 1--2 - * 43+ -> 43+ - * Invalid -> Invalid - * - */ @Override public String format(String value) { Objects.requireNonNull(value); - if (value.isEmpty()) { - // nothing to do return value; } + value = value.trim(); + // Remove pages prefix - String cleanValue = value.replace("pp.", "").replace("p.", ""); - // remove unwanted literals including en dash, em dash, and whitespace - cleanValue = cleanValue.replaceAll("\u2013|\u2014", "-").replaceAll(REJECT_LITERALS, ""); - // try to find pages pattern - Matcher matcher = PAGES_DETECT_PATTERN.matcher(cleanValue); - if (matcher.matches()) { - // replace - if (Strings.isNullOrEmpty(matcher.group("endpage"))) { - return matcher.replaceFirst(SINGLE_PAGE_REPLACE_PATTERN); - } else { - return matcher.replaceFirst(PAGES_REPLACE_PATTERN); + value = value.replace("pp.", "").replace("p.", "").trim(); + + // replace em and en dashes by -- + value = EM_EN_DASH_PATTERN.matcher(value).replaceAll("--"); + + Matcher matcher = DASHES_DETECT_PATTERN.matcher(value); + if (matcher.find() && matcher.start() >= 0) { + String fixedValue = matcher.replaceFirst("--"); + if (matcher.find()) { + // multiple occurrences --> better do no replacement + return value; } + return unprotectTermsFormatter.format(fixedValue); } - // no replacement + return value; } diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/ProtectTermsFormatter.java b/src/main/java/org/jabref/logic/formatter/casechanger/ProtectTermsFormatter.java index 19280858b36..3a933130f3b 100644 --- a/src/main/java/org/jabref/logic/formatter/casechanger/ProtectTermsFormatter.java +++ b/src/main/java/org/jabref/logic/formatter/casechanger/ProtectTermsFormatter.java @@ -8,6 +8,11 @@ import org.jabref.logic.protectedterms.ProtectedTermsLoader; import org.jabref.logic.util.strings.StringLengthComparator; +/** + * Adds {} brackets around acronyms, month names and countries to preserve their case. + * + * Related formatter: {@link org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter} + */ public class ProtectTermsFormatter extends Formatter { private final ProtectedTermsLoader protectedTermsLoader; diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatter.java b/src/main/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatter.java new file mode 100644 index 00000000000..cc2d5ab2096 --- /dev/null +++ b/src/main/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatter.java @@ -0,0 +1,63 @@ +package org.jabref.logic.formatter.casechanger; + +import java.util.Objects; + +import org.jabref.logic.cleanup.Formatter; +import org.jabref.logic.l10n.Localization; + +/** + * Remove {} braces around words in case they appear balanced + * + * Related formatter: {@link ProtectTermsFormatter} + */ +public class UnprotectTermsFormatter extends Formatter { + + @Override + public String format(String text) { + // similar implementation at {@link org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter.hasNegativeBraceCount} + Objects.requireNonNull(text); + if (text.isEmpty()) { + return text; + } + StringBuilder result = new StringBuilder(); + int level = 0; + int index = 0; + do { + char charAtIndex = text.charAt(index); + if (charAtIndex == '{') { + level++; + } else if (charAtIndex == '}') { + level--; + } else { + result.append(charAtIndex); + } + index++; + } while (index < text.length() && level >= 0); + if (level != 0) { + // in case of unbalanced braces, the original text is returned unmodified + return text; + } + return result.toString(); + } + + @Override + public String getDescription() { + return Localization.lang( + "Removes all balanced {} braces around words."); + } + + @Override + public String getExampleInput() { + return "{In} {CDMA}"; + } + + @Override + public String getName() { + return Localization.lang("Unprotect terms"); + } + + @Override + public String getKey() { + return "unprotect_terms"; + } +} diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/UpperCaseFormatter.java b/src/main/java/org/jabref/logic/formatter/casechanger/UpperCaseFormatter.java index 59f7e737f85..54fb76f40be 100644 --- a/src/main/java/org/jabref/logic/formatter/casechanger/UpperCaseFormatter.java +++ b/src/main/java/org/jabref/logic/formatter/casechanger/UpperCaseFormatter.java @@ -3,6 +3,9 @@ import org.jabref.logic.cleanup.Formatter; import org.jabref.logic.l10n.Localization; +/** + * Converts all characters of the given string to upper case, but does not change words starting with "{" + */ public class UpperCaseFormatter extends Formatter { @Override @@ -15,9 +18,6 @@ public String getKey() { return "upper_case"; } - /** - * Converts all characters of the given string to upper case, but does not change words starting with "{" - */ @Override public String format(String input) { Title title = new Title(input); diff --git a/src/main/resources/l10n/JabRef_en.properties b/src/main/resources/l10n/JabRef_en.properties index 8b5b30bc65b..a402c9325b1 100644 --- a/src/main/resources/l10n/JabRef_en.properties +++ b/src/main/resources/l10n/JabRef_en.properties @@ -1430,6 +1430,7 @@ Add\ enclosing\ braces=Add enclosing braces Add\ braces\ encapsulating\ the\ complete\ field\ content.=Add braces encapsulating the complete field content. Remove\ enclosing\ braces=Remove enclosing braces Removes\ braces\ encapsulating\ the\ complete\ field\ content.=Removes braces encapsulating the complete field content. +Removes\ all\ balanced\ {}\ braces\ around\ words.=Removes all balanced {} braces around words. Shorten\ DOI=Shorten DOI Shortens\ DOI\ to\ more\ human\ readable\ form.=Shortens DOI to more human readable form. Sentence\ case=Sentence case @@ -2283,5 +2284,6 @@ Regular\ expression=Regular expression Error\ importing.\ See\ the\ error\ log\ for\ details.=Error importing. See the error log for details. +Unprotect\ terms=Unprotect terms Error\ connecting\ to\ Writer\ document=Error connecting to Writer document You\ need\ to\ open\ Writer\ with\ a\ document\ before\ connecting=You need to open Writer with a document before connecting diff --git a/src/test/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatterTest.java b/src/test/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatterTest.java index b0b019ddbe0..1281c8a5202 100644 --- a/src/test/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatterTest.java +++ b/src/test/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatterTest.java @@ -1,7 +1,10 @@ package org.jabref.logic.formatter.bibtexfields; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; +import java.util.stream.Stream; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -10,102 +13,95 @@ */ public class NormalizePagesFormatterTest { - private NormalizePagesFormatter formatter; + private final NormalizePagesFormatter formatter = new NormalizePagesFormatter(); - @BeforeEach - public void setUp() { - formatter = new NormalizePagesFormatter(); - } + private static Stream tests() { + return Stream.of( + // formatSinglePageResultsInNoChange + Arguments.of("1", "1"), - @Test - public void formatSinglePageResultsInNoChange() { - assertEquals("1", formatter.format("1")); - } + // formatPageNumbers + Arguments.of("1--2", "1-2"), - @Test - public void formatPageNumbers() { - assertEquals("1--2", formatter.format("1-2")); - } + // endash + Arguments.of("1--2", "1\u20132"), - @Test - public void formatPageNumbersCommaSeparated() { - assertEquals("1,2,3", formatter.format("1,2,3")); - } + // emdash + Arguments.of("1--2", "1\u20142"), - @Test - public void formatPageNumbersPlusRange() { - assertEquals("43+", formatter.format("43+")); - } + // formatPageNumbersCommaSeparated + Arguments.of("1,2,3", "1,2,3"), - @Test - public void ignoreWhitespaceInPageNumbers() { - assertEquals("1--2", formatter.format(" 1 - 2 ")); - } + // formatPageNumbersPlusRange + Arguments.of("43+", "43+"), - @Test - public void removeWhitespaceSinglePage() { - assertEquals("1", formatter.format(" 1 ")); - } + // ignoreWhitespaceInPageNumbers + Arguments.of("1--2", " 1 - 2 "), - @Test - public void removeWhitespacePageRange() { - assertEquals("1--2", formatter.format(" 1 -- 2 ")); - } + // removeWhitespaceSinglePage + Arguments.of("1", " 1 "), - @Test - public void ignoreWhitespaceInPageNumbersWithDoubleDash() { - assertEquals("43--103", formatter.format("43 -- 103")); - } + // removeWhitespacePageRange + Arguments.of("1--2", " 1 -- 2 "), - @Test - public void keepCorrectlyFormattedPageNumbers() { - assertEquals("1--2", formatter.format("1--2")); - } + // ignoreWhitespaceInPageNumbersWithDoubleDash + Arguments.of("43--103", "43 -- 103"), - @Test - public void formatPageNumbersRemoveUnexpectedLiterals() { - assertEquals("1--2", formatter.format("{1}-{2}")); - } + // keepCorrectlyFormattedPageNumbers + Arguments.of("1--2", "1--2"), - @Test - public void formatPageNumbersRegexNotMatching() { - assertEquals("12", formatter.format("12")); - } + // three dashes get two dashes + Arguments.of("1--2", "1---2"), - @Test - public void doNotRemoveLetters() { - assertEquals("R1-R50", formatter.format("R1-R50")); - } + // formatPageNumbersRemoveUnexpectedLiterals + Arguments.of("1--2", "{1}-{2}"), - @Test - public void replaceLongDashWithDoubleDash() { - assertEquals("1--50", formatter.format("1 \u2014 50")); - } + // formatPageNumbersRegexNotMatching + Arguments.of("12", "12"), - @Test - public void removePagePrefix() { - assertEquals("50", formatter.format("p.50")); - } + // special case, where -- is also put into + Arguments.of("some--text", "some-text"), + Arguments.of("pages 1--50", "pages 1-50"), - @Test - public void removePagesPrefix() { - assertEquals("50", formatter.format("pp.50")); - } + // keep arbitrary text + Arguments.of("some-text-with-dashes", "some-text-with-dashes"), + Arguments.of("{A}", "{A}"), + Arguments.of("43+", "43+"), + Arguments.of("Invalid", "Invalid"), - @Test - public void formatACMPages() { - // This appears in https://doi.org/10.1145/1658373.1658375 - assertEquals("2:1--2:33", formatter.format("2:1-2:33")); - } + // doNotRemoveLetters + Arguments.of("R1--R50", "R1-R50"), + + // replaceLongDashWithDoubleDash + Arguments.of("1--50", "1 \u2014 50"), + + // removePagePrefix + Arguments.of("50", "p.50"), + + // removePagesPrefix + Arguments.of("50", "pp.50"), + + // keep & + Arguments.of("40&50", "40&50"), + + // formatACMPages + // This appears in https://doi.org/10.1145/1658373.1658375 + Arguments.of("2:1--2:33", "2:1-2:33"), + + // keepFormattedACMPages + // This appears in https://doi.org/10.1145/1658373.1658375 + Arguments.of("2:1--2:33", "2:1--2:33"), + + // formatExample + Arguments.of("1--2", new NormalizePagesFormatter().getExampleInput()), - @Test - public void keepFormattedACMPages() { - // This appears in https://doi.org/10.1145/1658373.1658375 - assertEquals("2:1--2:33", formatter.format("2:1--2:33")); + // replaceDashWithMinus + Arguments.of("R404--R405", "R404–R405")); } - @Test - public void formatExample() { - assertEquals("1--2", formatter.format(formatter.getExampleInput())); + @ParameterizedTest + @MethodSource("tests") + public void test(String expected, String input) { + assertEquals(expected, formatter.format(input)); } } diff --git a/src/test/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatterTest.java b/src/test/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatterTest.java new file mode 100644 index 00000000000..28b78052958 --- /dev/null +++ b/src/test/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatterTest.java @@ -0,0 +1,39 @@ +package org.jabref.logic.formatter.casechanger; + +import java.io.IOException; +import java.util.stream.Stream; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Tests in addition to the general tests from {@link org.jabref.logic.formatter.FormatterTest} + */ +public class UnprotectTermsFormatterTest { + + private UnprotectTermsFormatter formatter = new UnprotectTermsFormatter(); + + private static Stream terms() throws IOException { + return Stream.of( + Arguments.of("VLSI", "{VLSI}"), + Arguments.of("VLsI", "VLsI"), + Arguments.of("VLSI", "VLSI"), + Arguments.of("VLSI VLSI", "{VLSI} {VLSI}"), + Arguments.of("BPEL", "{BPEL}"), + Arguments.of("3GPP 3G", "{3GPP} {3G}"), + Arguments.of("{A} and {B}}", "{A} and {B}}"), + Arguments.of("Testing BPEL Engine Performance: A Survey", "{Testing BPEL Engine Performance: A Survey}"), + Arguments.of("Testing BPEL Engine Performance: A Survey", "Testing {BPEL} Engine Performance: A Survey"), + Arguments.of("Testing BPEL Engine Performance: A Survey", "{Testing {BPEL} Engine Performance: A Survey}"), + Arguments.of("In CDMA", new UnprotectTermsFormatter().getExampleInput())); + } + + @ParameterizedTest + @MethodSource("terms") + public void test(String expected, String toFormat) { + assertEquals(expected, formatter.format(toFormat)); + } +}