From eca13ddb173ba8f49f6324935c90d0db1ccc0001 Mon Sep 17 00:00:00 2001
From: Christoph <siedlerkiller@gmail.com>
Date: Mon, 4 Jan 2021 17:37:23 +0100
Subject: [PATCH] Fix Normalize pages formatter not replacing dashes (#7243)

Co-authored-by: Oliver Kopp <kopp.dev@gmail.com>
---
 CHANGELOG.md                                  |   1 +
 .../bibtexfields/NormalizePagesFormatter.java |  73 +++++----
 .../casechanger/ProtectTermsFormatter.java    |   5 +
 .../casechanger/UnprotectTermsFormatter.java  |  63 +++++++
 .../casechanger/UpperCaseFormatter.java       |   6 +-
 src/main/resources/l10n/JabRef_en.properties  |   2 +
 .../NormalizePagesFormatterTest.java          | 154 +++++++++---------
 .../UnprotectTermsFormatterTest.java          |  39 +++++
 8 files changed, 225 insertions(+), 118 deletions(-)
 create mode 100644 src/main/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatter.java
 create mode 100644 src/test/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatterTest.java
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 32ce3adaeb0..9f1436c4c5b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
 
 ### Fixed
 
+- We fixed an issue where the "Normalize page numbers" formatter did not replace en-dashes or em-dashes with a hyphen-minus sign. [#7239](https://github.com/JabRef/jabref/issues/7239)
 - We fixed an issue with the style of highlighted check boxes while searching in preferences. [#7226](https://github.com/JabRef/jabref/issues/7226)
 - We fixed an issue where the option "Move file to file directory" was disabled in the entry editor for all files [#7194](https://github.com/JabRef/jabref/issues/7194)
 - We fixed an issue where application dialogs were opening in the wrong display when using multiple screens [#7273](https://github.com/JabRef/jabref/pull/7273)
diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatter.java
index 8a9163b73f5..8bdb2aef9b4 100644
--- a/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatter.java
+++ b/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatter.java
@@ -5,27 +5,41 @@
 import java.util.regex.Pattern;
 
 import org.jabref.logic.cleanup.Formatter;
+import org.jabref.logic.formatter.casechanger.UnprotectTermsFormatter;
 import org.jabref.logic.l10n.Localization;
 
-import com.google.common.base.Strings;
-
 /**
  * This class includes sensible defaults for consistent formatting of BibTeX page numbers.
- *
+ * <p>
+ * Format page numbers, separated either by commas or double-hyphens.
+ * Converts the range number format of the <code>pages</code> field to page_number--page_number.
+ * Removes unwanted literals except letters, numbers and -+ signs.
+ * Keeps the existing String if the resulting field does not match the expected Regex.
+ * <p>
  * From BibTeX manual:
  * One or more page numbers or range of numbers, such as 42--111 or 7,41,73--97 or 43+
  * (the '+' in this last example indicates pages following that don't form a simple range).
  * To make it easier to maintain Scribe-compatible databases, the standard styles convert
  * a single dash (as in 7-33) to the double dash used in TEX to denote number ranges (as in 7--33).
+ * <p>
+ * Examples:
+ *
+ * <ul>
+ *     <li><code>1-2 -> 1--2</code></li>
+ *     <li><code>1---2 -> 1--2</code></li>
+ *     <li><code>1-2 -> 1--2</code></li>
+ *     <li><code>1,2,3 -> 1,2,3</code></li>
+ *     <li><code>{1}-{2} -> 1--2</code></li>
+ *     <li><code>43+ -> 43+</code></li>
+ *     <li>Invalid -> Invalid</li>
+ * </ul>
  */
 public class NormalizePagesFormatter extends Formatter {
 
-    // "startpage" and "endpage" are named groups. See http://stackoverflow.com/a/415635/873282 for a documentation
-    private static final Pattern PAGES_DETECT_PATTERN = Pattern.compile("\\A(?<startpage>(\\d+:)?\\d+)(?:-{1,2}(?<endpage>(\\d+:)?\\d+))?\\Z");
+    private static final Pattern EM_EN_DASH_PATTERN = Pattern.compile("\u2013|\u2014");
+    private static final Pattern DASHES_DETECT_PATTERN = Pattern.compile("[ ]*-+[ ]*");
 
-    private static final String REJECT_LITERALS = "[^a-zA-Z0-9,\\-\\+,:]";
-    private static final String PAGES_REPLACE_PATTERN = "${startpage}--${endpage}";
-    private static final String SINGLE_PAGE_REPLACE_PATTERN = "$1";
+    private final Formatter unprotectTermsFormatter = new UnprotectTermsFormatter();
 
     @Override
     public String getName() {
@@ -37,44 +51,31 @@ public String getKey() {
         return "normalize_page_numbers";
     }
 
-    /**
-     * Format page numbers, separated either by commas or double-hyphens.
-     * Converts the range number format of the <code>pages</code> field to page_number--page_number.
-     * Removes unwanted literals except letters, numbers and -+ signs.
-     * Keeps the existing String if the resulting field does not match the expected Regex.
-     *
-     * <example>
-     *     1-2 -> 1--2
-     *     1,2,3 -> 1,2,3
-     *     {1}-{2} -> 1--2
-     *     43+ -> 43+
-     *     Invalid -> Invalid
-     * </example>
-     */
     @Override
     public String format(String value) {
         Objects.requireNonNull(value);
-
         if (value.isEmpty()) {
-            // nothing to do
             return value;
         }
 
+        value = value.trim();
+
         // Remove pages prefix
-        String cleanValue = value.replace("pp.", "").replace("p.", "");
-        // remove unwanted literals including en dash, em dash, and whitespace
-        cleanValue = cleanValue.replaceAll("\u2013|\u2014", "-").replaceAll(REJECT_LITERALS, "");
-        // try to find pages pattern
-        Matcher matcher = PAGES_DETECT_PATTERN.matcher(cleanValue);
-        if (matcher.matches()) {
-            // replace
-            if (Strings.isNullOrEmpty(matcher.group("endpage"))) {
-                return matcher.replaceFirst(SINGLE_PAGE_REPLACE_PATTERN);
-            } else {
-                return matcher.replaceFirst(PAGES_REPLACE_PATTERN);
+        value = value.replace("pp.", "").replace("p.", "").trim();
+
+        // replace em and en dashes by --
+        value = EM_EN_DASH_PATTERN.matcher(value).replaceAll("--");
+
+        Matcher matcher = DASHES_DETECT_PATTERN.matcher(value);
+        if (matcher.find() && matcher.start() >= 0) {
+            String fixedValue = matcher.replaceFirst("--");
+            if (matcher.find()) {
+                // multiple occurrences --> better do no replacement
+                return value;
             }
+            return unprotectTermsFormatter.format(fixedValue);
         }
-        // no replacement
+
         return value;
     }
 
diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/ProtectTermsFormatter.java b/src/main/java/org/jabref/logic/formatter/casechanger/ProtectTermsFormatter.java
index 19280858b36..3a933130f3b 100644
--- a/src/main/java/org/jabref/logic/formatter/casechanger/ProtectTermsFormatter.java
+++ b/src/main/java/org/jabref/logic/formatter/casechanger/ProtectTermsFormatter.java
@@ -8,6 +8,11 @@
 import org.jabref.logic.protectedterms.ProtectedTermsLoader;
 import org.jabref.logic.util.strings.StringLengthComparator;
 
+/**
+ * Adds {} brackets around acronyms, month names and countries to preserve their case.
+ *
+ * Related formatter: {@link org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter}
+ */
 public class ProtectTermsFormatter extends Formatter {
 
     private final ProtectedTermsLoader protectedTermsLoader;
diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatter.java b/src/main/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatter.java
new file mode 100644
index 00000000000..cc2d5ab2096
--- /dev/null
+++ b/src/main/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatter.java
@@ -0,0 +1,63 @@
+package org.jabref.logic.formatter.casechanger;
+
+import java.util.Objects;
+
+import org.jabref.logic.cleanup.Formatter;
+import org.jabref.logic.l10n.Localization;
+
+/**
+ * Remove {} braces around words in case they appear balanced
+ *
+ * Related formatter: {@link ProtectTermsFormatter}
+ */
+public class UnprotectTermsFormatter extends Formatter {
+
+    @Override
+    public String format(String text) {
+        // similar implementation at {@link org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter.hasNegativeBraceCount}
+        Objects.requireNonNull(text);
+        if (text.isEmpty()) {
+            return text;
+        }
+        StringBuilder result = new StringBuilder();
+        int level = 0;
+        int index = 0;
+        do {
+            char charAtIndex = text.charAt(index);
+            if (charAtIndex == '{') {
+                level++;
+            } else if (charAtIndex == '}') {
+                level--;
+            } else {
+                result.append(charAtIndex);
+            }
+            index++;
+        } while (index < text.length() && level >= 0);
+        if (level != 0) {
+            // in case of unbalanced braces, the original text is returned unmodified
+            return text;
+        }
+        return result.toString();
+    }
+
+    @Override
+    public String getDescription() {
+        return Localization.lang(
+                "Removes all balanced {} braces around words.");
+    }
+
+    @Override
+    public String getExampleInput() {
+        return "{In} {CDMA}";
+    }
+
+    @Override
+    public String getName() {
+        return Localization.lang("Unprotect terms");
+    }
+
+    @Override
+    public String getKey() {
+        return "unprotect_terms";
+    }
+}
diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/UpperCaseFormatter.java b/src/main/java/org/jabref/logic/formatter/casechanger/UpperCaseFormatter.java
index 59f7e737f85..54fb76f40be 100644
--- a/src/main/java/org/jabref/logic/formatter/casechanger/UpperCaseFormatter.java
+++ b/src/main/java/org/jabref/logic/formatter/casechanger/UpperCaseFormatter.java
@@ -3,6 +3,9 @@
 import org.jabref.logic.cleanup.Formatter;
 import org.jabref.logic.l10n.Localization;
 
+/**
+ * Converts all characters of the given string to upper case, but does not change words starting with "{"
+ */
 public class UpperCaseFormatter extends Formatter {
 
     @Override
@@ -15,9 +18,6 @@ public String getKey() {
         return "upper_case";
     }
 
-    /**
-     * Converts all characters of the given string to upper case, but does not change words starting with "{"
-     */
     @Override
     public String format(String input) {
         Title title = new Title(input);
diff --git a/src/main/resources/l10n/JabRef_en.properties b/src/main/resources/l10n/JabRef_en.properties
index 8b5b30bc65b..a402c9325b1 100644
--- a/src/main/resources/l10n/JabRef_en.properties
+++ b/src/main/resources/l10n/JabRef_en.properties
@@ -1430,6 +1430,7 @@ Add\ enclosing\ braces=Add enclosing braces
 Add\ braces\ encapsulating\ the\ complete\ field\ content.=Add braces encapsulating the complete field content.
 Remove\ enclosing\ braces=Remove enclosing braces
 Removes\ braces\ encapsulating\ the\ complete\ field\ content.=Removes braces encapsulating the complete field content.
+Removes\ all\ balanced\ {}\ braces\ around\ words.=Removes all balanced {} braces around words.
 Shorten\ DOI=Shorten DOI
 Shortens\ DOI\ to\ more\ human\ readable\ form.=Shortens DOI to more human readable form.
 Sentence\ case=Sentence case
@@ -2283,5 +2284,6 @@ Regular\ expression=Regular expression
 
 Error\ importing.\ See\ the\ error\ log\ for\ details.=Error importing. See the error log for details.
 
+Unprotect\ terms=Unprotect terms
 Error\ connecting\ to\ Writer\ document=Error connecting to Writer document
 You\ need\ to\ open\ Writer\ with\ a\ document\ before\ connecting=You need to open Writer with a document before connecting
diff --git a/src/test/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatterTest.java b/src/test/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatterTest.java
index b0b019ddbe0..1281c8a5202 100644
--- a/src/test/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatterTest.java
+++ b/src/test/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatterTest.java
@@ -1,7 +1,10 @@
 package org.jabref.logic.formatter.bibtexfields;
 
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
+import java.util.stream.Stream;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
@@ -10,102 +13,95 @@
  */
 public class NormalizePagesFormatterTest {
 
-    private NormalizePagesFormatter formatter;
+    private final NormalizePagesFormatter formatter = new NormalizePagesFormatter();
 
-    @BeforeEach
-    public void setUp() {
-        formatter = new NormalizePagesFormatter();
-    }
+    private static Stream<Arguments> tests() {
+        return Stream.of(
+                // formatSinglePageResultsInNoChange
+                Arguments.of("1", "1"),
 
-    @Test
-    public void formatSinglePageResultsInNoChange() {
-        assertEquals("1", formatter.format("1"));
-    }
+                // formatPageNumbers
+                Arguments.of("1--2", "1-2"),
 
-    @Test
-    public void formatPageNumbers() {
-        assertEquals("1--2", formatter.format("1-2"));
-    }
+                // endash
+                Arguments.of("1--2", "1\u20132"),
 
-    @Test
-    public void formatPageNumbersCommaSeparated() {
-        assertEquals("1,2,3", formatter.format("1,2,3"));
-    }
+                // emdash
+                Arguments.of("1--2", "1\u20142"),
 
-    @Test
-    public void formatPageNumbersPlusRange() {
-        assertEquals("43+", formatter.format("43+"));
-    }
+                // formatPageNumbersCommaSeparated
+                Arguments.of("1,2,3", "1,2,3"),
 
-    @Test
-    public void ignoreWhitespaceInPageNumbers() {
-        assertEquals("1--2", formatter.format("   1  - 2 "));
-    }
+                // formatPageNumbersPlusRange
+                Arguments.of("43+", "43+"),
 
-    @Test
-    public void removeWhitespaceSinglePage() {
-        assertEquals("1", formatter.format("   1  "));
-    }
+                // ignoreWhitespaceInPageNumbers
+                Arguments.of("1--2", "   1  - 2 "),
 
-    @Test
-    public void removeWhitespacePageRange() {
-        assertEquals("1--2", formatter.format("   1 -- 2  "));
-    }
+                // removeWhitespaceSinglePage
+                Arguments.of("1", "   1  "),
 
-    @Test
-    public void ignoreWhitespaceInPageNumbersWithDoubleDash() {
-        assertEquals("43--103", formatter.format("43 -- 103"));
-    }
+                // removeWhitespacePageRange
+                Arguments.of("1--2", "   1 -- 2  "),
 
-    @Test
-    public void keepCorrectlyFormattedPageNumbers() {
-        assertEquals("1--2", formatter.format("1--2"));
-    }
+                // ignoreWhitespaceInPageNumbersWithDoubleDash
+                Arguments.of("43--103", "43 -- 103"),
 
-    @Test
-    public void formatPageNumbersRemoveUnexpectedLiterals() {
-        assertEquals("1--2", formatter.format("{1}-{2}"));
-    }
+                // keepCorrectlyFormattedPageNumbers
+                Arguments.of("1--2", "1--2"),
 
-    @Test
-    public void formatPageNumbersRegexNotMatching() {
-        assertEquals("12", formatter.format("12"));
-    }
+                // three dashes get two dashes
+                Arguments.of("1--2", "1---2"),
 
-    @Test
-    public void doNotRemoveLetters() {
-        assertEquals("R1-R50", formatter.format("R1-R50"));
-    }
+                // formatPageNumbersRemoveUnexpectedLiterals
+                Arguments.of("1--2", "{1}-{2}"),
 
-    @Test
-    public void replaceLongDashWithDoubleDash() {
-        assertEquals("1--50", formatter.format("1 \u2014 50"));
-    }
+                // formatPageNumbersRegexNotMatching
+                Arguments.of("12", "12"),
 
-    @Test
-    public void removePagePrefix() {
-        assertEquals("50", formatter.format("p.50"));
-    }
+                // special case, where -- is also put into
+                Arguments.of("some--text", "some-text"),
+                Arguments.of("pages 1--50", "pages 1-50"),
 
-    @Test
-    public void removePagesPrefix() {
-        assertEquals("50", formatter.format("pp.50"));
-    }
+                // keep arbitrary text
+                Arguments.of("some-text-with-dashes", "some-text-with-dashes"),
+                Arguments.of("{A}", "{A}"),
+                Arguments.of("43+", "43+"),
+                Arguments.of("Invalid", "Invalid"),
 
-    @Test
-    public void formatACMPages() {
-        // This appears in https://doi.org/10.1145/1658373.1658375
-        assertEquals("2:1--2:33", formatter.format("2:1-2:33"));
-    }
+                // doNotRemoveLetters
+                Arguments.of("R1--R50", "R1-R50"),
+
+                // replaceLongDashWithDoubleDash
+                Arguments.of("1--50", "1 \u2014 50"),
+
+                // removePagePrefix
+                Arguments.of("50", "p.50"),
+
+                // removePagesPrefix
+                Arguments.of("50", "pp.50"),
+
+                // keep &
+                Arguments.of("40&50", "40&50"),
+
+                // formatACMPages
+                // This appears in https://doi.org/10.1145/1658373.1658375
+                Arguments.of("2:1--2:33", "2:1-2:33"),
+
+                // keepFormattedACMPages
+                // This appears in https://doi.org/10.1145/1658373.1658375
+                Arguments.of("2:1--2:33", "2:1--2:33"),
+
+                // formatExample
+                Arguments.of("1--2", new NormalizePagesFormatter().getExampleInput()),
 
-    @Test
-    public void keepFormattedACMPages() {
-        // This appears in https://doi.org/10.1145/1658373.1658375
-        assertEquals("2:1--2:33", formatter.format("2:1--2:33"));
+                // replaceDashWithMinus
+                Arguments.of("R404--R405", "R404–R405"));
     }
 
-    @Test
-    public void formatExample() {
-        assertEquals("1--2", formatter.format(formatter.getExampleInput()));
+    @ParameterizedTest
+    @MethodSource("tests")
+    public void test(String expected, String input) {
+        assertEquals(expected, formatter.format(input));
     }
 }
diff --git a/src/test/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatterTest.java b/src/test/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatterTest.java
new file mode 100644
index 00000000000..28b78052958
--- /dev/null
+++ b/src/test/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatterTest.java
@@ -0,0 +1,39 @@
+package org.jabref.logic.formatter.casechanger;
+
+import java.io.IOException;
+import java.util.stream.Stream;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * Tests in addition to the general tests from {@link org.jabref.logic.formatter.FormatterTest}
+ */
+public class UnprotectTermsFormatterTest {
+
+    private UnprotectTermsFormatter formatter = new UnprotectTermsFormatter();
+
+    private static Stream<Arguments> terms() throws IOException {
+        return Stream.of(
+                Arguments.of("VLSI", "{VLSI}"),
+                Arguments.of("VLsI", "VLsI"),
+                Arguments.of("VLSI", "VLSI"),
+                Arguments.of("VLSI VLSI", "{VLSI} {VLSI}"),
+                Arguments.of("BPEL", "{BPEL}"),
+                Arguments.of("3GPP 3G", "{3GPP} {3G}"),
+                Arguments.of("{A} and {B}}", "{A} and {B}}"),
+                Arguments.of("Testing BPEL Engine Performance: A Survey", "{Testing BPEL Engine Performance: A Survey}"),
+                Arguments.of("Testing BPEL Engine Performance: A Survey", "Testing {BPEL} Engine Performance: A Survey"),
+                Arguments.of("Testing BPEL Engine Performance: A Survey", "{Testing {BPEL} Engine Performance: A Survey}"),
+                Arguments.of("In CDMA", new UnprotectTermsFormatter().getExampleInput()));
+    }
+
+    @ParameterizedTest
+    @MethodSource("terms")
+    public void test(String expected, String toFormat) {
+        assertEquals(expected, formatter.format(toFormat));
+    }
+}