Fix Normalize pages formatter not replacing dashes (#7243)

Co-authored-by: Oliver Kopp <kopp.dev@gmail.com>
JabRef · Jan 4, 2021 · eca13dd · eca13dd
1 parent 5ba38da
commit eca13dd
Show file tree

Hide file tree

Showing 8 changed files with 225 additions and 118 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
 
 ### Fixed
 
+- We fixed an issue where the "Normalize page numbers" formatter did not replace en-dashes or em-dashes with a hyphen-minus sign. [#7239](https://github.com/JabRef/jabref/issues/7239)
 - We fixed an issue with the style of highlighted check boxes while searching in preferences. [#7226](https://github.com/JabRef/jabref/issues/7226)
 - We fixed an issue where the option "Move file to file directory" was disabled in the entry editor for all files [#7194](https://github.com/JabRef/jabref/issues/7194)
 - We fixed an issue where application dialogs were opening in the wrong display when using multiple screens [#7273](https://github.com/JabRef/jabref/pull/7273)

diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizePagesFormatter.java
@@ -5,27 +5,41 @@
 import java.util.regex.Pattern;
 
 import org.jabref.logic.cleanup.Formatter;
+import org.jabref.logic.formatter.casechanger.UnprotectTermsFormatter;
 import org.jabref.logic.l10n.Localization;
 
-import com.google.common.base.Strings;
-
 /**
  * This class includes sensible defaults for consistent formatting of BibTeX page numbers.
- *
+ * <p>
+ * Format page numbers, separated either by commas or double-hyphens.
+ * Converts the range number format of the <code>pages</code> field to page_number--page_number.
+ * Strips the <code>pp.</code> and <code>p.</code> markers and, when a range is rewritten, removes balanced {} braces.
+ * Does not rewrite the range if the value contains no hyphen or more than one run of hyphens.
+ * <p>
  * From BibTeX manual:
  * One or more page numbers or range of numbers, such as 42--111 or 7,41,73--97 or 43+
  * (the '+' in this last example indicates pages following that don't form a simple range).
  * To make it easier to maintain Scribe-compatible databases, the standard styles convert
  * a single dash (as in 7-33) to the double dash used in TEX to denote number ranges (as in 7--33).
+ * <p>
+ * Examples:
+ *
+ * <ul>
+ *     <li><code>1-2 -> 1--2</code></li>
+ *     <li><code>1---2 -> 1--2</code></li>
+ *     <li><code>1–2 -> 1--2</code></li>
+ *     <li><code>1,2,3 -> 1,2,3</code></li>
+ *     <li><code>{1}-{2} -> 1--2</code></li>
+ *     <li><code>43+ -> 43+</code></li>
+ *     <li>Invalid -> Invalid</li>
+ * </ul>
  */
 public class NormalizePagesFormatter extends Formatter {
 
-    // "startpage" and "endpage" are named groups. See http://stackoverflow.com/a/415635/873282 for a documentation
-    private static final Pattern PAGES_DETECT_PATTERN = Pattern.compile("\\A(?<startpage>(\\d+:)?\\d+)(?:-{1,2}(?<endpage>(\\d+:)?\\d+))?\\Z");
+    private static final Pattern EM_EN_DASH_PATTERN = Pattern.compile("\u2013|\u2014");
+    private static final Pattern DASHES_DETECT_PATTERN = Pattern.compile("[ ]*-+[ ]*");
 
-    private static final String REJECT_LITERALS = "[^a-zA-Z0-9,\\-\\+,:]";
-    private static final String PAGES_REPLACE_PATTERN = "${startpage}--${endpage}";
-    private static final String SINGLE_PAGE_REPLACE_PATTERN = "$1";
+    private final Formatter unprotectTermsFormatter = new UnprotectTermsFormatter();
 
     @Override
     public String getName() {
@@ -37,44 +51,31 @@ public String getKey() {
         return "normalize_page_numbers";
     }
 
-    /**
-     * Format page numbers, separated either by commas or double-hyphens.
-     * Converts the range number format of the <code>pages</code> field to page_number--page_number.
-     * Removes unwanted literals except letters, numbers and -+ signs.
-     * Keeps the existing String if the resulting field does not match the expected Regex.
-     *
-     * <example>
-     *     1-2 -> 1--2
-     *     1,2,3 -> 1,2,3
-     *     {1}-{2} -> 1--2
-     *     43+ -> 43+
-     *     Invalid -> Invalid
-     * </example>
-     */
     @Override
     public String format(String value) {
         Objects.requireNonNull(value);
-
         if (value.isEmpty()) {
-            // nothing to do
             return value;
         }
 
+        value = value.trim();
+
         // Remove pages prefix
-        String cleanValue = value.replace("pp.", "").replace("p.", "");
-        // remove unwanted literals including en dash, em dash, and whitespace
-        cleanValue = cleanValue.replaceAll("\u2013|\u2014", "-").replaceAll(REJECT_LITERALS, "");
-        // try to find pages pattern
-        Matcher matcher = PAGES_DETECT_PATTERN.matcher(cleanValue);
-        if (matcher.matches()) {
-            // replace
-            if (Strings.isNullOrEmpty(matcher.group("endpage"))) {
-                return matcher.replaceFirst(SINGLE_PAGE_REPLACE_PATTERN);
-            } else {
-                return matcher.replaceFirst(PAGES_REPLACE_PATTERN);
+        value = value.replace("pp.", "").replace("p.", "").trim();
+
+        // replace em and en dashes by --
+        value = EM_EN_DASH_PATTERN.matcher(value).replaceAll("--");
+
+        Matcher matcher = DASHES_DETECT_PATTERN.matcher(value);
+        if (matcher.find() && matcher.start() >= 0) {
+            String fixedValue = matcher.replaceFirst("--");
+            if (matcher.find()) {
+                // multiple occurrences --> ambiguous; return the value as cleaned so far (no range rewrite, no brace removal)
+                return value;
             }
+            return unprotectTermsFormatter.format(fixedValue);
         }
-        // no replacement
+
         return value;
     }
 

diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/ProtectTermsFormatter.java b/src/main/java/org/jabref/logic/formatter/casechanger/ProtectTermsFormatter.java
@@ -8,6 +8,11 @@
 import org.jabref.logic.protectedterms.ProtectedTermsLoader;
 import org.jabref.logic.util.strings.StringLengthComparator;
 
+/**
+ * Adds {} brackets around acronyms, month names and countries to preserve their case.
+ *
+ * Related formatter: {@link org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter}
+ */
 public class ProtectTermsFormatter extends Formatter {
 
     private final ProtectedTermsLoader protectedTermsLoader;

diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatter.java b/src/main/java/org/jabref/logic/formatter/casechanger/UnprotectTermsFormatter.java
@@ -0,0 +1,63 @@
+package org.jabref.logic.formatter.casechanger;
+
+import java.util.Objects;
+
+import org.jabref.logic.cleanup.Formatter;
+import org.jabref.logic.l10n.Localization;
+
+/**
+ * Removes all {} braces from the text if they are balanced; otherwise the text is returned unchanged.
+ *
+ * Related formatter: {@link ProtectTermsFormatter}
+ */
+public class UnprotectTermsFormatter extends Formatter {
+
+    @Override
+    public String format(String text) {
+        // similar implementation at {@link org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter#hasNegativeBraceCount}
+        Objects.requireNonNull(text);
+        if (text.isEmpty()) {
+            return text;
+        }
+        StringBuilder result = new StringBuilder();
+        int level = 0;
+        int index = 0;
+        do {
+            char charAtIndex = text.charAt(index);
+            if (charAtIndex == '{') {
+                level++;
+            } else if (charAtIndex == '}') {
+                level--;
+            } else {
+                result.append(charAtIndex);
+            }
+            index++;
+        } while (index < text.length() && level >= 0);
+        if (level != 0) {
+            // in case of unbalanced braces, the original text is returned unmodified
+            return text;
+        }
+        return result.toString();
+    }
+
+    @Override
+    public String getDescription() {
+        return Localization.lang(
+                "Removes all balanced {} braces around words.");
+    }
+
+    @Override
+    public String getExampleInput() {
+        return "{In} {CDMA}";
+    }
+
+    @Override
+    public String getName() {
+        return Localization.lang("Unprotect terms");
+    }
+
+    @Override
+    public String getKey() {
+        return "unprotect_terms";
+    }
+}
diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/UpperCaseFormatter.java b/src/main/java/org/jabref/logic/formatter/casechanger/UpperCaseFormatter.java
@@ -3,6 +3,9 @@
 import org.jabref.logic.cleanup.Formatter;
 import org.jabref.logic.l10n.Localization;
 
+/**
+ * Converts all characters of the given string to upper case, but does not change words starting with "{"
+ */
 public class UpperCaseFormatter extends Formatter {
 
     @Override
@@ -15,9 +18,6 @@ public String getKey() {
         return "upper_case";
     }
 
-    /**
-     * Converts all characters of the given string to upper case, but does not change words starting with "{"
-     */
     @Override
     public String format(String input) {
         Title title = new Title(input);

diff --git a/src/main/resources/l10n/JabRef_en.properties b/src/main/resources/l10n/JabRef_en.properties
@@ -1430,6 +1430,7 @@ Add\ enclosing\ braces=Add enclosing braces
 Add\ braces\ encapsulating\ the\ complete\ field\ content.=Add braces encapsulating the complete field content.
 Remove\ enclosing\ braces=Remove enclosing braces
 Removes\ braces\ encapsulating\ the\ complete\ field\ content.=Removes braces encapsulating the complete field content.
+Removes\ all\ balanced\ {}\ braces\ around\ words.=Removes all balanced {} braces around words.
 Shorten\ DOI=Shorten DOI
 Shortens\ DOI\ to\ more\ human\ readable\ form.=Shortens DOI to more human readable form.
 Sentence\ case=Sentence case
@@ -2283,5 +2284,6 @@ Regular\ expression=Regular expression
 
 Error\ importing.\ See\ the\ error\ log\ for\ details.=Error importing. See the error log for details.
 
+Unprotect\ terms=Unprotect terms
 Error\ connecting\ to\ Writer\ document=Error connecting to Writer document
 You\ need\ to\ open\ Writer\ with\ a\ document\ before\ connecting=You need to open Writer with a document before connecting