Skip to content

Commit

Permalink
Fix Normalize pages formatter not replacing dashes (#7243)
Browse files Browse the repository at this point in the history
Co-authored-by: Oliver Kopp <kopp.dev@gmail.com>
  • Loading branch information
Siedlerchr and koppor authored Jan 4, 2021
1 parent 5ba38da commit eca13dd
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 118 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve

### Fixed

- We fixed an issue where the "Normalize page numbers" formatter did not replace en-dashes or em-dashes with a hyphen-minus sign. [#7239](https://github.com/JabRef/jabref/issues/7239)
- We fixed an issue with the style of highlighted check boxes while searching in preferences. [#7226](https://github.com/JabRef/jabref/issues/7226)
- We fixed an issue where the option "Move file to file directory" was disabled in the entry editor for all files. [#7194](https://github.com/JabRef/jabref/issues/7194)
- We fixed an issue where application dialogs were opening in the wrong display when using multiple screens. [#7273](https://github.com/JabRef/jabref/pull/7273)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,41 @@
import java.util.regex.Pattern;

import org.jabref.logic.cleanup.Formatter;
import org.jabref.logic.formatter.casechanger.UnprotectTermsFormatter;
import org.jabref.logic.l10n.Localization;

import com.google.common.base.Strings;

/**
* This class includes sensible defaults for consistent formatting of BibTeX page numbers.
*
* <p>
* Format page numbers, separated either by commas or double-hyphens.
* Converts the range number format of the <code>pages</code> field to page_number--page_number.
* Removes unwanted literals except letters, numbers and -+ signs.
* Keeps the existing String if the resulting field does not match the expected Regex.
* <p>
* From BibTeX manual:
* One or more page numbers or range of numbers, such as 42--111 or 7,41,73--97 or 43+
* (the '+' in this last example indicates pages following that don't form a simple range).
* To make it easier to maintain Scribe-compatible databases, the standard styles convert
* a single dash (as in 7-33) to the double dash used in TEX to denote number ranges (as in 7--33).
* <p>
* Examples:
*
* <ul>
* <li><code>1-2 -> 1--2</code></li>
* <li><code>1---2 -> 1--2</code></li>
* <li><code>1&ndash;2 -> 1--2</code></li>
* <li><code>1,2,3 -> 1,2,3</code></li>
* <li><code>{1}-{2} -> 1--2</code></li>
* <li><code>43+ -> 43+</code></li>
* <li>Invalid -> Invalid</li>
* </ul>
*/
public class NormalizePagesFormatter extends Formatter {

// "startpage" and "endpage" are named groups. See http://stackoverflow.com/a/415635/873282 for documentation on named groups.
private static final Pattern PAGES_DETECT_PATTERN = Pattern.compile("\\A(?<startpage>(\\d+:)?\\d+)(?:-{1,2}(?<endpage>(\\d+:)?\\d+))?\\Z");
private static final Pattern EM_EN_DASH_PATTERN = Pattern.compile("\u2013|\u2014");
private static final Pattern DASHES_DETECT_PATTERN = Pattern.compile("[ ]*-+[ ]*");

private static final String REJECT_LITERALS = "[^a-zA-Z0-9,\\-\\+,:]";
private static final String PAGES_REPLACE_PATTERN = "${startpage}--${endpage}";
private static final String SINGLE_PAGE_REPLACE_PATTERN = "$1";
private final Formatter unprotectTermsFormatter = new UnprotectTermsFormatter();

@Override
public String getName() {
Expand All @@ -37,44 +51,31 @@ public String getKey() {
return "normalize_page_numbers";
}

/**
* Format page numbers, separated either by commas or double-hyphens.
* Converts the range number format of the <code>pages</code> field to page_number--page_number.
* Removes unwanted literals except letters, numbers and -+ signs.
* Keeps the existing String if the resulting field does not match the expected Regex.
*
* <example>
* 1-2 -> 1--2
* 1,2,3 -> 1,2,3
* {1}-{2} -> 1--2
* 43+ -> 43+
* Invalid -> Invalid
* </example>
*/
@Override
public String format(String value) {
Objects.requireNonNull(value);

if (value.isEmpty()) {
// nothing to do
return value;
}

value = value.trim();

// Remove pages prefix
String cleanValue = value.replace("pp.", "").replace("p.", "");
// remove unwanted literals including en dash, em dash, and whitespace
cleanValue = cleanValue.replaceAll("\u2013|\u2014", "-").replaceAll(REJECT_LITERALS, "");
// try to find pages pattern
Matcher matcher = PAGES_DETECT_PATTERN.matcher(cleanValue);
if (matcher.matches()) {
// replace
if (Strings.isNullOrEmpty(matcher.group("endpage"))) {
return matcher.replaceFirst(SINGLE_PAGE_REPLACE_PATTERN);
} else {
return matcher.replaceFirst(PAGES_REPLACE_PATTERN);
value = value.replace("pp.", "").replace("p.", "").trim();

// replace em and en dashes by --
value = EM_EN_DASH_PATTERN.matcher(value).replaceAll("--");

Matcher matcher = DASHES_DETECT_PATTERN.matcher(value);
if (matcher.find() && matcher.start() >= 0) {
String fixedValue = matcher.replaceFirst("--");
if (matcher.find()) {
// multiple occurrences --> better do no replacement
return value;
}
return unprotectTermsFormatter.format(fixedValue);
}
// no replacement

return value;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
import org.jabref.logic.protectedterms.ProtectedTermsLoader;
import org.jabref.logic.util.strings.StringLengthComparator;

/**
* Adds {} brackets around acronyms, month names and countries to preserve their case.
*
* Related formatter: {@link org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter}
*/
public class ProtectTermsFormatter extends Formatter {

private final ProtectedTermsLoader protectedTermsLoader;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package org.jabref.logic.formatter.casechanger;

import java.util.Objects;

import org.jabref.logic.cleanup.Formatter;
import org.jabref.logic.l10n.Localization;

/**
 * Formatter that strips every <code>{</code> and <code>}</code> character from a text,
 * provided that the braces are balanced.
 * <p>
 * If a closing brace appears before its opening brace, or if an opened brace is never
 * closed, the text is handed back unchanged.
 * <p>
 * Related formatter: {@link ProtectTermsFormatter}
 */
public class UnprotectTermsFormatter extends Formatter {

    @Override
    public String getName() {
        return Localization.lang("Unprotect terms");
    }

    @Override
    public String getKey() {
        return "unprotect_terms";
    }

    @Override
    public String getDescription() {
        return Localization.lang("Removes all balanced {} braces around words.");
    }

    @Override
    public String getExampleInput() {
        return "{In} {CDMA}";
    }

    /**
     * Removes all brace characters from the given text when they are balanced.
     *
     * @param text the text to process; must not be {@code null}
     * @return the text without any <code>{</code> / <code>}</code> characters, or the
     *         unmodified input if it is empty or its braces are not balanced
     * @throws NullPointerException if {@code text} is {@code null}
     */
    @Override
    public String format(String text) {
        // a similar brace-counting implementation exists at
        // org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter.hasNegativeBraceCount
        Objects.requireNonNull(text);
        if (text.isEmpty()) {
            return text;
        }

        StringBuilder stripped = new StringBuilder();
        int depth = 0;
        for (int position = 0; position < text.length(); position++) {
            char current = text.charAt(position);
            switch (current) {
                case '{':
                    depth++;
                    break;
                case '}':
                    depth--;
                    break;
                default:
                    stripped.append(current);
                    break;
            }
            if (depth < 0) {
                // a closing brace without a matching opening brace: stop scanning
                break;
            }
        }

        // any non-zero depth means the braces were unbalanced, so keep the input as it was
        return (depth == 0) ? stripped.toString() : text;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import org.jabref.logic.cleanup.Formatter;
import org.jabref.logic.l10n.Localization;

/**
* Converts all characters of the given string to upper case, but does not change words starting with "{"
*/
public class UpperCaseFormatter extends Formatter {

@Override
Expand All @@ -15,9 +18,6 @@ public String getKey() {
return "upper_case";
}

/**
* Converts all characters of the given string to upper case, but does not change words starting with "{"
*/
@Override
public String format(String input) {
Title title = new Title(input);
Expand Down
2 changes: 2 additions & 0 deletions src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -1430,6 +1430,7 @@ Add\ enclosing\ braces=Add enclosing braces
Add\ braces\ encapsulating\ the\ complete\ field\ content.=Add braces encapsulating the complete field content.
Remove\ enclosing\ braces=Remove enclosing braces
Removes\ braces\ encapsulating\ the\ complete\ field\ content.=Removes braces encapsulating the complete field content.
Removes\ all\ balanced\ {}\ braces\ around\ words.=Removes all balanced {} braces around words.
Shorten\ DOI=Shorten DOI
Shortens\ DOI\ to\ more\ human\ readable\ form.=Shortens DOI to more human readable form.
Sentence\ case=Sentence case
Expand Down Expand Up @@ -2283,5 +2284,6 @@ Regular\ expression=Regular expression
Error\ importing.\ See\ the\ error\ log\ for\ details.=Error importing. See the error log for details.
Unprotect\ terms=Unprotect terms
Error\ connecting\ to\ Writer\ document=Error connecting to Writer document
You\ need\ to\ open\ Writer\ with\ a\ document\ before\ connecting=You need to open Writer with a document before connecting
Loading

0 comments on commit eca13dd

Please sign in to comment.