From d0abcfe34fda67224bd92b7b5096a34072e55c9f Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Fri, 21 Apr 2023 15:15:43 +0200 Subject: [PATCH] hunspell: disallow hidden title-case entries from compound middle/end (#12220) if we only have custom-case uART and capitalized UART, we shouldn't accept StandUart as a compound (although we keep hidden "Uart" dictionary entries for internal purposes) --- lucene/CHANGES.txt | 2 + .../lucene/analysis/hunspell/Hunspell.java | 4 +- .../analysis/hunspell/TestSpellChecking.java | 4 ++ .../analysis/hunspell/germanManualCase.aff | 51 +++++++++++++++++++ .../analysis/hunspell/germanManualCase.dic | 5 ++ .../analysis/hunspell/germanManualCase.good | 3 ++ .../analysis/hunspell/germanManualCase.wrong | 3 ++ 7 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.good create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 05d6b6425a4a..9a8bf4b63643 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -103,6 +103,8 @@ Documentation * GITHUB#10633: Update javadocs in TestBackwardsCompatibility to use gradle and not ant. 
(Usman Shaikh) +* GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end + Other --------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java index 998bfc774291..1e2a1add13cd 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java @@ -164,7 +164,7 @@ && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) { Root findStem( char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) { checkCanceled.run(); - boolean checkCase = context != COMPOUND_MIDDLE && context != COMPOUND_END; + WordCase toCheck = context != COMPOUND_MIDDLE && context != COMPOUND_END ? originalCase : null; @SuppressWarnings({"rawtypes", "unchecked"}) Root[] result = new Root[1]; stemmer.doStem( @@ -173,7 +173,7 @@ Root findStem( length, context, (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> { - if (checkCase && !acceptCase(originalCase, formID, stem)) { + if (!acceptCase(toCheck, formID, stem)) { return dictionary.hasFlag(formID, Dictionary.HIDDEN_FLAG); } if (acceptsStem(formID)) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java index ced26c943954..d05ccea33ae2 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java @@ -205,6 +205,10 @@ public void testGermanCompounding() throws Exception { doTest("germancompounding"); } + public void testGermanManualCase() throws Exception { + doTest("germanManualCase"); + } + public void 
testApplyOconvToSuggestions() throws Exception { doTest("oconv"); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.aff new file mode 100644 index 000000000000..2946e76f50ee --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.aff @@ -0,0 +1,51 @@ +# no CHECKCOMPOUNDCASE + +# compound flags + +COMPOUNDBEGIN U +COMPOUNDMIDDLE V +COMPOUNDEND W + +ONLYINCOMPOUND X +COMPOUNDPERMITFLAG P + +COMPOUNDMIN 1 +WORDCHARS - + +# dash prefix for compounds with dash (Arbeits-Computer) + +PFX - Y 1 +PFX - 0 -/P . + +# decapitalizing prefix + +PFX D Y 29 +PFX D A a/PX A +PFX D Ä ä/PX Ä +PFX D B b/PX B +PFX D C c/PX C +PFX D D d/PX D +PFX D E e/PX E +PFX D F f/PX F +PFX D G g/PX G +PFX D H h/PX H +PFX D I i/PX I +PFX D J j/PX J +PFX D K k/PX K +PFX D L l/PX L +PFX D M m/PX M +PFX D N n/PX N +PFX D O o/PX O +PFX D Ö ö/PX Ö +PFX D P p/PX P +PFX D Q q/PX Q +PFX D R r/PX R +PFX D S s/PX S +PFX D T t/PX T +PFX D U u/PX U +PFX D Ü ü/PX Ü 
+PFX D V v/PX V +PFX D W w/PX W +PFX D X x/PX X +PFX D Y y/PX Y +PFX D Z z/PX Z diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic new file mode 100644 index 000000000000..5e075003c9a9 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic @@ -0,0 +1,5 @@ +4 +uART/XW- +bein/XW- +Stand/UX +UART/- \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.good new file mode 100644 index 000000000000..27c6941024fc --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.good @@ -0,0 +1,3 @@ +UART +Standbein +Stand-uART diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong new file mode 100644 index 000000000000..c3ce031400c8 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong @@ -0,0 +1,3 @@ +StandUart +uART +Uart