From 4842e0c9cab35239b6d1da619b94a640e7227510 Mon Sep 17 00:00:00 2001
From: Peter Gromov <peter@jetbrains.com>
Date: Fri, 5 Mar 2021 14:34:36 +0100
Subject: [PATCH 1/8] LUCENE-9825: Hunspell: reverse the "words" trie for
 faster word lookup/suggestions

---
 .../lucene/analysis/hunspell/Dictionary.java  | 123 +------
 .../hunspell/GeneratingSuggester.java         |  88 ++---
 .../analysis/hunspell/ModifyingSuggester.java |   2 +
 .../lucene/analysis/hunspell/Stemmer.java     |   4 +
 .../lucene/analysis/hunspell/WordStorage.java | 338 ++++++++++++++++++
 .../hunspell/TestAllDictionaries.java         |   3 +-
 .../analysis/hunspell/TestPerformance.java    |   6 +-
 7 files changed, 396 insertions(+), 168 deletions(-)
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 450c77da942a..2672addd88c0 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -16,7 +16,8 @@
  */
 package org.apache.lucene.analysis.hunspell;
 
-import static org.apache.lucene.analysis.hunspell.AffixKind.*;
+import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
+import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
 
 import java.io.BufferedInputStream;
 import java.io.BufferedReader;
@@ -53,8 +54,6 @@
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
@@ -91,14 +90,8 @@ public class Dictionary {
    */
   ArrayList<AffixCondition> patterns = new ArrayList<>();
 
-  /**
-   * The entries in the .dic file, mapping to their set of flags. the fst output is the ordinal list
-   * for flagLookup.
-   */
-  FST<IntsRef> words;
-
-  /** A Bloom filter over {@link #words} to avoid unnecessary expensive FST traversals */
-  FixedBitSet wordHashes;
+  /** The entries in the .dic file, mapping to their set of flags */
+  WordStorage words;
 
   /**
    * The list of unique flagsets (wordforms). theoretically huge, but practically small (for Polish
@@ -257,9 +250,8 @@ public void close() {
       // read dictionary entries
       IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
       int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
-      wordHashes = new FixedBitSet(Integer.highestOneBit(wordCount * 10));
       String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
-      words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator);
+      words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator, wordCount);
       flagLookup = flagEnumerator.finish();
       aliases = null; // no longer needed
       morphAliases = null; // no longer needed
@@ -272,36 +264,27 @@ int formStep() {
 
   /** Looks up Hunspell word forms from the dictionary */
   IntsRef lookupWord(char[] word, int offset, int length) {
-    int hash = CharsRef.stringHashCode(word, offset, length);
-    if (!wordHashes.get(Math.abs(hash) % wordHashes.length())) {
-      return null;
-    }
-
-    return lookup(words, word, offset, length);
+    return words.lookupWord(word, offset, length);
   }
 
   // only for testing
   IntsRef lookupPrefix(char[] word) {
-    return lookup(prefixes, word, 0, word.length);
+    return lookup(prefixes, word);
   }
 
   // only for testing
   IntsRef lookupSuffix(char[] word) {
-    return lookup(suffixes, word, 0, word.length);
+    return lookup(suffixes, word);
   }
 
-  IntsRef lookup(FST<IntsRef> fst, char[] word, int offset, int length) {
-    if (fst == null) {
-      return null;
-    }
+  private IntsRef lookup(FST<IntsRef> fst, char[] word) {
     final FST.BytesReader bytesReader = fst.getBytesReader();
     final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>());
     // Accumulate output as we go
     IntsRef output = fst.outputs.getNoOutput();
 
-    int l = offset + length;
-    for (int i = offset, cp; i < l; i += Character.charCount(cp)) {
-      cp = Character.codePointAt(word, i, l);
+    for (int i = 0, cp; i < word.length; i += Character.charCount(cp)) {
+      cp = Character.codePointAt(word, i, word.length);
       output = nextArc(fst, arc, bytesReader, output, cp);
       if (output == null) {
         return null;
@@ -1134,13 +1117,13 @@ public int compare(BytesRef o1, BytesRef o2) {
     return sorted;
   }
 
-  private FST<IntsRef> readSortedDictionaries(
-      Directory tempDir, String sorted, FlagEnumerator flags) throws IOException {
+  private WordStorage readSortedDictionaries(
+      Directory tempDir, String sorted, FlagEnumerator flags, int wordCount) throws IOException {
     boolean success = false;
 
     Map<String, Integer> morphIndices = new HashMap<>();
 
-    EntryGrouper grouper = new EntryGrouper(flags);
+    WordStorage.Builder builder = new WordStorage.Builder(wordCount, hasCustomMorphData, flags);
 
     try (ByteSequencesReader reader =
         new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
@@ -1180,6 +1163,8 @@ private FST<IntsRef> readSortedDictionaries(
           entry = line.substring(0, flagSep);
         }
 
+        if (entry.isEmpty()) continue;
+
         int morphDataID = 0;
         if (end + 1 < line.length()) {
           List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
@@ -1189,14 +1174,12 @@ private FST<IntsRef> readSortedDictionaries(
           }
         }
 
-        wordHashes.set(Math.abs(entry.hashCode()) % wordHashes.length());
-        grouper.add(entry, wordForm, morphDataID);
+        builder.add(entry, wordForm, morphDataID);
       }
 
       // finalize last entry
-      grouper.flushGroup();
       success = true;
-      return grouper.words.compile();
+      return builder.build();
     } finally {
       if (success) {
         tempDir.deleteFile(sorted);
@@ -1275,76 +1258,6 @@ boolean isDotICaseChangeDisallowed(char[] word) {
     return word[0] == 'İ' && !alternateCasing;
   }
 
-  private class EntryGrouper {
-    final FSTCompiler<IntsRef> words =
-        new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
-    private final List<char[]> group = new ArrayList<>();
-    private final List<Integer> morphDataIDs = new ArrayList<>();
-    private final IntsRefBuilder scratchInts = new IntsRefBuilder();
-    private String currentEntry = null;
-    private final FlagEnumerator flagEnumerator;
-
-    EntryGrouper(FlagEnumerator flagEnumerator) {
-      this.flagEnumerator = flagEnumerator;
-    }
-
-    void add(String entry, char[] flags, int morphDataID) throws IOException {
-      if (!entry.equals(currentEntry)) {
-        if (currentEntry != null) {
-          if (entry.compareTo(currentEntry) < 0) {
-            throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
-          }
-          flushGroup();
-        }
-        currentEntry = entry;
-      }
-
-      group.add(flags);
-      if (hasCustomMorphData) {
-        morphDataIDs.add(morphDataID);
-      }
-    }
-
-    void flushGroup() throws IOException {
-      IntsRefBuilder currentOrds = new IntsRefBuilder();
-
-      boolean hasNonHidden = false;
-      for (char[] flags : group) {
-        if (!hasHiddenFlag(flags)) {
-          hasNonHidden = true;
-          break;
-        }
-      }
-
-      for (int i = 0; i < group.size(); i++) {
-        char[] flags = group.get(i);
-        if (hasNonHidden && hasHiddenFlag(flags)) {
-          continue;
-        }
-
-        currentOrds.append(flagEnumerator.add(flags));
-        if (hasCustomMorphData) {
-          currentOrds.append(morphDataIDs.get(i));
-        }
-      }
-
-      Util.toUTF32(currentEntry, scratchInts);
-      words.add(scratchInts.get(), currentOrds.get());
-
-      group.clear();
-      morphDataIDs.clear();
-    }
-  }
-
-  private static boolean hasHiddenFlag(char[] flags) {
-    for (char flag : flags) {
-      if (flag == HIDDEN_FLAG) {
-        return true;
-      }
-    }
-    return false;
-  }
-
   private void parseAlias(String line) {
     String[] ruleArgs = line.split("\\s+");
     if (aliases == null) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index 0b68bce5c35b..3c508ba5f108 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -20,7 +20,6 @@
 import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
 import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_STRIP_ORD;
 
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.LinkedHashSet;
@@ -30,11 +29,8 @@
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.stream.Collectors;
-import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.IntsRefFSTEnum;
-import org.apache.lucene.util.fst.IntsRefFSTEnum.InputOutput;
 
 /**
  * A class that traverses the entire dictionary and applies affix rules to check if those yield
@@ -68,66 +64,42 @@ private List<Weighted<Root<String>>> findSimilarDictionaryEntries(
     boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de");
     TrigramAutomaton automaton = new TrigramAutomaton(word);
 
-    IntsRefFSTEnum<IntsRef> fstEnum = new IntsRefFSTEnum<>(dictionary.words);
-    InputOutput<IntsRef> mapping;
-    while ((mapping = nextKey(fstEnum, word.length() + 4)) != null) {
-      speller.checkCanceled.run();
+    dictionary.words.processAllWords(
+        word.length() + 4,
+        (rootChars, forms) -> {
+          speller.checkCanceled.run();
 
-      IntsRef key = mapping.input;
-      assert key.length > 0;
-      if (Math.abs(key.length - word.length()) > MAX_ROOT_LENGTH_DIFF) {
-        assert key.length < word.length(); // nextKey takes care of longer keys
-        continue;
-      }
-
-      String root = toString(key);
-      filterSuitableEntries(root, mapping.output, entries);
-      if (entries.isEmpty()) continue;
+          assert rootChars.length > 0;
+          if (Math.abs(rootChars.length - word.length()) > MAX_ROOT_LENGTH_DIFF) {
+            assert rootChars.length < word.length(); // nextKey takes care of longer keys
+            return;
+          }
 
-      if (ignoreTitleCaseRoots && WordCase.caseOf(root) == WordCase.TITLE) {
-        continue;
-      }
+          String root = rootChars.toString();
+          filterSuitableEntries(root, forms, entries);
+          if (entries.isEmpty()) return;
 
-      String lower = dictionary.toLowerCase(root);
-      int sc =
-          automaton.ngramScore(lower) - longerWorsePenalty(word, lower) + commonPrefix(word, root);
+          if (ignoreTitleCaseRoots && WordCase.caseOf(rootChars) == WordCase.TITLE) {
+            return;
+          }
 
-      if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
-        continue;
-      }
+          String lower = dictionary.toLowerCase(root);
+          int sc =
+              automaton.ngramScore(lower)
+                  - longerWorsePenalty(word, lower)
+                  + commonPrefix(word, root);
 
-      entries.forEach(e -> roots.add(new Weighted<>(e, sc)));
-      while (roots.size() > MAX_ROOTS) {
-        roots.poll();
-      }
-    }
-    return roots.stream().sorted().collect(Collectors.toList());
-  }
+          if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
+            return;
+          }
 
-  private static InputOutput<IntsRef> nextKey(IntsRefFSTEnum<IntsRef> fstEnum, int maxLen) {
-    try {
-      InputOutput<IntsRef> next = fstEnum.next();
-      while (next != null && next.input.length > maxLen) {
-        int offset = next.input.offset;
-        int[] ints = ArrayUtil.copyOfSubArray(next.input.ints, offset, offset + maxLen);
-        if (ints[ints.length - 1] == Integer.MAX_VALUE) {
-          throw new AssertionError("Too large char");
-        }
-        ints[ints.length - 1]++;
-        next = fstEnum.seekCeil(new IntsRef(ints, 0, ints.length));
-      }
-      return next;
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
-  }
+          entries.forEach(e -> roots.add(new Weighted<>(e, sc)));
+          while (roots.size() > MAX_ROOTS) {
+            roots.poll();
+          }
+        });
 
-  private static String toString(IntsRef key) {
-    char[] chars = new char[key.length];
-    for (int i = 0; i < key.length; i++) {
-      chars[i] = (char) key.ints[i + key.offset];
-    }
-    return new String(chars);
+    return roots.stream().sorted().collect(Collectors.toList());
   }
 
   private void filterSuitableEntries(String word, IntsRef forms, List<Root<String>> result) {
@@ -363,7 +335,7 @@ private List<String> getMostRelevantSuggestions(
     return result;
   }
 
-  private static int commonPrefix(String s1, String s2) {
+  static int commonPrefix(String s1, String s2) {
     int i = 0;
     int limit = Math.min(s1.length(), s2.length());
     while (i < limit && s1.charAt(i) == s2.charAt(i)) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
index 5e292745d56c..e135fb6ed0f2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@@ -234,6 +234,8 @@ private void tryLongSwap(String word) {
   }
 
   private void tryRemovingChar(String word) {
+    if (word.length() == 1) return;
+
     for (int i = 0; i < word.length(); i++) {
       trySuggestion(word.substring(0, i) + word.substring(i + 1));
     }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index f864dee2db34..40ad17c609fd 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -94,6 +94,10 @@ public List<CharsRef> stem(char[] word, int length) {
     }
 
     List<CharsRef> list = new ArrayList<>();
+    if (length == 0) {
+      return list;
+    }
+
     RootProcessor processor =
         (stem, formID, stemException) -> {
           list.add(newStem(stem, stemException));
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
new file mode 100644
index 000000000000..1f931c96946c
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
@@ -0,0 +1,338 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.BiConsumer;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.IntsRefBuilder;
+import org.apache.lucene.util.fst.IntSequenceOutputs;
+
+/**
+ * A data structure for memory-efficient word storage and fast lookup/enumeration. Each dictionary
+ * entry is stored as:
+ *
+ * <ol>
+ *   <li>the last character
+ *   <li>pointer to a similar entry for the prefix (all characters except the last one)
+ *   <li>value data: a list of ints representing word flags and morphological data, and a pointer to
+ *       hash collisions, if any
+ * </ol>
+ *
+ * There's only one entry for each prefix, so it's like a trie/{@link
+ * org.apache.lucene.util.fst.FST}, but a reversed one: each node points to a single previous node
+ * instead of several following ones. For example, "abc" and "abd" point to the same prefix entry
+ * "ab" which points to "a" which points to 0.<br>
+ * <br>
+ * The entries are stored in a contiguous byte array, identified by their offsets, using {@link
+ * DataOutput#writeVInt(int) VINT} format for compression.
+ */
+class WordStorage {
+  /**
+   * A map from word's hash (modulo array's length) into the offset of the last entry in {@link
+   * #wordData} with this hash. Negated, if there's more than one entry with the same hash.
+   */
+  private final int[] hashTable;
+
+  /**
+   * An array of word entries:
+   *
+   * <ul>
+   *   <li>VINT: the word's last character
+   *   <li>VINT: pointer to the entry for the same word without the last character. It's relative:
+   *       the difference of this entry's start and the prefix's entry start. For single-character
+   *       entries the prefix is the root at offset 0, so the difference is the entry's own offset
+   *   <li>Optional, only for entries that end a dictionary word (pure prefix entries omit it):
+   *       <ul>
+   *         <li>VINT: the length of the word form data, returned from {@link #lookupWord}
+   *         <li>n * VINT: the word form data
+   *         <li>Optional, for hash-colliding entries only:
+   *             <ul>
+   *               <li>BYTE: 1 if the next collision entry has further collisions, 0 if it's the
+   *                   last of the entries with the same hash
+   *               <li>VINT: (relative) pointer to the previous entry with the same hash
+   *             </ul>
+   *       </ul>
+   * </ul>
+   */
+  private final byte[] wordData;
+
+  private WordStorage(int[] hashTable, byte[] wordData) {
+    this.hashTable = hashTable;
+    this.wordData = wordData;
+  }
+
+  IntsRef lookupWord(char[] word, int offset, int length) {
+    assert length > 0;
+
+    int hash = Math.abs(CharsRef.stringHashCode(word, offset, length) % hashTable.length);
+    int pos = hashTable[hash];
+    if (pos == 0) {
+      return null;
+    }
+
+    boolean collision = pos < 0;
+    pos = Math.abs(pos);
+
+    char lastChar = word[offset + length - 1];
+    ByteArrayDataInput in = new ByteArrayDataInput(wordData);
+    while (true) {
+      in.setPosition(pos);
+      char c = (char) in.readVInt();
+      int prevPos = pos - in.readVInt();
+      int beforeForms = in.getPosition();
+      boolean found = c == lastChar && isSameString(word, offset, length - 1, prevPos, in);
+      if (!collision && !found) {
+        return null;
+      }
+
+      in.setPosition(beforeForms);
+      int formLength = in.readVInt();
+      if (found) {
+        IntsRef forms = new IntsRef(formLength);
+        readForms(forms, in, formLength);
+        return forms;
+      } else {
+        skipVInts(in, formLength);
+      }
+
+      collision = in.readByte() == 1;
+      pos -= in.readVInt();
+    }
+  }
+
+  private static void skipVInts(ByteArrayDataInput in, int count) {
+    for (int i = 0; i < count; ) {
+      if (in.readByte() >= 0) i++;
+    }
+  }
+
+  /**
+   * @param processor is invoked for each word. Note that the passed arguments (word and form) are
+   *     reused, so they can be modified in any way, but may not be saved for later by the processor
+   */
+  void processAllWords(int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
+    CharsRef chars = new CharsRef(maxLength);
+    IntsRef forms = new IntsRef();
+    ByteArrayDataInput in = new ByteArrayDataInput(wordData);
+    for (int pos : hashTable) {
+      boolean collision = pos < 0;
+      pos = Math.abs(pos);
+
+      while (pos != 0) {
+        int wordStart = maxLength - 1;
+
+        in.setPosition(pos);
+        chars.chars[wordStart] = (char) in.readVInt();
+        int prevPos = pos - in.readVInt();
+
+        int dataLength = in.readVInt();
+        if (forms.ints.length < dataLength) {
+          forms.ints = new int[dataLength];
+        }
+        readForms(forms, in, dataLength);
+
+        int afterForms = in.getPosition();
+
+        while (prevPos != 0 && wordStart > 0) {
+          in.setPosition(prevPos);
+          chars.chars[--wordStart] = (char) in.readVInt();
+          prevPos -= in.readVInt();
+        }
+
+        if (wordStart > 0) {
+          chars.offset = wordStart;
+          chars.length = maxLength - wordStart;
+          processor.accept(chars, forms);
+        }
+
+        if (!collision) {
+          break;
+        }
+
+        in.setPosition(afterForms);
+        collision = in.readVInt() == 1;
+        pos -= in.readVInt();
+      }
+    }
+  }
+
+  private boolean isSameString(
+      char[] word, int offset, int length, int dataPos, ByteArrayDataInput in) {
+    for (int i = length - 1; i >= 0; i--) {
+      in.setPosition(dataPos);
+      char c = (char) in.readVInt();
+      if (c != word[i + offset]) {
+        return false;
+      }
+      dataPos -= in.readVInt();
+      if (dataPos == 0) {
+        return i == 0;
+      }
+    }
+    return length == 0;
+  }
+
+  private void readForms(IntsRef forms, ByteArrayDataInput in, int length) {
+    for (int i = 0; i < length; i++) {
+      forms.ints[i] = in.readVInt();
+    }
+    forms.length = length;
+  }
+
+  static class Builder {
+    private final boolean hasCustomMorphData;
+    private final int[] hashTable;
+    private byte[] wordData;
+    private final int[] chainLengths;
+
+    private final List<char[]> group = new ArrayList<>();
+    private final List<Integer> morphDataIDs = new ArrayList<>();
+    private String currentEntry = null;
+    private final FlagEnumerator flagEnumerator;
+
+    private final ByteArrayDataOutput dataWriter;
+    int commonPrefixLength, commonPrefixPos;
+
+    Builder(int wordCount, boolean hasCustomMorphData, FlagEnumerator flagEnumerator) {
+      this.flagEnumerator = flagEnumerator;
+      this.hasCustomMorphData = hasCustomMorphData;
+
+      hashTable = new int[wordCount];
+      wordData = new byte[wordCount * 6];
+
+      dataWriter = new ByteArrayDataOutput(wordData);
+      dataWriter.writeByte((byte) 0); // zero index is root, contains nothing
+      chainLengths = new int[hashTable.length];
+    }
+
+    void add(String entry, char[] flags, int morphDataID) throws IOException {
+      if (!entry.equals(currentEntry)) {
+        if (currentEntry != null) {
+          if (entry.compareTo(currentEntry) < 0) {
+            throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
+          }
+          int pos = flushGroup();
+
+          commonPrefixLength = GeneratingSuggester.commonPrefix(currentEntry, entry);
+          ByteArrayDataInput in = new ByteArrayDataInput(wordData);
+          in.setPosition(pos);
+          for (int i = currentEntry.length() - 1; i >= commonPrefixLength; i--) {
+            char c = (char) in.readVInt();
+            assert c == currentEntry.charAt(i);
+            pos -= in.readVInt();
+            in.setPosition(pos);
+          }
+          commonPrefixPos = pos;
+        }
+        currentEntry = entry;
+      }
+
+      group.add(flags);
+      if (hasCustomMorphData) {
+        morphDataIDs.add(morphDataID);
+      }
+    }
+
+    private int flushGroup() throws IOException {
+      IntsRefBuilder currentOrds = new IntsRefBuilder();
+
+      boolean hasNonHidden = false;
+      for (char[] flags : group) {
+        if (!hasHiddenFlag(flags)) {
+          hasNonHidden = true;
+          break;
+        }
+      }
+
+      for (int i = 0; i < group.size(); i++) {
+        char[] flags = group.get(i);
+        if (hasNonHidden && hasHiddenFlag(flags)) {
+          continue;
+        }
+
+        currentOrds.append(flagEnumerator.add(flags));
+        if (hasCustomMorphData) {
+          currentOrds.append(morphDataIDs.get(i));
+        }
+      }
+
+      int lastPos = commonPrefixPos;
+      for (int i = commonPrefixLength; i < currentEntry.length() - 1; i++) {
+        int pos = dataWriter.getPosition();
+        ensureArraySize(0, false);
+        dataWriter.writeVInt(currentEntry.charAt(i));
+        dataWriter.writeVInt(pos - lastPos);
+        lastPos = pos;
+      }
+
+      int pos = dataWriter.getPosition();
+      int hash = Math.abs(currentEntry.hashCode() % hashTable.length);
+      int collision = hashTable[hash];
+      hashTable[hash] = collision == 0 ? pos : -pos;
+
+      if (++chainLengths[hash] > 20) {
+        throw new RuntimeException(
+            "Too many collisions, please report this to dev@lucene.apache.org");
+      }
+
+      ensureArraySize(currentOrds.length(), collision != 0);
+      dataWriter.writeVInt(currentEntry.charAt(currentEntry.length() - 1));
+      dataWriter.writeVInt(pos - lastPos);
+      IntSequenceOutputs.getSingleton().write(currentOrds.get(), dataWriter);
+      if (collision != 0) {
+        dataWriter.writeByte(collision < 0 ? (byte) 1 : 0);
+        dataWriter.writeVInt(pos - Math.abs(collision));
+      }
+
+      group.clear();
+      morphDataIDs.clear();
+      return pos;
+    }
+
+    private void ensureArraySize(int valueLength, boolean hasCollision) {
+      int pos = dataWriter.getPosition();
+      int maxEntrySize = 8 + 4 * (valueLength + 1) + (hasCollision ? 5 : 0);
+      while (wordData.length < pos + maxEntrySize) {
+        wordData = ArrayUtil.grow(wordData);
+        dataWriter.reset(wordData, pos, wordData.length - pos);
+      }
+    }
+
+    private static boolean hasHiddenFlag(char[] flags) {
+      for (char flag : flags) {
+        if (flag == Dictionary.HIDDEN_FLAG) {
+          return true;
+        }
+      }
+      return false;
+    }
+
+    WordStorage build() throws IOException {
+      flushGroup();
+      return new WordStorage(
+          hashTable, ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition()));
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
index 6fac33d2d9ff..acef45bb4e12 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
@@ -160,8 +160,7 @@ public void testDictionariesLoadSuccessfully() throws Exception {
           try {
             Dictionary dic = loadDictionary(aff);
             totalMemory.addAndGet(RamUsageTester.sizeOf(dic));
-            totalWords.addAndGet(
-                RamUsageTester.sizeOf(dic.words) + RamUsageTester.sizeOf(dic.wordHashes));
+            totalWords.addAndGet(RamUsageTester.sizeOf(dic.words));
             System.out.println(aff + "\t" + memoryUsageSummary(dic));
           } catch (Throwable e) {
             failures.add(aff);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index f74654927429..bc69f6c5b812 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -57,12 +57,12 @@ public static void resolveCorpora() {
 
   @Test
   public void en() throws Exception {
-    checkAnalysisPerformance("en", 1_000_000);
+    checkAnalysisPerformance("en", 1_200_000);
   }
 
   @Test
   public void en_suggest() throws Exception {
-    checkSuggestionPerformance("en", 1_200);
+    checkSuggestionPerformance("en", 3_000);
   }
 
   @Test
@@ -72,7 +72,7 @@ public void de() throws Exception {
 
   @Test
   public void de_suggest() throws Exception {
-    checkSuggestionPerformance("de", 55);
+    checkSuggestionPerformance("de", 60);
   }
 
   @Test

From fb9805f4d3439b8e020a7e88abe301c3038345b3 Mon Sep 17 00:00:00 2001
From: Peter Gromov <peter@jetbrains.com>
Date: Fri, 5 Mar 2021 16:17:04 +0100
Subject: [PATCH 2/8] update comment

---
 .../apache/lucene/analysis/hunspell/GeneratingSuggester.java    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index 3c508ba5f108..68af022011ad 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -71,7 +71,7 @@ private List<Weighted<Root<String>>> findSimilarDictionaryEntries(
 
           assert rootChars.length > 0;
           if (Math.abs(rootChars.length - word.length()) > MAX_ROOT_LENGTH_DIFF) {
-            assert rootChars.length < word.length(); // nextKey takes care of longer keys
+            assert rootChars.length < word.length(); // processAllWords takes care of longer keys
             return;
           }
 

From 7b048c5610dcf3077c000a5a4f84230e635c5106 Mon Sep 17 00:00:00 2001
From: Peter Gromov <peter@jetbrains.com>
Date: Fri, 5 Mar 2021 16:21:19 +0100
Subject: [PATCH 3/8] make fields private

---
 .../apache/lucene/analysis/hunspell/WordStorage.java | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
index 1f931c96946c..36e9f3fb1e03 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
@@ -204,16 +204,16 @@ private void readForms(IntsRef forms, ByteArrayDataInput in, int length) {
   static class Builder {
     private final boolean hasCustomMorphData;
     private final int[] hashTable;
-    private byte[] wordData;
     private final int[] chainLengths;
+    private final FlagEnumerator flagEnumerator;
+    private final ByteArrayDataOutput dataWriter;
+
+    private byte[] wordData;
 
+    private int commonPrefixLength, commonPrefixPos;
+    private String currentEntry = null;
     private final List<char[]> group = new ArrayList<>();
     private final List<Integer> morphDataIDs = new ArrayList<>();
-    private String currentEntry = null;
-    private final FlagEnumerator flagEnumerator;
-
-    private final ByteArrayDataOutput dataWriter;
-    int commonPrefixLength, commonPrefixPos;
 
     Builder(int wordCount, boolean hasCustomMorphData, FlagEnumerator flagEnumerator) {
       this.flagEnumerator = flagEnumerator;

From 469cfc67d4ecfdc390b4b3e245286026b2e764e4 Mon Sep 17 00:00:00 2001
From: Peter Gromov <peter@jetbrains.com>
Date: Mon, 8 Mar 2021 08:49:27 +0100
Subject: [PATCH 4/8] fix lookupWord false positive

---
 .../java/org/apache/lucene/analysis/hunspell/WordStorage.java | 2 +-
 .../org/apache/lucene/analysis/hunspell/TestDictionary.java   | 4 ++++
 .../src/test/org/apache/lucene/analysis/hunspell/simple.dic   | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
index 36e9f3fb1e03..c8bd3c56f961 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
@@ -191,7 +191,7 @@ private boolean isSameString(
         return i == 0;
       }
     }
-    return length == 0;
+    return length == 0 && dataPos == 0;
   }
 
   private void readForms(IntsRef forms, ByteArrayDataInput in, int length) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index 1b64c3a9d319..9c7a0e06bba7 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -50,6 +50,10 @@ public void testSimpleDictionary() throws Exception {
     assertNotNull(ordList);
     assertEquals(1, ordList.length);
     assertEquals('A', assertSingleFlag(dictionary, ordList));
+
+    assertNotNull(dictionary.lookupWord(new char[] {'a', 'b'}, 0, 2));
+    assertNotNull(dictionary.lookupWord(new char[] {'d', 'b'}, 0, 2));
+    assertNull(dictionary.lookupWord(new char[] {'b'}, 0, 1));
   }
 
   private static char assertSingleFlag(Dictionary dictionary, IntsRef ordList) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic
index f7bbab3ba676..2809611b8764 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic
@@ -8,3 +8,4 @@ lucene
 mahout/A
 moo/E
 olr/B
+db
\ No newline at end of file

From e69390b268e34f0dc92293640f9bf3cac219612d Mon Sep 17 00:00:00 2001
From: Peter Gromov <peter@jetbrains.com>
Date: Mon, 8 Mar 2021 08:56:10 +0100
Subject: [PATCH 5/8] don't look up empty stems after stripping the whole word

---
 .../src/java/org/apache/lucene/analysis/hunspell/Stemmer.java  | 2 ++
 .../org/apache/lucene/analysis/hunspell/TestFullStrip.java     | 2 ++
 .../src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff | 3 +++
 .../src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic | 3 ++-
 4 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 40ad17c609fd..488adfd2b445 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -488,6 +488,8 @@ private char[] stripAffix(
     int stripEnd = dictionary.stripOffsets[stripOrd + 1];
     int stripLen = stripEnd - stripStart;
 
+    if (stripLen + deAffixedLen == 0) return null;
+
     char[] stripData = dictionary.stripData;
     int condition = dictionary.getAffixCondition(affix);
     if (condition != 0) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java
index 3634a83c9fa7..7f48d7d6285b 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java
@@ -26,5 +26,7 @@ public static void beforeClass() throws Exception {
 
   public void testStemming() {
     assertStemsTo("tasty", "beer");
+    assertStemsTo("as", "a");
+    assertStemsTo("s");
   }
 }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff
index 9c2de7f7cff0..2cf00cd9d30f 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff
@@ -4,3 +4,6 @@ FULLSTRIP
 
 SFX A Y 1
 SFX A   beer        tasty  .
+
+SFX S Y 1
+SFX S	0	s	.
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic
index c948f1846352..8f594e355aff 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic
@@ -1,2 +1,3 @@
-1
+2
 beer/A
+a/S
\ No newline at end of file

From e28b50bae85f4b896b3c31f4bc80a814c988e99b Mon Sep 17 00:00:00 2001
From: Peter Gromov <peter@jetbrains.com>
Date: Mon, 8 Mar 2021 09:43:40 +0100
Subject: [PATCH 6/8] add/fix WordStorage comments

---
 .../lucene/analysis/hunspell/WordStorage.java | 22 ++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
index c8bd3c56f961..81a818616665 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
@@ -41,7 +41,7 @@
  * </ol>
  *
  * There's only one entry for each prefix, so it's like a trie/{@link
- * org.apache.lucene.util.fst.FST}, but a reversed one: each nodes points to a single previous nodes
+ * org.apache.lucene.util.fst.FST}, but a reversed one: each node points to a single previous node
  * instead of several following ones. For example, "abc" and "abd" point to the same prefix entry
  * "ab" which points to "a" which points to 0.<br>
  * <br>
@@ -60,9 +60,9 @@ class WordStorage {
    *
    * <ul>
    *   <li>VINT: the word's last character
-   *   <li>VINT: pointer to the entry for the same word without the last character. It's relative:
-   *       the difference of this entry's start and the prefix's entry start. 0 for single-character
-   *       entries
+   *   <li>VINT: a delta pointer to the entry for the same word without the last character.
+   *       Precisely, it's the difference of this entry's start and the prefix's entry start. 0 for
+   *       single-character entries
    *   <li>Optional, for non-leaf entries only:
    *       <ul>
    *         <li>VINT: the length of the word form data, returned from {@link #lookupWord}
@@ -71,7 +71,7 @@ class WordStorage {
    *             <ul>
    *               <li>BYTE: 1 if the next collision entry has further collisions, 0 if it's the
    *                   last of the entries with the same hash
-   *               <li>VINT: (relative) pointer to the previous entry with the same hash
+   *               <li>VINT: (delta) pointer to the previous entry with the same hash
    *             </ul>
    *       </ul>
    * </ul>
@@ -129,6 +129,8 @@ private static void skipVInts(ByteArrayDataInput in, int count) {
   }
 
   /**
+   * @param maxLength the maximum length of the words to be processed; the callback won't be
+   *     invoked for longer words
    * @param processor is invoked for each word. Note that the passed arguments (word and form) are
    *     reused, so they can be modified in any way, but may not be saved for later by the processor
    */
@@ -215,6 +217,10 @@ static class Builder {
     private final List<char[]> group = new ArrayList<>();
     private final List<Integer> morphDataIDs = new ArrayList<>();
 
+    /**
+     * @param wordCount an approximate number of the words in the resulting dictionary, used to
+     *     pre-size the hash table
+     */
     Builder(int wordCount, boolean hasCustomMorphData, FlagEnumerator flagEnumerator) {
       this.flagEnumerator = flagEnumerator;
       this.hasCustomMorphData = hasCustomMorphData;
@@ -227,6 +233,10 @@ static class Builder {
       chainLengths = new int[hashTable.length];
     }
 
+    /**
+     * Add a dictionary entry. This method should be called for entries sorted non-descending by
+     * {@link String#compareTo} rules.
+     */
     void add(String entry, char[] flags, int morphDataID) throws IOException {
       if (!entry.equals(currentEntry)) {
         if (currentEntry != null) {
@@ -278,6 +288,7 @@ private int flushGroup() throws IOException {
         }
       }
 
+      // write the non-leaf entries for chars after the shared prefix, except the last one
       int lastPos = commonPrefixPos;
       for (int i = commonPrefixLength; i < currentEntry.length() - 1; i++) {
         int pos = dataWriter.getPosition();
@@ -297,6 +308,7 @@ private int flushGroup() throws IOException {
             "Too many collisions, please report this to dev@lucene.apache.org");
       }
 
+      // write the leaf entry for the last character
       ensureArraySize(currentOrds.length(), collision != 0);
       dataWriter.writeVInt(currentEntry.charAt(currentEntry.length() - 1));
       dataWriter.writeVInt(pos - lastPos);

From f9cd8e5c80eabb969e3cca3837ca31c5383e51dd Mon Sep 17 00:00:00 2001
From: Peter Gromov <peter@jetbrains.com>
Date: Mon, 8 Mar 2021 10:09:41 +0100
Subject: [PATCH 7/8] fix processAllWords, add a test

---
 .../lucene/analysis/hunspell/WordStorage.java |  2 +-
 .../analysis/hunspell/TestDictionary.java     | 46 +++++++++++++++----
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
index 81a818616665..e25a41cb8934 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
@@ -163,7 +163,7 @@ void processAllWords(int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
           prevPos -= in.readVInt();
         }
 
-        if (wordStart > 0) {
+        if (prevPos == 0) {
           chars.offset = wordStart;
           chars.length = maxLength - wordStart;
           processor.accept(chars, forms);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index 9c7a0e06bba7..88a515fdbf21 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -16,14 +16,19 @@
  */
 package org.apache.lucene.analysis.hunspell;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
 import java.io.FilterInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
+import java.io.InputStreamReader;
 import java.text.ParseException;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
 import java.util.TreeMap;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
@@ -63,6 +68,31 @@ private static char assertSingleFlag(Dictionary dictionary, IntsRef ordList) {
     return flags[0];
   }
 
+  public void testProcessAllWords() throws Exception {
+    Dictionary dictionary = loadDictionary("simple.aff", "simple.dic");
+
+    try (InputStream stream = getClass().getResourceAsStream("simple.dic");
+        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, UTF_8))) {
+      Set<String> allWords =
+          reader.lines().skip(1).map(s -> s.split("/")[0]).collect(Collectors.toSet());
+      int maxLength = allWords.stream().mapToInt(String::length).max().orElseThrow();
+
+      for (int i = 1; i <= maxLength + 1; i++) { // +1: also try a limit above the longest word
+        checkProcessWords(dictionary, allWords, i);
+      }
+    }
+  }
+
+  private void checkProcessWords(Dictionary dictionary, Set<String> allWords, int maxLength) {
+    Set<String> expected =
+        allWords.stream().filter(w -> w.length() <= maxLength).collect(Collectors.toSet());
+
+    Set<String> actual = new HashSet<>();
+    dictionary.words.processAllWords(maxLength, (word, forms) -> actual.add(word.toString()));
+
+    assertEquals("For length " + maxLength, expected, actual);
+  }
+
   public void testCompressedDictionary() throws Exception {
     Dictionary dictionary = loadDictionary("compressed.aff", "compressed.dic");
     assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);
@@ -96,8 +126,8 @@ public void testInvalidData() {
   }
 
   public void testUsingFlagsBeforeFlagDirective() throws IOException, ParseException {
-    byte[] aff = "KEEPCASE 42\nFLAG num".getBytes(StandardCharsets.UTF_8);
-    byte[] dic = "1\nfoo/42".getBytes(StandardCharsets.UTF_8);
+    byte[] aff = "KEEPCASE 42\nFLAG num".getBytes(UTF_8);
+    byte[] dic = "1\nfoo/42".getBytes(UTF_8);
 
     Dictionary dictionary =
         new Dictionary(
@@ -210,14 +240,14 @@ private static String getDictionaryEncoding(String affFile) throws IOException,
         new Dictionary(
             new ByteBuffersDirectory(),
             "",
-            new ByteArrayInputStream(affFile.getBytes(StandardCharsets.UTF_8)),
-            new ByteArrayInputStream("1\nmock".getBytes(StandardCharsets.UTF_8)));
+            new ByteArrayInputStream(affFile.getBytes(UTF_8)),
+            new ByteArrayInputStream("1\nmock".getBytes(UTF_8)));
     return dictionary.decoder.charset().name();
   }
 
   public void testFlagWithCrazyWhitespace() {
-    assertNotNull(Dictionary.getFlagParsingStrategy("FLAG\tUTF-8", StandardCharsets.UTF_8));
-    assertNotNull(Dictionary.getFlagParsingStrategy("FLAG    UTF-8", StandardCharsets.UTF_8));
+    assertNotNull(Dictionary.getFlagParsingStrategy("FLAG\tUTF-8", UTF_8));
+    assertNotNull(Dictionary.getFlagParsingStrategy("FLAG    UTF-8", UTF_8));
   }
 
   @Test
@@ -226,7 +256,7 @@ public void testUtf8Flag() {
         Dictionary.getFlagParsingStrategy("FLAG\tUTF-8", Dictionary.DEFAULT_CHARSET);
 
     String src = "привет";
-    String asAscii = new String(src.getBytes(StandardCharsets.UTF_8), Dictionary.DEFAULT_CHARSET);
+    String asAscii = new String(src.getBytes(UTF_8), Dictionary.DEFAULT_CHARSET);
     assertNotEquals(src, asAscii);
     assertEquals(src, new String(strategy.parseFlags(asAscii)));
   }

From 4959886c25833c303b1d1ea0bb31480e6880a885 Mon Sep 17 00:00:00 2001
From: Peter Gromov <peter@jetbrains.com>
Date: Mon, 8 Mar 2021 10:31:56 +0100
Subject: [PATCH 8/8] fix minor review comments

---
 .../lucene/analysis/hunspell/WordStorage.java | 49 +++++++++++--------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
index e25a41cb8934..1eaf1bb8dc96 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
@@ -206,29 +206,45 @@ private void readForms(IntsRef forms, ByteArrayDataInput in, int length) {
   static class Builder {
     private final boolean hasCustomMorphData;
     private final int[] hashTable;
-    private final int[] chainLengths;
-    private final FlagEnumerator flagEnumerator;
-    private final ByteArrayDataOutput dataWriter;
-
     private byte[] wordData;
+    private final int[] chainLengths;
 
-    private int commonPrefixLength, commonPrefixPos;
-    private String currentEntry = null;
+    private final IntsRefBuilder currentOrds = new IntsRefBuilder();
     private final List<char[]> group = new ArrayList<>();
     private final List<Integer> morphDataIDs = new ArrayList<>();
+    private String currentEntry = null;
+    private final int wordCount;
+    private final FlagEnumerator flagEnumerator;
+
+    private final ByteArrayDataOutput dataWriter;
+    private int commonPrefixLength, commonPrefixPos;
+    private int actualWords;
 
     /**
      * @param wordCount an approximate number of the words in the resulting dictionary, used to
-     *     pre-size the hash table
+     *     pre-size the hash table. This argument can be a bit larger than the actual word count,
+     *     but not smaller.
      */
     Builder(int wordCount, boolean hasCustomMorphData, FlagEnumerator flagEnumerator) {
+      this.wordCount = wordCount;
       this.flagEnumerator = flagEnumerator;
       this.hasCustomMorphData = hasCustomMorphData;
 
       hashTable = new int[wordCount];
       wordData = new byte[wordCount * 6];
 
-      dataWriter = new ByteArrayDataOutput(wordData);
+      dataWriter =
+          new ByteArrayDataOutput(wordData) {
+            @Override
+            public void writeByte(byte b) {
+              int pos = getPosition();
+              if (pos == wordData.length) {
+                wordData = ArrayUtil.grow(wordData);
+                reset(wordData, pos, wordData.length - pos);
+              }
+              super.writeByte(b);
+            }
+          };
       dataWriter.writeByte((byte) 0); // zero index is root, contains nothing
       chainLengths = new int[hashTable.length];
     }
@@ -266,8 +282,11 @@ void add(String entry, char[] flags, int morphDataID) throws IOException {
     }
 
     private int flushGroup() throws IOException {
-      IntsRefBuilder currentOrds = new IntsRefBuilder();
+      if (++actualWords > wordCount) {
+        throw new RuntimeException("Don't add more words than wordCount!");
+      }
 
+      currentOrds.clear();
       boolean hasNonHidden = false;
       for (char[] flags : group) {
         if (!hasHiddenFlag(flags)) {
@@ -292,7 +311,6 @@ private int flushGroup() throws IOException {
       int lastPos = commonPrefixPos;
       for (int i = commonPrefixLength; i < currentEntry.length() - 1; i++) {
         int pos = dataWriter.getPosition();
-        ensureArraySize(0, false);
         dataWriter.writeVInt(currentEntry.charAt(i));
         dataWriter.writeVInt(pos - lastPos);
         lastPos = pos;
@@ -309,7 +327,6 @@ private int flushGroup() throws IOException {
       }
 
       // write the leaf entry for the last character
-      ensureArraySize(currentOrds.length(), collision != 0);
       dataWriter.writeVInt(currentEntry.charAt(currentEntry.length() - 1));
       dataWriter.writeVInt(pos - lastPos);
       IntSequenceOutputs.getSingleton().write(currentOrds.get(), dataWriter);
@@ -323,15 +340,6 @@ private int flushGroup() throws IOException {
       return pos;
     }
 
-    private void ensureArraySize(int valueLength, boolean hasCollision) {
-      int pos = dataWriter.getPosition();
-      int maxEntrySize = 8 + 4 * (valueLength + 1) + (hasCollision ? 5 : 0);
-      while (wordData.length < pos + maxEntrySize) {
-        wordData = ArrayUtil.grow(wordData);
-        dataWriter.reset(wordData, pos, wordData.length - pos);
-      }
-    }
-
     private static boolean hasHiddenFlag(char[] flags) {
       for (char flag : flags) {
         if (flag == Dictionary.HIDDEN_FLAG) {
@@ -342,6 +350,7 @@ private static boolean hasHiddenFlag(char[] flags) {
     }
 
     WordStorage build() throws IOException {
+      assert !group.isEmpty() : "build() should be only called once";
       flushGroup();
       return new WordStorage(
           hashTable, ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition()));