From 4842e0c9cab35239b6d1da619b94a640e7227510 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Fri, 5 Mar 2021 14:34:36 +0100 Subject: [PATCH 1/8] LUCENE-9825: Hunspell: reverse the "words" trie for faster word lookup/suggestions --- .../lucene/analysis/hunspell/Dictionary.java | 123 +------ .../hunspell/GeneratingSuggester.java | 88 ++--- .../analysis/hunspell/ModifyingSuggester.java | 2 + .../lucene/analysis/hunspell/Stemmer.java | 4 + .../lucene/analysis/hunspell/WordStorage.java | 338 ++++++++++++++++++ .../hunspell/TestAllDictionaries.java | 3 +- .../analysis/hunspell/TestPerformance.java | 6 +- 7 files changed, 396 insertions(+), 168 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 450c77da942a..2672addd88c0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -16,7 +16,8 @@ */ package org.apache.lucene.analysis.hunspell; -import static org.apache.lucene.analysis.hunspell.AffixKind.*; +import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX; +import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX; import java.io.BufferedInputStream; import java.io.BufferedReader; @@ -53,8 +54,6 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.CharsRef; -import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; @@ -91,14 +90,8 @@ public class Dictionary { */ ArrayList patterns = new ArrayList<>(); - /** - * The entries in the .dic file, mapping to 
their set of flags. the fst output is the ordinal list - * for flagLookup. - */ - FST words; - - /** A Bloom filter over {@link #words} to avoid unnecessary expensive FST traversals */ - FixedBitSet wordHashes; + /** The entries in the .dic file, mapping to their set of flags */ + WordStorage words; /** * The list of unique flagsets (wordforms). theoretically huge, but practically small (for Polish @@ -257,9 +250,8 @@ public void close() { // read dictionary entries IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT); int wordCount = mergeDictionaries(dictionaries, decoder, unsorted); - wordHashes = new FixedBitSet(Integer.highestOneBit(wordCount * 10)); String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted); - words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator); + words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator, wordCount); flagLookup = flagEnumerator.finish(); aliases = null; // no longer needed morphAliases = null; // no longer needed @@ -272,36 +264,27 @@ int formStep() { /** Looks up Hunspell word forms from the dictionary */ IntsRef lookupWord(char[] word, int offset, int length) { - int hash = CharsRef.stringHashCode(word, offset, length); - if (!wordHashes.get(Math.abs(hash) % wordHashes.length())) { - return null; - } - - return lookup(words, word, offset, length); + return words.lookupWord(word, offset, length); } // only for testing IntsRef lookupPrefix(char[] word) { - return lookup(prefixes, word, 0, word.length); + return lookup(prefixes, word); } // only for testing IntsRef lookupSuffix(char[] word) { - return lookup(suffixes, word, 0, word.length); + return lookup(suffixes, word); } - IntsRef lookup(FST fst, char[] word, int offset, int length) { - if (fst == null) { - return null; - } + private IntsRef lookup(FST fst, char[] word) { final FST.BytesReader bytesReader = fst.getBytesReader(); final FST.Arc arc = fst.getFirstArc(new FST.Arc<>()); // 
Accumulate output as we go IntsRef output = fst.outputs.getNoOutput(); - int l = offset + length; - for (int i = offset, cp; i < l; i += Character.charCount(cp)) { - cp = Character.codePointAt(word, i, l); + for (int i = 0, cp; i < word.length; i += Character.charCount(cp)) { + cp = Character.codePointAt(word, i, word.length); output = nextArc(fst, arc, bytesReader, output, cp); if (output == null) { return null; @@ -1134,13 +1117,13 @@ public int compare(BytesRef o1, BytesRef o2) { return sorted; } - private FST readSortedDictionaries( - Directory tempDir, String sorted, FlagEnumerator flags) throws IOException { + private WordStorage readSortedDictionaries( + Directory tempDir, String sorted, FlagEnumerator flags, int wordCount) throws IOException { boolean success = false; Map morphIndices = new HashMap<>(); - EntryGrouper grouper = new EntryGrouper(flags); + WordStorage.Builder builder = new WordStorage.Builder(wordCount, hasCustomMorphData, flags); try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) { @@ -1180,6 +1163,8 @@ private FST readSortedDictionaries( entry = line.substring(0, flagSep); } + if (entry.isEmpty()) continue; + int morphDataID = 0; if (end + 1 < line.length()) { List morphFields = readMorphFields(entry, line.substring(end + 1)); @@ -1189,14 +1174,12 @@ private FST readSortedDictionaries( } } - wordHashes.set(Math.abs(entry.hashCode()) % wordHashes.length()); - grouper.add(entry, wordForm, morphDataID); + builder.add(entry, wordForm, morphDataID); } // finalize last entry - grouper.flushGroup(); success = true; - return grouper.words.compile(); + return builder.build(); } finally { if (success) { tempDir.deleteFile(sorted); @@ -1275,76 +1258,6 @@ boolean isDotICaseChangeDisallowed(char[] word) { return word[0] == 'İ' && !alternateCasing; } - private class EntryGrouper { - final FSTCompiler words = - new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton()); 
- private final List group = new ArrayList<>(); - private final List morphDataIDs = new ArrayList<>(); - private final IntsRefBuilder scratchInts = new IntsRefBuilder(); - private String currentEntry = null; - private final FlagEnumerator flagEnumerator; - - EntryGrouper(FlagEnumerator flagEnumerator) { - this.flagEnumerator = flagEnumerator; - } - - void add(String entry, char[] flags, int morphDataID) throws IOException { - if (!entry.equals(currentEntry)) { - if (currentEntry != null) { - if (entry.compareTo(currentEntry) < 0) { - throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); - } - flushGroup(); - } - currentEntry = entry; - } - - group.add(flags); - if (hasCustomMorphData) { - morphDataIDs.add(morphDataID); - } - } - - void flushGroup() throws IOException { - IntsRefBuilder currentOrds = new IntsRefBuilder(); - - boolean hasNonHidden = false; - for (char[] flags : group) { - if (!hasHiddenFlag(flags)) { - hasNonHidden = true; - break; - } - } - - for (int i = 0; i < group.size(); i++) { - char[] flags = group.get(i); - if (hasNonHidden && hasHiddenFlag(flags)) { - continue; - } - - currentOrds.append(flagEnumerator.add(flags)); - if (hasCustomMorphData) { - currentOrds.append(morphDataIDs.get(i)); - } - } - - Util.toUTF32(currentEntry, scratchInts); - words.add(scratchInts.get(), currentOrds.get()); - - group.clear(); - morphDataIDs.clear(); - } - } - - private static boolean hasHiddenFlag(char[] flags) { - for (char flag : flags) { - if (flag == HIDDEN_FLAG) { - return true; - } - } - return false; - } - private void parseAlias(String line) { String[] ruleArgs = line.split("\\s+"); if (aliases == null) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java index 0b68bce5c35b..3c508ba5f108 100644 --- 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java @@ -20,7 +20,6 @@ import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG; import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_STRIP_ORD; -import java.io.IOException; import java.util.ArrayList; import java.util.Comparator; import java.util.LinkedHashSet; @@ -30,11 +29,8 @@ import java.util.Set; import java.util.TreeSet; import java.util.stream.Collectors; -import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.IntsRefFSTEnum; -import org.apache.lucene.util.fst.IntsRefFSTEnum.InputOutput; /** * A class that traverses the entire dictionary and applies affix rules to check if those yield @@ -68,66 +64,42 @@ private List>> findSimilarDictionaryEntries( boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de"); TrigramAutomaton automaton = new TrigramAutomaton(word); - IntsRefFSTEnum fstEnum = new IntsRefFSTEnum<>(dictionary.words); - InputOutput mapping; - while ((mapping = nextKey(fstEnum, word.length() + 4)) != null) { - speller.checkCanceled.run(); + dictionary.words.processAllWords( + word.length() + 4, + (rootChars, forms) -> { + speller.checkCanceled.run(); - IntsRef key = mapping.input; - assert key.length > 0; - if (Math.abs(key.length - word.length()) > MAX_ROOT_LENGTH_DIFF) { - assert key.length < word.length(); // nextKey takes care of longer keys - continue; - } - - String root = toString(key); - filterSuitableEntries(root, mapping.output, entries); - if (entries.isEmpty()) continue; + assert rootChars.length > 0; + if (Math.abs(rootChars.length - word.length()) > MAX_ROOT_LENGTH_DIFF) { + assert rootChars.length < word.length(); // nextKey takes care of longer keys + return; + } - if 
(ignoreTitleCaseRoots && WordCase.caseOf(root) == WordCase.TITLE) { - continue; - } + String root = rootChars.toString(); + filterSuitableEntries(root, forms, entries); + if (entries.isEmpty()) return; - String lower = dictionary.toLowerCase(root); - int sc = - automaton.ngramScore(lower) - longerWorsePenalty(word, lower) + commonPrefix(word, root); + if (ignoreTitleCaseRoots && WordCase.caseOf(rootChars) == WordCase.TITLE) { + return; + } - if (roots.size() == MAX_ROOTS && sc < roots.peek().score) { - continue; - } + String lower = dictionary.toLowerCase(root); + int sc = + automaton.ngramScore(lower) + - longerWorsePenalty(word, lower) + + commonPrefix(word, root); - entries.forEach(e -> roots.add(new Weighted<>(e, sc))); - while (roots.size() > MAX_ROOTS) { - roots.poll(); - } - } - return roots.stream().sorted().collect(Collectors.toList()); - } + if (roots.size() == MAX_ROOTS && sc < roots.peek().score) { + return; + } - private static InputOutput nextKey(IntsRefFSTEnum fstEnum, int maxLen) { - try { - InputOutput next = fstEnum.next(); - while (next != null && next.input.length > maxLen) { - int offset = next.input.offset; - int[] ints = ArrayUtil.copyOfSubArray(next.input.ints, offset, offset + maxLen); - if (ints[ints.length - 1] == Integer.MAX_VALUE) { - throw new AssertionError("Too large char"); - } - ints[ints.length - 1]++; - next = fstEnum.seekCeil(new IntsRef(ints, 0, ints.length)); - } - return next; - } catch (IOException e) { - throw new RuntimeException(e); - } - } + entries.forEach(e -> roots.add(new Weighted<>(e, sc))); + while (roots.size() > MAX_ROOTS) { + roots.poll(); + } + }); - private static String toString(IntsRef key) { - char[] chars = new char[key.length]; - for (int i = 0; i < key.length; i++) { - chars[i] = (char) key.ints[i + key.offset]; - } - return new String(chars); + return roots.stream().sorted().collect(Collectors.toList()); } private void filterSuitableEntries(String word, IntsRef forms, List> result) { @@ -363,7 +335,7 @@ 
private List getMostRelevantSuggestions( return result; } - private static int commonPrefix(String s1, String s2) { + static int commonPrefix(String s1, String s2) { int i = 0; int limit = Math.min(s1.length(), s2.length()); while (i < limit && s1.charAt(i) == s2.charAt(i)) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java index 5e292745d56c..e135fb6ed0f2 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java @@ -234,6 +234,8 @@ private void tryLongSwap(String word) { } private void tryRemovingChar(String word) { + if (word.length() == 1) return; + for (int i = 0; i < word.length(); i++) { trySuggestion(word.substring(0, i) + word.substring(i + 1)); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index f864dee2db34..40ad17c609fd 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -94,6 +94,10 @@ public List stem(char[] word, int length) { } List list = new ArrayList<>(); + if (length == 0) { + return list; + } + RootProcessor processor = (stem, formID, stemException) -> { list.add(newStem(stem, stemException)); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java new file mode 100644 index 000000000000..1f931c96946c --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java @@ -0,0 +1,338 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.function.BiConsumer; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.fst.IntSequenceOutputs; + +/** + * A data structure for memory-efficient word storage and fast lookup/enumeration. Each dictionary + * entry is stored as: + * + *
<ol> + *
  <li>the last character + *
  <li>pointer to a similar entry for the prefix (all characters except the last one) + *
  <li>value data: a list of ints representing word flags and morphological data, and a pointer to + * hash collisions, if any + * </ol>
+ * + * There's only one entry for each prefix, so it's like a trie/{@link + * org.apache.lucene.util.fst.FST}, but a reversed one: each nodes points to a single previous nodes + * instead of several following ones. For example, "abc" and "abd" point to the same prefix entry + * "ab" which points to "a" which points to 0.
<br> + * <br>
+ * The entries are stored in a contiguous byte array, identified by their offsets, using {@link + * DataOutput#writeVInt} ()} VINT} format for compression. + */ +class WordStorage { + /** + * A map from word's hash (modulo array's length) into the offset of the last entry in {@link + * #wordData} with this hash. Negated, if there's more than one entry with the same hash. + */ + private final int[] hashTable; + + /** + * An array of word entries: + * + *
<ul> + *
  <li>VINT: the word's last character + *
  <li>VINT: pointer to the entry for the same word without the last character. It's relative: + * the difference of this entry's start and the prefix's entry start. 0 for single-character + * entries + *
  <li>Optional, for non-leaf entries only: + *
      <ul> + *
        <li>VINT: the length of the word form data, returned from {@link #lookupWord} + *
        <li>n * VINT: the word form data + *
        <li>Optional, for hash-colliding entries only: + *
            <ul> + *
              <li>BYTE: 1 if the next collision entry has further collisions, 0 if it's the + * last of the entries with the same hash + *
              <li>VINT: (relative) pointer to the previous entry with the same hash + *
            </ul> + *
      </ul> + * </ul>
+ */ + private final byte[] wordData; + + private WordStorage(int[] hashTable, byte[] wordData) { + this.hashTable = hashTable; + this.wordData = wordData; + } + + IntsRef lookupWord(char[] word, int offset, int length) { + assert length > 0; + + int hash = Math.abs(CharsRef.stringHashCode(word, offset, length) % hashTable.length); + int pos = hashTable[hash]; + if (pos == 0) { + return null; + } + + boolean collision = pos < 0; + pos = Math.abs(pos); + + char lastChar = word[offset + length - 1]; + ByteArrayDataInput in = new ByteArrayDataInput(wordData); + while (true) { + in.setPosition(pos); + char c = (char) in.readVInt(); + int prevPos = pos - in.readVInt(); + int beforeForms = in.getPosition(); + boolean found = c == lastChar && isSameString(word, offset, length - 1, prevPos, in); + if (!collision && !found) { + return null; + } + + in.setPosition(beforeForms); + int formLength = in.readVInt(); + if (found) { + IntsRef forms = new IntsRef(formLength); + readForms(forms, in, formLength); + return forms; + } else { + skipVInts(in, formLength); + } + + collision = in.readByte() == 1; + pos -= in.readVInt(); + } + } + + private static void skipVInts(ByteArrayDataInput in, int count) { + for (int i = 0; i < count; ) { + if (in.readByte() >= 0) i++; + } + } + + /** + * @param processor is invoked for each word. 
Note that the passed arguments (word and form) are + * reused, so they can be modified in any way, but may not be saved for later by the processor + */ + void processAllWords(int maxLength, BiConsumer processor) { + CharsRef chars = new CharsRef(maxLength); + IntsRef forms = new IntsRef(); + ByteArrayDataInput in = new ByteArrayDataInput(wordData); + for (int pos : hashTable) { + boolean collision = pos < 0; + pos = Math.abs(pos); + + while (pos != 0) { + int wordStart = maxLength - 1; + + in.setPosition(pos); + chars.chars[wordStart] = (char) in.readVInt(); + int prevPos = pos - in.readVInt(); + + int dataLength = in.readVInt(); + if (forms.ints.length < dataLength) { + forms.ints = new int[dataLength]; + } + readForms(forms, in, dataLength); + + int afterForms = in.getPosition(); + + while (prevPos != 0 && wordStart > 0) { + in.setPosition(prevPos); + chars.chars[--wordStart] = (char) in.readVInt(); + prevPos -= in.readVInt(); + } + + if (wordStart > 0) { + chars.offset = wordStart; + chars.length = maxLength - wordStart; + processor.accept(chars, forms); + } + + if (!collision) { + break; + } + + in.setPosition(afterForms); + collision = in.readVInt() == 1; + pos -= in.readVInt(); + } + } + } + + private boolean isSameString( + char[] word, int offset, int length, int dataPos, ByteArrayDataInput in) { + for (int i = length - 1; i >= 0; i--) { + in.setPosition(dataPos); + char c = (char) in.readVInt(); + if (c != word[i + offset]) { + return false; + } + dataPos -= in.readVInt(); + if (dataPos == 0) { + return i == 0; + } + } + return length == 0; + } + + private void readForms(IntsRef forms, ByteArrayDataInput in, int length) { + for (int i = 0; i < length; i++) { + forms.ints[i] = in.readVInt(); + } + forms.length = length; + } + + static class Builder { + private final boolean hasCustomMorphData; + private final int[] hashTable; + private byte[] wordData; + private final int[] chainLengths; + + private final List group = new ArrayList<>(); + private final List 
morphDataIDs = new ArrayList<>(); + private String currentEntry = null; + private final FlagEnumerator flagEnumerator; + + private final ByteArrayDataOutput dataWriter; + int commonPrefixLength, commonPrefixPos; + + Builder(int wordCount, boolean hasCustomMorphData, FlagEnumerator flagEnumerator) { + this.flagEnumerator = flagEnumerator; + this.hasCustomMorphData = hasCustomMorphData; + + hashTable = new int[wordCount]; + wordData = new byte[wordCount * 6]; + + dataWriter = new ByteArrayDataOutput(wordData); + dataWriter.writeByte((byte) 0); // zero index is root, contains nothing + chainLengths = new int[hashTable.length]; + } + + void add(String entry, char[] flags, int morphDataID) throws IOException { + if (!entry.equals(currentEntry)) { + if (currentEntry != null) { + if (entry.compareTo(currentEntry) < 0) { + throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); + } + int pos = flushGroup(); + + commonPrefixLength = GeneratingSuggester.commonPrefix(currentEntry, entry); + ByteArrayDataInput in = new ByteArrayDataInput(wordData); + in.setPosition(pos); + for (int i = currentEntry.length() - 1; i >= commonPrefixLength; i--) { + char c = (char) in.readVInt(); + assert c == currentEntry.charAt(i); + pos -= in.readVInt(); + in.setPosition(pos); + } + commonPrefixPos = pos; + } + currentEntry = entry; + } + + group.add(flags); + if (hasCustomMorphData) { + morphDataIDs.add(morphDataID); + } + } + + private int flushGroup() throws IOException { + IntsRefBuilder currentOrds = new IntsRefBuilder(); + + boolean hasNonHidden = false; + for (char[] flags : group) { + if (!hasHiddenFlag(flags)) { + hasNonHidden = true; + break; + } + } + + for (int i = 0; i < group.size(); i++) { + char[] flags = group.get(i); + if (hasNonHidden && hasHiddenFlag(flags)) { + continue; + } + + currentOrds.append(flagEnumerator.add(flags)); + if (hasCustomMorphData) { + currentOrds.append(morphDataIDs.get(i)); + } + } + + int lastPos = commonPrefixPos; + for 
(int i = commonPrefixLength; i < currentEntry.length() - 1; i++) { + int pos = dataWriter.getPosition(); + ensureArraySize(0, false); + dataWriter.writeVInt(currentEntry.charAt(i)); + dataWriter.writeVInt(pos - lastPos); + lastPos = pos; + } + + int pos = dataWriter.getPosition(); + int hash = Math.abs(currentEntry.hashCode() % hashTable.length); + int collision = hashTable[hash]; + hashTable[hash] = collision == 0 ? pos : -pos; + + if (++chainLengths[hash] > 20) { + throw new RuntimeException( + "Too many collisions, please report this to dev@lucene.apache.org"); + } + + ensureArraySize(currentOrds.length(), collision != 0); + dataWriter.writeVInt(currentEntry.charAt(currentEntry.length() - 1)); + dataWriter.writeVInt(pos - lastPos); + IntSequenceOutputs.getSingleton().write(currentOrds.get(), dataWriter); + if (collision != 0) { + dataWriter.writeByte(collision < 0 ? (byte) 1 : 0); + dataWriter.writeVInt(pos - Math.abs(collision)); + } + + group.clear(); + morphDataIDs.clear(); + return pos; + } + + private void ensureArraySize(int valueLength, boolean hasCollision) { + int pos = dataWriter.getPosition(); + int maxEntrySize = 8 + 4 * (valueLength + 1) + (hasCollision ? 
5 : 0); + while (wordData.length < pos + maxEntrySize) { + wordData = ArrayUtil.grow(wordData); + dataWriter.reset(wordData, pos, wordData.length - pos); + } + } + + private static boolean hasHiddenFlag(char[] flags) { + for (char flag : flags) { + if (flag == Dictionary.HIDDEN_FLAG) { + return true; + } + } + return false; + } + + WordStorage build() throws IOException { + flushGroup(); + return new WordStorage( + hashTable, ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition())); + } + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java index 6fac33d2d9ff..acef45bb4e12 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java @@ -160,8 +160,7 @@ public void testDictionariesLoadSuccessfully() throws Exception { try { Dictionary dic = loadDictionary(aff); totalMemory.addAndGet(RamUsageTester.sizeOf(dic)); - totalWords.addAndGet( - RamUsageTester.sizeOf(dic.words) + RamUsageTester.sizeOf(dic.wordHashes)); + totalWords.addAndGet(RamUsageTester.sizeOf(dic.words)); System.out.println(aff + "\t" + memoryUsageSummary(dic)); } catch (Throwable e) { failures.add(aff); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java index f74654927429..bc69f6c5b812 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java @@ -57,12 +57,12 @@ public static void resolveCorpora() { @Test public void en() throws Exception { - checkAnalysisPerformance("en", 1_000_000); + checkAnalysisPerformance("en", 1_200_000); 
} @Test public void en_suggest() throws Exception { - checkSuggestionPerformance("en", 1_200); + checkSuggestionPerformance("en", 3_000); } @Test @@ -72,7 +72,7 @@ public void de() throws Exception { @Test public void de_suggest() throws Exception { - checkSuggestionPerformance("de", 55); + checkSuggestionPerformance("de", 60); } @Test From fb9805f4d3439b8e020a7e88abe301c3038345b3 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Fri, 5 Mar 2021 16:17:04 +0100 Subject: [PATCH 2/8] update comment --- .../apache/lucene/analysis/hunspell/GeneratingSuggester.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java index 3c508ba5f108..68af022011ad 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java @@ -71,7 +71,7 @@ private List>> findSimilarDictionaryEntries( assert rootChars.length > 0; if (Math.abs(rootChars.length - word.length()) > MAX_ROOT_LENGTH_DIFF) { - assert rootChars.length < word.length(); // nextKey takes care of longer keys + assert rootChars.length < word.length(); // processAllWords takes care of longer keys return; } From 7b048c5610dcf3077c000a5a4f84230e635c5106 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Fri, 5 Mar 2021 16:21:19 +0100 Subject: [PATCH 3/8] make fields private --- .../apache/lucene/analysis/hunspell/WordStorage.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java index 1f931c96946c..36e9f3fb1e03 100644 --- 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java @@ -204,16 +204,16 @@ private void readForms(IntsRef forms, ByteArrayDataInput in, int length) { static class Builder { private final boolean hasCustomMorphData; private final int[] hashTable; - private byte[] wordData; private final int[] chainLengths; + private final FlagEnumerator flagEnumerator; + private final ByteArrayDataOutput dataWriter; + + private byte[] wordData; + private int commonPrefixLength, commonPrefixPos; + private String currentEntry = null; private final List group = new ArrayList<>(); private final List morphDataIDs = new ArrayList<>(); - private String currentEntry = null; - private final FlagEnumerator flagEnumerator; - - private final ByteArrayDataOutput dataWriter; - int commonPrefixLength, commonPrefixPos; Builder(int wordCount, boolean hasCustomMorphData, FlagEnumerator flagEnumerator) { this.flagEnumerator = flagEnumerator; From 469cfc67d4ecfdc390b4b3e245286026b2e764e4 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Mon, 8 Mar 2021 08:49:27 +0100 Subject: [PATCH 4/8] fix lookupWord false positive --- .../java/org/apache/lucene/analysis/hunspell/WordStorage.java | 2 +- .../org/apache/lucene/analysis/hunspell/TestDictionary.java | 4 ++++ .../src/test/org/apache/lucene/analysis/hunspell/simple.dic | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java index 36e9f3fb1e03..c8bd3c56f961 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java @@ -191,7 +191,7 @@ private boolean isSameString( return i == 0; } } - return length == 0; + return length == 0 && 
dataPos == 0; } private void readForms(IntsRef forms, ByteArrayDataInput in, int length) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java index 1b64c3a9d319..9c7a0e06bba7 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java @@ -50,6 +50,10 @@ public void testSimpleDictionary() throws Exception { assertNotNull(ordList); assertEquals(1, ordList.length); assertEquals('A', assertSingleFlag(dictionary, ordList)); + + assertNotNull(dictionary.lookupWord(new char[] {'a', 'b'}, 0, 2)); + assertNotNull(dictionary.lookupWord(new char[] {'d', 'b'}, 0, 2)); + assertNull(dictionary.lookupWord(new char[] {'b'}, 0, 1)); } private static char assertSingleFlag(Dictionary dictionary, IntsRef ordList) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic index f7bbab3ba676..2809611b8764 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic @@ -8,3 +8,4 @@ lucene mahout/A moo/E olr/B +db \ No newline at end of file From e69390b268e34f0dc92293640f9bf3cac219612d Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Mon, 8 Mar 2021 08:56:10 +0100 Subject: [PATCH 5/8] don't lookup empty stems after stripping the whole word --- .../src/java/org/apache/lucene/analysis/hunspell/Stemmer.java | 2 ++ .../org/apache/lucene/analysis/hunspell/TestFullStrip.java | 2 ++ .../src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff | 3 +++ .../src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic | 3 ++- 4 files changed, 9 insertions(+), 1 deletion(-) diff --git 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 40ad17c609fd..488adfd2b445 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -488,6 +488,8 @@ private char[] stripAffix( int stripEnd = dictionary.stripOffsets[stripOrd + 1]; int stripLen = stripEnd - stripStart; + if (stripLen + deAffixedLen == 0) return null; + char[] stripData = dictionary.stripData; int condition = dictionary.getAffixCondition(affix); if (condition != 0) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java index 3634a83c9fa7..7f48d7d6285b 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java @@ -26,5 +26,7 @@ public static void beforeClass() throws Exception { public void testStemming() { assertStemsTo("tasty", "beer"); + assertStemsTo("as", "a"); + assertStemsTo("s"); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff index 9c2de7f7cff0..2cf00cd9d30f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff @@ -4,3 +4,6 @@ FULLSTRIP SFX A Y 1 SFX A beer tasty . + +SFX S Y 1 +SFX S 0 s . 
\ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic index c948f1846352..8f594e355aff 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic @@ -1,2 +1,3 @@ -1 +2 beer/A +a/S \ No newline at end of file From e28b50bae85f4b896b3c31f4bc80a814c988e99b Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Mon, 8 Mar 2021 09:43:40 +0100 Subject: [PATCH 6/8] add/fix WordStorage comments --- .../lucene/analysis/hunspell/WordStorage.java | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java index c8bd3c56f961..81a818616665 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java @@ -41,7 +41,7 @@ * * * There's only one entry for each prefix, so it's like a trie/{@link - * org.apache.lucene.util.fst.FST}, but a reversed one: each nodes points to a single previous nodes + * org.apache.lucene.util.fst.FST}, but a reversed one: each node points to a single previous node * instead of several following ones. For example, "abc" and "abd" point to the same prefix entry * "ab" which points to "a" which points to 0.
<br> * <br>
@@ -60,9 +60,9 @@ class WordStorage { * *
    <ul> *
  <li>VINT: the word's last character - *
  <li>VINT: pointer to the entry for the same word without the last character. It's relative: - * the difference of this entry's start and the prefix's entry start. 0 for single-character - * entries + *
  <li>VINT: a delta pointer to the entry for the same word without the last character. + * Precisely, it's the difference of this entry's start and the prefix's entry start. 0 for + * single-character entries *
  <li>Optional, for non-leaf entries only: *
      <ul> *
    <li>VINT: the length of the word form data, returned from {@link #lookupWord} @@ -71,7 +71,7 @@ class WordStorage { *
        <ul> *
      <li>BYTE: 1 if the next collision entry has further collisions, 0 if it's the * last of the entries with the same hash - *
      <li>VINT: (relative) pointer to the previous entry with the same hash + *
      <li>VINT: (delta) pointer to the previous entry with the same hash *
      </ul> *
    </ul> * </ul>
@@ -129,6 +129,8 @@ private static void skipVInts(ByteArrayDataInput in, int count) { } /** + * @param maxLength the limit on the length of words to be processed, the callback won't be + * invoked for the longer ones * @param processor is invoked for each word. Note that the passed arguments (word and form) are * reused, so they can be modified in any way, but may not be saved for later by the processor */ @@ -215,6 +217,10 @@ static class Builder { private final List group = new ArrayList<>(); private final List morphDataIDs = new ArrayList<>(); + /** + * @param wordCount an approximate number of the words in the resulting dictionary, used to + * pre-size the hash table + */ Builder(int wordCount, boolean hasCustomMorphData, FlagEnumerator flagEnumerator) { this.flagEnumerator = flagEnumerator; this.hasCustomMorphData = hasCustomMorphData; @@ -227,6 +233,10 @@ static class Builder { chainLengths = new int[hashTable.length]; } + /** + * Add a dictionary entry. This method should be called for entries sorted non-descending by + * {@link String#compareTo} rules. 
+ */ void add(String entry, char[] flags, int morphDataID) throws IOException { if (!entry.equals(currentEntry)) { if (currentEntry != null) { @@ -278,6 +288,7 @@ private int flushGroup() throws IOException { } } + // write the non-leaf entries for chars after the shared prefix, except the last one int lastPos = commonPrefixPos; for (int i = commonPrefixLength; i < currentEntry.length() - 1; i++) { int pos = dataWriter.getPosition(); @@ -297,6 +308,7 @@ private int flushGroup() throws IOException { "Too many collisions, please report this to dev@lucene.apache.org"); } + // write the leaf entry for the last character ensureArraySize(currentOrds.length(), collision != 0); dataWriter.writeVInt(currentEntry.charAt(currentEntry.length() - 1)); dataWriter.writeVInt(pos - lastPos); From f9cd8e5c80eabb969e3cca3837ca31c5383e51dd Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Mon, 8 Mar 2021 10:09:41 +0100 Subject: [PATCH 7/8] fix processAllWords, add a test --- .../lucene/analysis/hunspell/WordStorage.java | 2 +- .../analysis/hunspell/TestDictionary.java | 46 +++++++++++++++---- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java index 81a818616665..e25a41cb8934 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java @@ -163,7 +163,7 @@ void processAllWords(int maxLength, BiConsumer processor) { prevPos -= in.readVInt(); } - if (wordStart > 0) { + if (prevPos == 0) { chars.offset = wordStart; chars.length = maxLength - wordStart; processor.accept(chars, forms); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java index 
9c7a0e06bba7..88a515fdbf21 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java @@ -16,14 +16,19 @@ */ package org.apache.lucene.analysis.hunspell; +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; -import java.nio.charset.StandardCharsets; +import java.io.InputStreamReader; import java.text.ParseException; import java.util.Arrays; import java.util.Collections; +import java.util.HashSet; +import java.util.Set; import java.util.TreeMap; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -63,6 +68,31 @@ private static char assertSingleFlag(Dictionary dictionary, IntsRef ordList) { return flags[0]; } + public void testProcessAllWords() throws Exception { + Dictionary dictionary = loadDictionary("simple.aff", "simple.dic"); + + try (InputStream stream = getClass().getResourceAsStream("simple.dic")) { + BufferedReader reader = new BufferedReader(new InputStreamReader(stream, UTF_8)); + Set allWords = + reader.lines().skip(1).map(s -> s.split("/")[0]).collect(Collectors.toSet()); + int maxLength = allWords.stream().mapToInt(String::length).max().orElseThrow(); + + for (int i = 1; i <= maxLength + 1; i++) { + checkProcessWords(dictionary, allWords, i); + } + } + } + + private void checkProcessWords(Dictionary dictionary, Set allWords, int maxLength) { + Set processed = new HashSet<>(); + dictionary.words.processAllWords(maxLength, (word, __) -> processed.add(word.toString())); + + Set filtered = + allWords.stream().filter(s -> s.length() <= maxLength).collect(Collectors.toSet()); + + assertEquals("For length " + maxLength, filtered, processed); + } + public void testCompressedDictionary() throws Exception { Dictionary dictionary = 
loadDictionary("compressed.aff", "compressed.dic"); assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length); @@ -96,8 +126,8 @@ public void testInvalidData() { } public void testUsingFlagsBeforeFlagDirective() throws IOException, ParseException { - byte[] aff = "KEEPCASE 42\nFLAG num".getBytes(StandardCharsets.UTF_8); - byte[] dic = "1\nfoo/42".getBytes(StandardCharsets.UTF_8); + byte[] aff = "KEEPCASE 42\nFLAG num".getBytes(UTF_8); + byte[] dic = "1\nfoo/42".getBytes(UTF_8); Dictionary dictionary = new Dictionary( @@ -210,14 +240,14 @@ private static String getDictionaryEncoding(String affFile) throws IOException, new Dictionary( new ByteBuffersDirectory(), "", - new ByteArrayInputStream(affFile.getBytes(StandardCharsets.UTF_8)), - new ByteArrayInputStream("1\nmock".getBytes(StandardCharsets.UTF_8))); + new ByteArrayInputStream(affFile.getBytes(UTF_8)), + new ByteArrayInputStream("1\nmock".getBytes(UTF_8))); return dictionary.decoder.charset().name(); } public void testFlagWithCrazyWhitespace() { - assertNotNull(Dictionary.getFlagParsingStrategy("FLAG\tUTF-8", StandardCharsets.UTF_8)); - assertNotNull(Dictionary.getFlagParsingStrategy("FLAG UTF-8", StandardCharsets.UTF_8)); + assertNotNull(Dictionary.getFlagParsingStrategy("FLAG\tUTF-8", UTF_8)); + assertNotNull(Dictionary.getFlagParsingStrategy("FLAG UTF-8", UTF_8)); } @Test @@ -226,7 +256,7 @@ public void testUtf8Flag() { Dictionary.getFlagParsingStrategy("FLAG\tUTF-8", Dictionary.DEFAULT_CHARSET); String src = "привет"; - String asAscii = new String(src.getBytes(StandardCharsets.UTF_8), Dictionary.DEFAULT_CHARSET); + String asAscii = new String(src.getBytes(UTF_8), Dictionary.DEFAULT_CHARSET); assertNotEquals(src, asAscii); assertEquals(src, new String(strategy.parseFlags(asAscii))); } From 4959886c25833c303b1d1ea0bb31480e6880a885 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Mon, 8 Mar 2021 10:31:56 +0100 Subject: [PATCH 8/8] fix minor review comments --- 
.../lucene/analysis/hunspell/WordStorage.java | 49 +++++++++++-------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java index e25a41cb8934..1eaf1bb8dc96 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java @@ -206,29 +206,45 @@ private void readForms(IntsRef forms, ByteArrayDataInput in, int length) { static class Builder { private final boolean hasCustomMorphData; private final int[] hashTable; - private final int[] chainLengths; - private final FlagEnumerator flagEnumerator; - private final ByteArrayDataOutput dataWriter; - private byte[] wordData; + private final int[] chainLengths; - private int commonPrefixLength, commonPrefixPos; - private String currentEntry = null; + private final IntsRefBuilder currentOrds = new IntsRefBuilder(); private final List group = new ArrayList<>(); private final List morphDataIDs = new ArrayList<>(); + private String currentEntry = null; + private final int wordCount; + private final FlagEnumerator flagEnumerator; + + private final ByteArrayDataOutput dataWriter; + private int commonPrefixLength, commonPrefixPos; + private int actualWords; /** * @param wordCount an approximate number of the words in the resulting dictionary, used to - * pre-size the hash table + * pre-size the hash table. This argument can be a bit larger than the actual word count, + * but not smaller. 
*/ Builder(int wordCount, boolean hasCustomMorphData, FlagEnumerator flagEnumerator) { + this.wordCount = wordCount; this.flagEnumerator = flagEnumerator; this.hasCustomMorphData = hasCustomMorphData; hashTable = new int[wordCount]; wordData = new byte[wordCount * 6]; - dataWriter = new ByteArrayDataOutput(wordData); + dataWriter = + new ByteArrayDataOutput(wordData) { + @Override + public void writeByte(byte b) { + int pos = getPosition(); + if (pos == wordData.length) { + wordData = ArrayUtil.grow(wordData); + reset(wordData, pos, wordData.length - pos); + } + super.writeByte(b); + } + }; dataWriter.writeByte((byte) 0); // zero index is root, contains nothing chainLengths = new int[hashTable.length]; } @@ -266,8 +282,11 @@ void add(String entry, char[] flags, int morphDataID) throws IOException { } private int flushGroup() throws IOException { - IntsRefBuilder currentOrds = new IntsRefBuilder(); + if (++actualWords > wordCount) { + throw new RuntimeException("Don't add more words than wordCount!"); + } + currentOrds.clear(); boolean hasNonHidden = false; for (char[] flags : group) { if (!hasHiddenFlag(flags)) { @@ -292,7 +311,6 @@ private int flushGroup() throws IOException { int lastPos = commonPrefixPos; for (int i = commonPrefixLength; i < currentEntry.length() - 1; i++) { int pos = dataWriter.getPosition(); - ensureArraySize(0, false); dataWriter.writeVInt(currentEntry.charAt(i)); dataWriter.writeVInt(pos - lastPos); lastPos = pos; @@ -309,7 +327,6 @@ private int flushGroup() throws IOException { } // write the leaf entry for the last character - ensureArraySize(currentOrds.length(), collision != 0); dataWriter.writeVInt(currentEntry.charAt(currentEntry.length() - 1)); dataWriter.writeVInt(pos - lastPos); IntSequenceOutputs.getSingleton().write(currentOrds.get(), dataWriter); @@ -323,15 +340,6 @@ private int flushGroup() throws IOException { return pos; } - private void ensureArraySize(int valueLength, boolean hasCollision) { - int pos = 
dataWriter.getPosition(); - int maxEntrySize = 8 + 4 * (valueLength + 1) + (hasCollision ? 5 : 0); - while (wordData.length < pos + maxEntrySize) { - wordData = ArrayUtil.grow(wordData); - dataWriter.reset(wordData, pos, wordData.length - pos); - } - } - private static boolean hasHiddenFlag(char[] flags) { for (char flag : flags) { if (flag == Dictionary.HIDDEN_FLAG) { @@ -342,6 +350,7 @@ private static boolean hasHiddenFlag(char[] flags) { } WordStorage build() throws IOException { + assert !group.isEmpty() : "build() should be only called once"; flushGroup(); return new WordStorage( hashTable, ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition()));