LUCENE-10393: Unify binary dictionary and dictionary writer in kuromoji and nori (#740)
apache · Mar 25, 2022 · bd22f19 · bd22f19
1 parent b3906e9
commit bd22f19
Show file tree

Hide file tree

Showing 62 changed files with 1,821 additions and 1,492 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -49,6 +49,9 @@ Other
 * LUCENE-10253: The @BadApple annotation has been removed from the test
   framework. (Adrien Grand)
 
+* LUCENE-10393: Unify binary dictionary and dictionary writer in Kuromoji and Nori.
+  (Tomoko Uchida, Robert Muir)
+
 ======================= Lucene 9.2.0 =======================
 API Changes
 ---------------------

diff --git a/lucene/analysis/common/src/java/module-info.java b/lucene/analysis/common/src/java/module-info.java
@@ -60,6 +60,7 @@
   exports org.apache.lucene.analysis.lv;
   exports org.apache.lucene.analysis.minhash;
   exports org.apache.lucene.analysis.miscellaneous;
+  exports org.apache.lucene.analysis.morph;
   exports org.apache.lucene.analysis.ne;
   exports org.apache.lucene.analysis.ngram;
   exports org.apache.lucene.analysis.nl;

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionary.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.morph;
+
+import java.io.BufferedInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.ReadableByteChannel;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.util.IOSupplier;
+import org.apache.lucene.util.IntsRef;
+
+/**
+ * Abstract dictionary base class.
+ *
+ * <p>On construction it loads two binary resources: the target map, which associates each source
+ * id with a run of word ids, and the dictionary buffer. Each resource must begin with a codec
+ * header carrying exactly the expected version.
+ *
+ * @param <T> the morphological data type this dictionary is parameterized with
+ */
+public abstract class BinaryDictionary<T extends MorphData> implements Dictionary<T> {
+  /** File name suffix of the dictionary buffer file. */
+  public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
+
+  /** File name suffix of the target map file. */
+  public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
+
+  /** File name suffix of the POS dictionary file. */
+  public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
+
+  // targetMap holds every word id; targetMapOffsets[sourceId] is the index in targetMap of the
+  // first word id of that source id. targetMapOffsets has one extra trailing entry that equals
+  // targetMap.length, so the run of a source id always ends at targetMapOffsets[sourceId + 1].
+  private final int[] targetMapOffsets, targetMap;
+
+  /** Read-only view of the bytes loaded from the dictionary resource. */
+  protected final ByteBuffer buffer;
+
+  /**
+   * Loads the target map and the dictionary buffer.
+   *
+   * @param targetMapResource supplies the stream of the target map file
+   * @param dictResource supplies the stream of the dictionary buffer file
+   * @param targetMapCodecHeader codec name expected in the target map header
+   * @param dictCodecHeader codec name expected in the dictionary buffer header
+   * @param dictCodecVersion the only codec version accepted, for both resources
+   * @throws EOFException if the dictionary resource ends before the announced number of bytes
+   * @throws IOException if a resource cannot be read, a header check fails, or the target map
+   *     data is inconsistent
+   */
+  protected BinaryDictionary(
+      IOSupplier<InputStream> targetMapResource,
+      IOSupplier<InputStream> dictResource,
+      String targetMapCodecHeader,
+      String dictCodecHeader,
+      int dictCodecVersion)
+      throws IOException {
+    try (InputStream mapIS = new BufferedInputStream(targetMapResource.get())) {
+      final DataInput in = new InputStreamDataInput(mapIS);
+      CodecUtil.checkHeader(in, targetMapCodecHeader, dictCodecVersion, dictCodecVersion);
+      // The file stores the size of the main array first, then the size of the offsets array.
+      this.targetMap = new int[in.readVInt()];
+      this.targetMapOffsets = new int[in.readVInt()];
+      populateTargetMap(in, this.targetMap, this.targetMapOffsets);
+    }
+
+    // no buffering here, as we load in one large buffer
+    try (InputStream dictIS = dictResource.get()) {
+      final DataInput in = new InputStreamDataInput(dictIS);
+      CodecUtil.checkHeader(in, dictCodecHeader, dictCodecVersion, dictCodecVersion);
+      final int size = in.readVInt();
+      final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
+      final ReadableByteChannel channel = Channels.newChannel(dictIS);
+      // A single read() may legally return fewer bytes than the buffer can hold, so keep reading
+      // until the buffer is full; only a premature end of stream means the data is truncated.
+      while (tmpBuffer.hasRemaining()) {
+        if (channel.read(tmpBuffer) < 0) {
+          throw new EOFException("Cannot read whole dictionary");
+        }
+      }
+      // The buffer is not flipped: its position stays at the end of the loaded data.
+      // NOTE(review): presumably subclasses only use absolute reads - confirm.
+      this.buffer = tmpBuffer.asReadOnlyBuffer();
+    }
+  }
+
+  /**
+   * Decodes the target map. Each stored VInt carries a delta to the previous word id in its upper
+   * bits; its lowest bit is set when the value is the first word id of the next source id.
+   */
+  private static void populateTargetMap(DataInput in, int[] targetMap, int[] targetMapOffsets)
+      throws IOException {
+    int accum = 0, sourceId = 0;
+    for (int ofs = 0; ofs < targetMap.length; ofs++) {
+      final int val = in.readVInt();
+      if ((val & 0x01) != 0) {
+        // More flagged entries than announced offsets: report the corruption as an IOException
+        // instead of running past the end of the array.
+        if (sourceId >= targetMapOffsets.length) {
+          throw brokenTargetMap(targetMap.length, targetMapOffsets.length, sourceId);
+        }
+        targetMapOffsets[sourceId] = ofs;
+        sourceId++;
+      }
+      accum += val >>> 1;
+      targetMap[ofs] = accum;
+    }
+    if (sourceId + 1 != targetMapOffsets.length) {
+      throw brokenTargetMap(targetMap.length, targetMapOffsets.length, sourceId);
+    }
+    // Sentinel entry pointing behind the last word id.
+    targetMapOffsets[sourceId] = targetMap.length;
+  }
+
+  /** Builds the exception reported when the target map sizes and flags do not agree. */
+  private static IOException brokenTargetMap(
+      int targetMapLength, int targetMapOffsetsLength, int sourceId) {
+    return new IOException(
+        "targetMap file format broken; targetMap.length="
+            + targetMapLength
+            + ", targetMapOffsets.length="
+            + targetMapOffsetsLength
+            + ", sourceId="
+            + sourceId);
+  }
+
+  /**
+   * Points {@code ref} at the word ids mapped to {@code sourceId}.
+   *
+   * <p>No copy is made: {@code ref.ints} is set to the internal target map array, so callers must
+   * not modify its contents.
+   *
+   * @param sourceId source id to look up
+   * @param ref receives the array, offset and length of the matching word ids
+   */
+  public void lookupWordIds(int sourceId, IntsRef ref) {
+    ref.ints = targetMap;
+    ref.offset = targetMapOffsets[sourceId];
+    // targetMapOffsets always has one more entry pointing behind last:
+    ref.length = targetMapOffsets[sourceId + 1] - ref.offset;
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionaryWriter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionaryWriter.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.morph;
+
+import java.io.BufferedOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.util.ArrayUtil;
+
+/**
+ * Abstract base dictionary writer class.
+ *
+ * <p>Collects (source id, word id) mappings through {@link #addMapping(int, int)} and writes
+ * them, together with the data held by the {@link DictionaryEntryWriter}, as a set of files below
+ * a base directory.
+ *
+ * @param <T> the dictionary implementation class whose name determines the output file names
+ */
+public abstract class BinaryDictionaryWriter<T extends BinaryDictionary<? extends MorphData>> {
+  private final Class<T> implClazz;
+  // targetMapEndOffset is the number of used slots in targetMap; lastWordId and lastSourceId
+  // remember the most recent mapping so that ordering can be enforced.
+  private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
+  // Word ids in insertion order; only the first targetMapEndOffset slots are valid.
+  private int[] targetMap = new int[8192];
+  // targetMapOffsets[sourceId] is the index in targetMap of the first word id of that source id.
+  private int[] targetMapOffsets = new int[8192];
+
+  /** Writer that receives the entries and emits the dictionary and POS dictionary files. */
+  protected final DictionaryEntryWriter entryWriter;
+
+  /**
+   * Creates a writer.
+   *
+   * @param implClazz dictionary implementation class; its fully qualified name is used to build
+   *     the output file names
+   * @param entryWriter writer that stores the dictionary entries
+   */
+  protected BinaryDictionaryWriter(Class<T> implClazz, DictionaryEntryWriter entryWriter) {
+    this.implClazz = implClazz;
+    this.entryWriter = entryWriter;
+  }
+
+  /**
+   * put the entry in map
+   *
+   * @param entry the fields of the entry, handed unchanged to the entry writer
+   * @return current position of buffer, which will be wordId of next entry
+   */
+  public int put(String[] entry) {
+    return entryWriter.putEntry(entry);
+  }
+
+  /**
+   * Write whole dictionary in a directory.
+   *
+   * @param baseDir directory below which the dictionary files are created
+   * @throws IOException if an I/O error occurs writing the dictionary files
+   */
+  public abstract void write(Path baseDir) throws IOException;
+
+  /**
+   * Records that {@code wordId} belongs to {@code sourceId}.
+   *
+   * <p>Word ids must be strictly increasing across calls, and source ids must never decrease.
+   *
+   * @param sourceId source id the word id is mapped to
+   * @param wordId word id to append
+   * @throws IllegalStateException if either ordering rule is violated
+   */
+  protected void addMapping(int sourceId, int wordId) {
+    if (wordId <= lastWordId) {
+      throw new IllegalStateException(
+          "words out of order: " + wordId + " vs lastID: " + lastWordId);
+    }
+
+    if (sourceId > lastSourceId) {
+      targetMapOffsets = ArrayUtil.grow(targetMapOffsets, sourceId + 1);
+      // Every source id from the previous one (exclusive) up to this one starts at the current
+      // end of the target map.
+      for (int i = lastSourceId + 1; i <= sourceId; i++) {
+        targetMapOffsets[i] = targetMapEndOffset;
+      }
+    } else if (sourceId != lastSourceId) {
+      throw new IllegalStateException(
+          "source ids not in increasing order: lastSourceId="
+              + lastSourceId
+              + " vs sourceId="
+              + sourceId);
+    }
+
+    targetMap = ArrayUtil.grow(targetMap, targetMapEndOffset + 1);
+    targetMap[targetMapEndOffset] = wordId;
+    targetMapEndOffset++;
+
+    lastSourceId = sourceId;
+    lastWordId = wordId;
+  }
+
+  /**
+   * Writes the dictionary buffer, the POS dictionary and the target map as three files below
+   * {@code baseDir}. Each file name is {@link #getBaseFileName()} followed by the matching suffix
+   * constant of {@link BinaryDictionary}.
+   *
+   * <p>Dictionary format is: [Size of dictionary(int)], [entry:{left id(short)}{right
+   * id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...],
+   * [entry...].....
+   *
+   * <p>NOTE(review): the entry layout above is produced by the {@link DictionaryEntryWriter}
+   * implementation, not by this class - confirm it there.
+   *
+   * @param baseDir directory below which the files are created
+   * @param targetMapCodecHeader codec name written to the target map header
+   * @param posDictCodecHeader codec name handed to the entry writer for the POS dictionary file
+   * @param dictCodecHeader codec name handed to the entry writer for the dictionary file
+   * @param dictCodecVersion codec version used for all three files
+   * @throws IOException if an I/O error occurs writing the dictionary files
+   */
+  protected void write(
+      Path baseDir,
+      String targetMapCodecHeader,
+      String posDictCodecHeader,
+      String dictCodecHeader,
+      int dictCodecVersion)
+      throws IOException {
+    final String baseName = getBaseFileName();
+    entryWriter.writeDictionary(
+        baseDir.resolve(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX),
+        dictCodecHeader,
+        dictCodecVersion);
+    entryWriter.writePosDict(
+        baseDir.resolve(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX),
+        posDictCodecHeader,
+        dictCodecVersion);
+    writeTargetMap(
+        baseDir.resolve(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX),
+        targetMapCodecHeader,
+        dictCodecVersion);
+  }
+
+  /**
+   * Returns the fully qualified name of the dictionary implementation class with every {@code .}
+   * replaced by {@code /}.
+   */
+  protected final String getBaseFileName() {
+    return implClazz.getName().replace('.', '/');
+  }
+
+  /**
+   * Writes the target map file: codec header, size of the main array, size of the offsets array,
+   * then one VInt per word id holding the delta to the previous word id shifted left by one. The
+   * lowest bit is set when the word id is the first one of the next source id.
+   */
+  // TODO: maybe this int[] should instead be the output to the FST...
+  private void writeTargetMap(Path path, String targetMapCodecHeader, int dictCodecVersion)
+      throws IOException {
+    Files.createDirectories(path.getParent());
+    try (OutputStream os = Files.newOutputStream(path);
+        OutputStream bos = new BufferedOutputStream(os)) {
+      final DataOutput out = new OutputStreamDataOutput(bos);
+      CodecUtil.writeHeader(out, targetMapCodecHeader, dictCodecVersion);
+
+      final int numSourceIds = lastSourceId + 1;
+      out.writeVInt(targetMapEndOffset); // <-- size of main array
+      out.writeVInt(numSourceIds + 1); // <-- size of offset array (+ 1 more entry)
+      int prev = 0, sourceId = 0;
+      for (int ofs = 0; ofs < targetMapEndOffset; ofs++) {
+        final int val = targetMap[ofs], delta = val - prev;
+        assert delta >= 0;
+        // Once every source id has been flagged there is no further start offset to compare
+        // with; the bounds check also keeps the lookup inside targetMapOffsets when that array
+        // holds exactly numSourceIds entries.
+        if (sourceId < numSourceIds && ofs == targetMapOffsets[sourceId]) {
+          out.writeVInt((delta << 1) | 0x01);
+          sourceId++;
+        } else {
+          out.writeVInt((delta << 1));
+        }
+        prev += delta;
+      }
+      // Only one source id can be flagged per word id, so source ids that share a start offset
+      // (an id that was skipped and never received a mapping) leave the counter short.
+      if (sourceId != numSourceIds) {
+        throw new IllegalStateException(
+            "sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
+      }
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/CharacterDefinition.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/CharacterDefinition.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.morph;
+
+import java.io.IOException;
+import java.io.InputStream;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.util.IOSupplier;
+
+/**
+ * Character category data.
+ *
+ * <p>Holds one category byte for each of the 0x10000 possible {@code char} values, plus an
+ * "invoke" flag and a "group" flag for every character class.
+ */
+public abstract class CharacterDefinition {
+
+  /** File name suffix of the character definition file. */
+  public static final String FILENAME_SUFFIX = ".dat";
+
+  /** Category byte of every {@code char} value, indexed by the char itself. */
+  protected final byte[] characterCategoryMap = new byte[0x10000];
+
+  private final boolean[] invokeMap;
+  private final boolean[] groupMap;
+
+  /**
+   * Reads the character definition from the given resource.
+   *
+   * @param charDefResource supplies the stream of the character definition file
+   * @param charDefCodecHeader codec name expected in the file header
+   * @param charDefCodecVersion the only codec version accepted
+   * @param classCount number of character classes, i.e. number of flag bytes to read
+   * @throws IOException if the resource cannot be read or the header check fails
+   */
+  protected CharacterDefinition(
+      IOSupplier<InputStream> charDefResource,
+      String charDefCodecHeader,
+      int charDefCodecVersion,
+      int classCount)
+      throws IOException {
+    try (InputStream stream = charDefResource.get()) {
+      final DataInput input = new InputStreamDataInput(stream);
+      CodecUtil.checkHeader(input, charDefCodecHeader, charDefCodecVersion, charDefCodecVersion);
+
+      // The category table comes first, followed by one flag byte per character class.
+      input.readBytes(characterCategoryMap, 0, characterCategoryMap.length);
+
+      final boolean[] invokeFlags = new boolean[classCount];
+      final boolean[] groupFlags = new boolean[classCount];
+      int classId = 0;
+      while (classId < classCount) {
+        final byte flags = input.readByte();
+        invokeFlags[classId] = (flags & 0x01) == 0x01; // bit 0: invoke
+        groupFlags[classId] = (flags & 0x02) == 0x02; // bit 1: group
+        classId++;
+      }
+      this.invokeMap = invokeFlags;
+      this.groupMap = groupFlags;
+    }
+  }
+
+  /** Returns the category byte stored for {@code c}. */
+  public byte getCharacterClass(char c) {
+    return characterCategoryMap[c];
+  }
+
+  /** Returns the invoke flag of the character class that {@code c} belongs to. */
+  public boolean isInvoke(char c) {
+    final byte category = characterCategoryMap[c];
+    return invokeMap[category];
+  }
+
+  /** Returns the group flag of the character class that {@code c} belongs to. */
+  public boolean isGroup(char c) {
+    final byte category = characterCategoryMap[c];
+    return groupMap[category];
+  }
+
+  /** Functional interface to lookup character class */
+  @FunctionalInterface
+  public interface LookupCharacterClass {
+    /** looks up character class for given class name */
+    byte lookupCharacterClass(String characterClassName);
+  }
+}