Skip to content

Commit

Permalink
LUCENE-10393: Unify binary dictionary and dictionary writer in kuromo…
Browse files Browse the repository at this point in the history
…ji and nori (#740)
  • Loading branch information
mocobeta authored Mar 25, 2022
1 parent b3906e9 commit bd22f19
Show file tree
Hide file tree
Showing 62 changed files with 1,821 additions and 1,492 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ Other
* LUCENE-10253: The @BadApple annotation has been removed from the test
framework. (Adrien Grand)

* LUCENE-10393: Unify binary dictionary and dictionary writer in Kuromoji and Nori.
(Tomoko Uchida, Robert Muir)

======================= Lucene 9.2.0 =======================
API Changes
---------------------
Expand Down
1 change: 1 addition & 0 deletions lucene/analysis/common/src/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
exports org.apache.lucene.analysis.lv;
exports org.apache.lucene.analysis.minhash;
exports org.apache.lucene.analysis.miscellaneous;
exports org.apache.lucene.analysis.morph;
exports org.apache.lucene.analysis.ne;
exports org.apache.lucene.analysis.ngram;
exports org.apache.lucene.analysis.nl;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.morph;

import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.IntsRef;

/**
 * Abstract dictionary base class.
 *
 * <p>Loads two binary resources at construction time: a "target map" that associates each source
 * id with a run of word ids, and a dictionary buffer that subclasses decode through {@link
 * #buffer}.
 */
public abstract class BinaryDictionary<T extends MorphData> implements Dictionary<T> {
  public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
  public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
  public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";

  // targetMap holds all word ids back to back; targetMapOffsets[sourceId] is the index of the
  // first word id for that source, with one extra trailing entry equal to targetMap.length.
  private final int[] targetMapOffsets, targetMap;
  protected final ByteBuffer buffer;

  /**
   * Reads the target map and the dictionary buffer from the given resources.
   *
   * @param targetMapResource supplier of the target map stream
   * @param dictResource supplier of the dictionary buffer stream
   * @param targetMapCodecHeader codec name expected in the target map header
   * @param dictCodecHeader codec name expected in the dictionary header
   * @param dictCodecVersion the only codec version accepted for both files
   * @throws EOFException if the dictionary stream ends before the declared number of bytes
   * @throws IOException if a resource cannot be read or the target map is malformed
   */
  protected BinaryDictionary(
      IOSupplier<InputStream> targetMapResource,
      IOSupplier<InputStream> dictResource,
      String targetMapCodecHeader,
      String dictCodecHeader,
      int dictCodecVersion)
      throws IOException {
    try (InputStream mapIS = new BufferedInputStream(targetMapResource.get())) {
      final DataInput in = new InputStreamDataInput(mapIS);
      CodecUtil.checkHeader(in, targetMapCodecHeader, dictCodecVersion, dictCodecVersion);
      this.targetMap = new int[in.readVInt()];
      this.targetMapOffsets = new int[in.readVInt()];
      populateTargetMap(in, this.targetMap, this.targetMapOffsets);
    }

    // no buffering here, as we load in one large buffer
    try (InputStream dictIS = dictResource.get()) {
      final DataInput in = new InputStreamDataInput(dictIS);
      CodecUtil.checkHeader(in, dictCodecHeader, dictCodecVersion, dictCodecVersion);
      final int size = in.readVInt();
      final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
      final ReadableByteChannel channel = Channels.newChannel(dictIS);
      // A single read() is allowed to return fewer bytes than the buffer has room for, so keep
      // reading until the buffer is full; only a real end-of-stream (-1) means truncation.
      while (tmpBuffer.hasRemaining()) {
        if (channel.read(tmpBuffer) < 0) {
          throw new EOFException("Cannot read whole dictionary");
        }
      }
      this.buffer = tmpBuffer.asReadOnlyBuffer();
    }
  }

  /**
   * Decodes the delta-encoded target map.
   *
   * <p>Each stored vInt carries the word-id delta in its upper bits; the lowest bit marks the
   * first word id of a new source id.
   *
   * @throws IOException if the number of source ids found does not match the declared size of
   *     {@code targetMapOffsets}
   */
  private static void populateTargetMap(DataInput in, int[] targetMap, int[] targetMapOffsets)
      throws IOException {
    int accum = 0, sourceId = 0;
    for (int ofs = 0; ofs < targetMap.length; ofs++) {
      final int val = in.readVInt();
      if ((val & 0x01) != 0) {
        // low bit set: this position starts the word ids of the next source id
        targetMapOffsets[sourceId] = ofs;
        sourceId++;
      }
      accum += val >>> 1;
      targetMap[ofs] = accum;
    }
    if (sourceId + 1 != targetMapOffsets.length) {
      throw new IOException(
          "targetMap file format broken; targetMap.length="
              + targetMap.length
              + ", targetMapOffsets.length="
              + targetMapOffsets.length
              + ", sourceId="
              + sourceId);
    }
    // sentinel entry so that the length of the last run can be computed like any other
    targetMapOffsets[sourceId] = targetMap.length;
  }

  /**
   * Points {@code ref} at the word ids mapped to {@code sourceId}.
   *
   * <p>No copy is made: {@code ref.ints} is set to the internal target map array, with {@code
   * ref.offset} and {@code ref.length} delimiting the run for the given source id.
   */
  public void lookupWordIds(int sourceId, IntsRef ref) {
    ref.ints = targetMap;
    ref.offset = targetMapOffsets[sourceId];
    // targetMapOffsets always has one more entry pointing behind last:
    ref.length = targetMapOffsets[sourceId + 1] - ref.offset;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.morph;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.ArrayUtil;

/**
 * Abstract base dictionary writer class.
 *
 * <p>Accumulates dictionary entries through a {@link DictionaryEntryWriter} and a source-id to
 * word-id mapping, then writes them out as the files read back by {@link BinaryDictionary}.
 */
public abstract class BinaryDictionaryWriter<T extends BinaryDictionary<? extends MorphData>> {
  private final Class<T> implClazz;
  private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
  // word ids in insertion order; only the first targetMapEndOffset entries are valid
  private int[] targetMap = new int[8192];
  // targetMapOffsets[sourceId] = index in targetMap of the first word id for that source id
  private int[] targetMapOffsets = new int[8192];
  protected final DictionaryEntryWriter entryWriter;

  /**
   * @param implClazz dictionary implementation class; its fully qualified name determines the
   *     base path of the written files
   * @param entryWriter writer that encodes individual entries
   */
  protected BinaryDictionaryWriter(Class<T> implClazz, DictionaryEntryWriter entryWriter) {
    this.implClazz = implClazz;
    this.entryWriter = entryWriter;
  }

  /**
   * put the entry in map
   *
   * @return current position of buffer, which will be wordId of next entry
   */
  public int put(String[] entry) {
    return entryWriter.putEntry(entry);
  }

  /**
   * Write whole dictionary in a directory.
   *
   * @throws IOException if an I/O error occurs writing the dictionary files
   */
  public abstract void write(Path baseDir) throws IOException;

  /**
   * Records that {@code wordId} belongs to {@code sourceId}.
   *
   * <p>Word ids must be strictly increasing across calls and source ids must be non-decreasing.
   *
   * @throws IllegalStateException if either ordering constraint is violated
   */
  protected void addMapping(int sourceId, int wordId) {
    if (wordId <= lastWordId) {
      throw new IllegalStateException(
          "words out of order: " + wordId + " vs lastID: " + lastWordId);
    }

    if (sourceId > lastSourceId) {
      targetMapOffsets = ArrayUtil.grow(targetMapOffsets, sourceId + 1);
      // every source id from the previous one (exclusive) up to this one starts here
      for (int i = lastSourceId + 1; i <= sourceId; i++) {
        targetMapOffsets[i] = targetMapEndOffset;
      }
    } else if (sourceId != lastSourceId) {
      throw new IllegalStateException(
          "source ids not in increasing order: lastSourceId="
              + lastSourceId
              + " vs sourceId="
              + sourceId);
    }

    targetMap = ArrayUtil.grow(targetMap, targetMapEndOffset + 1);
    targetMap[targetMapEndOffset] = wordId;
    targetMapEndOffset++;

    lastSourceId = sourceId;
    lastWordId = wordId;
  }

  /**
   * Writes the dictionary buffer, the POS dictionary and the target map under {@code baseDir}.
   *
   * <p>Each file is resolved against {@code baseDir} as {@link #getBaseFileName()} followed by
   * the matching {@link BinaryDictionary} filename suffix. The first two files are produced by
   * {@link #entryWriter}; the target map is written by this class.
   *
   * <p>Original authors' note on the dictionary file layout: [Size of dictionary(int)],
   * [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos
   * info(char)}], [entry...], [entry...].....
   *
   * @throws IOException if an I/O error occurs writing the dictionary files
   */
  protected void write(
      Path baseDir,
      String targetMapCodecHeader,
      String posDictCodecHeader,
      String dictCodecHeader,
      int dictCodecVersion)
      throws IOException {
    final String baseName = getBaseFileName();
    entryWriter.writeDictionary(
        baseDir.resolve(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX),
        dictCodecHeader,
        dictCodecVersion);
    entryWriter.writePosDict(
        baseDir.resolve(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX),
        posDictCodecHeader,
        dictCodecVersion);
    writeTargetMap(
        baseDir.resolve(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX),
        targetMapCodecHeader,
        dictCodecVersion);
  }

  /** Returns the implementation class name with {@code '.'} replaced by {@code '/'}. */
  protected final String getBaseFileName() {
    return implClazz.getName().replace('.', '/');
  }

  /**
   * Writes the source-id to word-id map as delta-encoded vInts.
   *
   * <p>After the codec header come the number of word ids and the number of source ids plus one.
   * Each word id is then stored as its delta from the previous one shifted left by one bit; the
   * low bit is set on the first word id of each source id.
   *
   * @throws IllegalStateException if the number of source-id boundaries emitted differs from the
   *     number of source ids recorded
   */
  // TODO: maybe this int[] should instead be the output to the FST...
  private void writeTargetMap(Path path, String targetMapCodecHeader, int dictCodecVersion)
      throws IOException {
    Files.createDirectories(path.getParent());
    try (OutputStream os = Files.newOutputStream(path);
        OutputStream bos = new BufferedOutputStream(os)) {
      final DataOutput out = new OutputStreamDataOutput(bos);
      CodecUtil.writeHeader(out, targetMapCodecHeader, dictCodecVersion);

      final int numSourceIds = lastSourceId + 1;
      out.writeVInt(targetMapEndOffset); // <-- size of main array
      out.writeVInt(numSourceIds + 1); // <-- size of offset array (+ 1 more entry)
      int prev = 0, sourceId = 0;
      for (int ofs = 0; ofs < targetMapEndOffset; ofs++) {
        final int val = targetMap[ofs], delta = val - prev;
        assert delta >= 0;
        // Once every source id has been emitted, stop consulting targetMapOffsets: addMapping
        // only guarantees lastSourceId + 1 entries, so index numSourceIds may not exist.
        if (sourceId < numSourceIds && ofs == targetMapOffsets[sourceId]) {
          out.writeVInt((delta << 1) | 0x01);
          sourceId++;
        } else {
          out.writeVInt((delta << 1));
        }
        prev += delta;
      }
      if (sourceId != numSourceIds) {
        throw new IllegalStateException(
            "sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
      }
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.morph;

import java.io.IOException;
import java.io.InputStream;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOSupplier;

/**
 * Character category data.
 *
 * <p>Holds a category id for every UTF-16 code unit plus two per-category flags ("invoke" and
 * "group"), all loaded from a binary resource at construction time.
 */
public abstract class CharacterDefinition {

  public static final String FILENAME_SUFFIX = ".dat";

  /** Category id for each {@code char} value, indexed by the char itself. */
  protected final byte[] characterCategoryMap = new byte[0x10000];

  private final boolean[] invokeMap;
  private final boolean[] groupMap;

  /**
   * Loads the character definition from the given resource.
   *
   * <p>The stream is expected to hold a codec header, then one category byte per {@code char}
   * value, then one flag byte per category (bit 0: invoke, bit 1: group).
   *
   * @param charDefResource supplier of the character definition stream
   * @param charDefCodecHeader codec name expected in the header
   * @param charDefCodecVersion the only codec version accepted
   * @param classCount number of character categories, i.e. number of flag bytes to read
   * @throws IOException if the resource cannot be read
   */
  protected CharacterDefinition(
      IOSupplier<InputStream> charDefResource,
      String charDefCodecHeader,
      int charDefCodecVersion,
      int classCount)
      throws IOException {
    try (InputStream stream = charDefResource.get()) {
      final DataInput input = new InputStreamDataInput(stream);
      CodecUtil.checkHeader(input, charDefCodecHeader, charDefCodecVersion, charDefCodecVersion);
      input.readBytes(characterCategoryMap, 0, characterCategoryMap.length);

      final boolean[] invokeFlags = new boolean[classCount];
      final boolean[] groupFlags = new boolean[classCount];
      int classId = 0;
      while (classId < classCount) {
        final int flags = input.readByte();
        invokeFlags[classId] = (flags & 1) == 1;
        groupFlags[classId] = (flags & 2) == 2;
        classId++;
      }
      this.invokeMap = invokeFlags;
      this.groupMap = groupFlags;
    }
  }

  /** Returns the category id stored for {@code c}. */
  public byte getCharacterClass(char c) {
    return characterCategoryMap[c];
  }

  /** Returns the "invoke" flag of the category that {@code c} belongs to. */
  public boolean isInvoke(char c) {
    final byte category = characterCategoryMap[c];
    return invokeMap[category];
  }

  /** Returns the "group" flag of the category that {@code c} belongs to. */
  public boolean isGroup(char c) {
    final byte category = characterCategoryMap[c];
    return groupMap[category];
  }

  /** Functional interface to lookup character class */
  @FunctionalInterface
  public interface LookupCharacterClass {
    /** looks up character class for given class name */
    byte lookupCharacterClass(String characterClassName);
  }
}
Loading

0 comments on commit bd22f19

Please sign in to comment.