LUCENE-10393: Unify binary dictionary and dictionary writer in kuromoji and nori #740

Merged: 23 commits, merged on Mar 25, 2022
Changes from all commits (23 commits):
1e2864e
factor out binary dictionary writer/reader for kuromoji
mocobeta Mar 9, 2022
3c0562f
apply to nori.
mocobeta Mar 10, 2022
d220198
lint
mocobeta Mar 10, 2022
1863e25
add javadocs
mocobeta Mar 10, 2022
8323180
rename MorphAttributes to MorphData
mocobeta Mar 12, 2022
b53cd3c
minor refactor on dictionary writer
mocobeta Mar 12, 2022
15e80de
lint
mocobeta Mar 12, 2022
4f4a0f2
ConnectionCosts and ConnectionCostsWriter can be shared in a similar …
mocobeta Mar 14, 2022
7d43514
add write(Path) abstract method to BinaryDictionaryWriter
mocobeta Mar 17, 2022
49bf2a5
CharacterDefinition/CharacterDefinitionWriter can be shared in a simi…
mocobeta Mar 17, 2022
8d0645e
small refactoring on codec header constants
mocobeta Mar 17, 2022
6502d63
remove obsolete comments
mocobeta Mar 17, 2022
b1a0033
reduce visibility of XXXMorphData; and they can be final
mocobeta Mar 17, 2022
5b23148
add javadocs
mocobeta Mar 17, 2022
34cfa68
reduce visibility of DictionaryEntryWriters
mocobeta Mar 17, 2022
38481bc
add documentation about ipadic format
mocobeta Mar 17, 2022
4387d3f
putEntry() also can be protected
mocobeta Mar 17, 2022
67ed016
merge .util package to .dict
mocobeta Mar 19, 2022
6003fae
Revert "merge .util package to .dict"
mocobeta Mar 19, 2022
c069ecb
add module tests for kuromoji and nori
mocobeta Mar 22, 2022
02794d9
add changes entry.
mocobeta Mar 23, 2022
5b09688
Merge branch 'main' into jira/lucene-10393-refine-dictionary-api
mocobeta Mar 23, 2022
a13bc51
Revert "add module tests for kuromoji and nori"
mocobeta Mar 25, 2022

Files changed:
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
@@ -49,6 +49,9 @@ Other
* LUCENE-10253: The @BadApple annotation has been removed from the test
framework. (Adrien Grand)

* LUCENE-10393: Unify binary dictionary and dictionary writer in Kuromoji and Nori.
(Tomoko Uchida, Robert Muir)

======================= Lucene 9.2.0 =======================
API Changes
---------------------
1 change: 1 addition & 0 deletions lucene/analysis/common/src/java/module-info.java
@@ -60,6 +60,7 @@
exports org.apache.lucene.analysis.lv;
exports org.apache.lucene.analysis.minhash;
exports org.apache.lucene.analysis.miscellaneous;
exports org.apache.lucene.analysis.morph;
exports org.apache.lucene.analysis.ne;
exports org.apache.lucene.analysis.ngram;
exports org.apache.lucene.analysis.nl;
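The export above is what makes the new shared org.apache.lucene.analysis.morph package visible to the kuromoji and nori modules. As a point of reference, a sketch of a consuming module descriptor is shown below; the module name org.example.morph.consumer is made up, and the Lucene module names are the standard 9.x ones, not something introduced by this PR.

// Hypothetical consumer module descriptor (not part of this patch). Any module that
// wants to use the newly exported org.apache.lucene.analysis.morph package needs to
// require the analysis-common module that exports it (plus core, whose types such as
// IntsRef and IOSupplier appear in the morph API signatures).
module org.example.morph.consumer {
  requires org.apache.lucene.core;             // CodecUtil, DataInput, IntsRef, IOSupplier, ...
  requires org.apache.lucene.analysis.common;  // exports org.apache.lucene.analysis.morph
}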
100 changes: 100 additions & 0 deletions lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionary.java (new file)
@@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.morph;

import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.IntsRef;

/** Abstract dictionary base class. */
public abstract class BinaryDictionary<T extends MorphData> implements Dictionary<T> {
  public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
  public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
  public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";

  private final int[] targetMapOffsets, targetMap;
  protected final ByteBuffer buffer;

  protected BinaryDictionary(
      IOSupplier<InputStream> targetMapResource,
      IOSupplier<InputStream> dictResource,
      String targetMapCodecHeader,
      String dictCodecHeader,
      int dictCodecVersion)
      throws IOException {
    try (InputStream mapIS = new BufferedInputStream(targetMapResource.get())) {
      final DataInput in = new InputStreamDataInput(mapIS);
      CodecUtil.checkHeader(in, targetMapCodecHeader, dictCodecVersion, dictCodecVersion);
      this.targetMap = new int[in.readVInt()];
      this.targetMapOffsets = new int[in.readVInt()];
      populateTargetMap(in, this.targetMap, this.targetMapOffsets);
    }

    // no buffering here, as we load in one large buffer
    try (InputStream dictIS = dictResource.get()) {
      final DataInput in = new InputStreamDataInput(dictIS);
      CodecUtil.checkHeader(in, dictCodecHeader, dictCodecVersion, dictCodecVersion);
      final int size = in.readVInt();
      final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
      final ReadableByteChannel channel = Channels.newChannel(dictIS);
      final int read = channel.read(tmpBuffer);
      if (read != size) {
        throw new EOFException("Cannot read whole dictionary");
      }
      this.buffer = tmpBuffer.asReadOnlyBuffer();
    }
  }

  private static void populateTargetMap(DataInput in, int[] targetMap, int[] targetMapOffsets)
      throws IOException {
    int accum = 0, sourceId = 0;
    for (int ofs = 0; ofs < targetMap.length; ofs++) {
      final int val = in.readVInt();
      if ((val & 0x01) != 0) {
        targetMapOffsets[sourceId] = ofs;
        sourceId++;
      }
      accum += val >>> 1;
      targetMap[ofs] = accum;
    }
    if (sourceId + 1 != targetMapOffsets.length)
      throw new IOException(
          "targetMap file format broken; targetMap.length="
              + targetMap.length
              + ", targetMapOffsets.length="
              + targetMapOffsets.length
              + ", sourceId="
              + sourceId);
    targetMapOffsets[sourceId] = targetMap.length;
  }

  public void lookupWordIds(int sourceId, IntsRef ref) {
    ref.ints = targetMap;
    ref.offset = targetMapOffsets[sourceId];
    // targetMapOffsets always has one more entry pointing behind last:
    ref.length = targetMapOffsets[sourceId + 1] - ref.offset;
  }
}
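To make the targetMap layout concrete, here is a small standalone demo (not part of the patch; the class name and data are made up) that mimics what lookupWordIds() does: targetMapOffsets holds one start offset per source id plus a trailing sentinel, and the word ids for a source id are the contiguous slice of the flat targetMap array between two consecutive offsets.

// Standalone illustration with made-up data: slicing the flat targetMap array
// per source id, exactly as lookupWordIds() fills an IntsRef.
import org.apache.lucene.util.IntsRef;

public class TargetMapSliceDemo {
  public static void main(String[] args) {
    // word ids for three source ids, concatenated into one flat array
    int[] targetMap = {10, 11, 20, 30, 31, 32};
    // one start offset per source id, plus a final sentinel pointing behind the last word id
    int[] targetMapOffsets = {0, 2, 3, 6};

    for (int sourceId = 0; sourceId < targetMapOffsets.length - 1; sourceId++) {
      IntsRef ref = new IntsRef();
      ref.ints = targetMap;
      ref.offset = targetMapOffsets[sourceId];
      ref.length = targetMapOffsets[sourceId + 1] - ref.offset;
      System.out.println("sourceId=" + sourceId + " -> " + ref);
    }
  }
}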
148 changes: 148 additions & 0 deletions lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionaryWriter.java (new file)
@@ -0,0 +1,148 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.morph;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.ArrayUtil;

/** Abstract base dictionary writer class. */
public abstract class BinaryDictionaryWriter<T extends BinaryDictionary<? extends MorphData>> {
  private final Class<T> implClazz;
  private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
  private int[] targetMap = new int[8192];
  private int[] targetMapOffsets = new int[8192];
  protected final DictionaryEntryWriter entryWriter;

  protected BinaryDictionaryWriter(Class<T> implClazz, DictionaryEntryWriter entryWriter) {
    this.implClazz = implClazz;
    this.entryWriter = entryWriter;
  }

  /**
   * Puts the entry in the map.
   *
   * @return current position of the buffer, which will be the wordId of the next entry
   */
  public int put(String[] entry) {
    return entryWriter.putEntry(entry);
  }

  /**
   * Writes the whole dictionary into a directory.
   *
   * @throws IOException if an I/O error occurs writing the dictionary files
   */
  public abstract void write(Path baseDir) throws IOException;

  protected void addMapping(int sourceId, int wordId) {
    if (wordId <= lastWordId) {
      throw new IllegalStateException(
          "words out of order: " + wordId + " vs lastID: " + lastWordId);
    }

    if (sourceId > lastSourceId) {
      targetMapOffsets = ArrayUtil.grow(targetMapOffsets, sourceId + 1);
      for (int i = lastSourceId + 1; i <= sourceId; i++) {
        targetMapOffsets[i] = targetMapEndOffset;
      }
    } else if (sourceId != lastSourceId) {
      throw new IllegalStateException(
          "source ids not in increasing order: lastSourceId="
              + lastSourceId
              + " vs sourceId="
              + sourceId);
    }

    targetMap = ArrayUtil.grow(targetMap, targetMapEndOffset + 1);
    targetMap[targetMapEndOffset] = wordId;
    targetMapEndOffset++;

    lastSourceId = sourceId;
    lastWordId = wordId;
  }

  /**
   * Writes the dictionary files. The dictionary format is: [size of dictionary (int)], [entry:
   * {left id (short)}{right id (short)}{word cost (short)}{length of pos info (short)}{pos info
   * (char)}], [entry...], [entry...]...
   *
   * @throws IOException if an I/O error occurs writing the dictionary files
   */
  protected void write(
      Path baseDir,
      String targetMapCodecHeader,
      String posDictCodecHeader,
      String dictCodecHeader,
      int dictCodecVersion)
      throws IOException {
    final String baseName = getBaseFileName();
    entryWriter.writeDictionary(
        baseDir.resolve(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX),
        dictCodecHeader,
        dictCodecVersion);
    entryWriter.writePosDict(
        baseDir.resolve(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX),
        posDictCodecHeader,
        dictCodecVersion);
    writeTargetMap(
        baseDir.resolve(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX),
        targetMapCodecHeader,
        dictCodecVersion);
  }

  protected final String getBaseFileName() {
    return implClazz.getName().replace('.', '/');
  }

  // TODO: maybe this int[] should instead be the output to the FST...
  private void writeTargetMap(Path path, String targetMapCodecHeader, int dictCodecVersion)
      throws IOException {
    Files.createDirectories(path.getParent());
    try (OutputStream os = Files.newOutputStream(path);
        OutputStream bos = new BufferedOutputStream(os)) {
      final DataOutput out = new OutputStreamDataOutput(bos);
      CodecUtil.writeHeader(out, targetMapCodecHeader, dictCodecVersion);

      final int numSourceIds = lastSourceId + 1;
      out.writeVInt(targetMapEndOffset); // <-- size of main array
      out.writeVInt(numSourceIds + 1); // <-- size of offset array (+ 1 more entry)
      int prev = 0, sourceId = 0;
      for (int ofs = 0; ofs < targetMapEndOffset; ofs++) {
        final int val = targetMap[ofs], delta = val - prev;
        assert delta >= 0;
        if (ofs == targetMapOffsets[sourceId]) {
          out.writeVInt((delta << 1) | 0x01);
          sourceId++;
        } else {
          out.writeVInt((delta << 1));
        }
        prev += delta;
      }
      if (sourceId != numSourceIds) {
        throw new IllegalStateException(
            "sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
      }
    }
  }
}
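The delta-plus-flag encoding that writeTargetMap() produces and BinaryDictionary.populateTargetMap() decodes is easier to follow with concrete numbers. The sketch below (not part of the patch; the class name and data are made up) reproduces the encoding loop without any I/O: each entry is the delta to the previous word id shifted left one bit, with the low bit set on the first entry of each source id.

// Standalone illustration with made-up data: the values that writeTargetMap()
// would emit as vints for a small targetMap, mirroring its encoding loop.
public class TargetMapEncodingDemo {
  public static void main(String[] args) {
    int[] targetMap = {10, 11, 20, 30, 31, 32};
    // start offset per source id; the trailing sentinel the reader adds is omitted here,
    // so we bounds-check sourceId below (the writer's array is always large enough)
    int[] targetMapOffsets = {0, 2, 3};

    int prev = 0, sourceId = 0;
    for (int ofs = 0; ofs < targetMap.length; ofs++) {
      int delta = targetMap[ofs] - prev;
      boolean newSource =
          sourceId < targetMapOffsets.length && ofs == targetMapOffsets[sourceId];
      int encoded = (delta << 1) | (newSource ? 0x01 : 0);
      if (newSource) sourceId++;
      System.out.println("ofs=" + ofs + " wordId=" + targetMap[ofs] + " -> vint " + encoded);
      prev = targetMap[ofs];
    }
    // prints 21, 2, 19, 21, 2, 2, i.e. (10<<1)|1, 1<<1, (9<<1)|1, (10<<1)|1, 1<<1, 1<<1
  }
}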
73 changes: 73 additions & 0 deletions lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/CharacterDefinition.java (new file)
@@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.morph;

import java.io.IOException;
import java.io.InputStream;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOSupplier;

/** Character category data. */
public abstract class CharacterDefinition {

  public static final String FILENAME_SUFFIX = ".dat";

  protected final byte[] characterCategoryMap = new byte[0x10000];
  private final boolean[] invokeMap;
  private final boolean[] groupMap;

  protected CharacterDefinition(
      IOSupplier<InputStream> charDefResource,
      String charDefCodecHeader,
      int charDefCodecVersion,
      int classCount)
      throws IOException {
    try (InputStream is = charDefResource.get()) {
      final DataInput in = new InputStreamDataInput(is);
      CodecUtil.checkHeader(in, charDefCodecHeader, charDefCodecVersion, charDefCodecVersion);
      in.readBytes(characterCategoryMap, 0, characterCategoryMap.length);
      this.invokeMap = new boolean[classCount];
      this.groupMap = new boolean[classCount];
      for (int i = 0; i < classCount; i++) {
        final byte b = in.readByte();
        invokeMap[i] = (b & 0x01) != 0;
        groupMap[i] = (b & 0x02) != 0;
      }
    }
  }

  public byte getCharacterClass(char c) {
    return characterCategoryMap[c];
  }

  public boolean isInvoke(char c) {
    return invokeMap[characterCategoryMap[c]];
  }

  public boolean isGroup(char c) {
    return groupMap[characterCategoryMap[c]];
  }

  /** Functional interface to look up a character class. */
  @FunctionalInterface
  public interface LookupCharacterClass {
    /** Looks up the character class for the given class name. */
    byte lookupCharacterClass(String characterClassName);
  }
}
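To illustrate the extension point this class provides, here is a hypothetical subclass (not part of the patch): the class name, resource name, codec header, version, and class count are all placeholders, not the actual values used by kuromoji or nori.

// Hypothetical subclass (not part of this patch): wires a bundled resource into the
// shared base class. All names and constants below are illustrative placeholders.
package org.example.morph;

import java.io.IOException;
import org.apache.lucene.analysis.morph.CharacterDefinition;

public final class DemoCharacterDefinition extends CharacterDefinition {
  private static final String HEADER = "demoCharDef"; // placeholder codec header
  private static final int VERSION = 1;               // placeholder codec version
  private static final int CLASS_COUNT = 12;          // placeholder number of character classes

  public DemoCharacterDefinition() throws IOException {
    // The base class reads the 64K character-category map plus one packed
    // invoke/group byte per class from this stream.
    super(
        () ->
            DemoCharacterDefinition.class.getResourceAsStream(
                DemoCharacterDefinition.class.getSimpleName() + FILENAME_SUFFIX),
        HEADER,
        VERSION,
        CLASS_COUNT);
  }
}

The resulting instance answers getCharacterClass, isInvoke, and isGroup for any char, since characterCategoryMap covers the full 16-bit char range.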