-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
LUCENE-10393: Unify binary dictionary and dictionary writer in kuromoji and nori (#740)
- Loading branch information
Showing
62 changed files
with
1,821 additions
and
1,492 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
100 changes: 100 additions & 0 deletions
100
lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionary.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.morph; | ||
|
||
import java.io.BufferedInputStream; | ||
import java.io.EOFException; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.nio.ByteBuffer; | ||
import java.nio.channels.Channels; | ||
import java.nio.channels.ReadableByteChannel; | ||
import org.apache.lucene.codecs.CodecUtil; | ||
import org.apache.lucene.store.DataInput; | ||
import org.apache.lucene.store.InputStreamDataInput; | ||
import org.apache.lucene.util.IOSupplier; | ||
import org.apache.lucene.util.IntsRef; | ||
|
||
/** Abstract dictionary base class. */ | ||
public abstract class BinaryDictionary<T extends MorphData> implements Dictionary<T> { | ||
public static final String DICT_FILENAME_SUFFIX = "$buffer.dat"; | ||
public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat"; | ||
public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat"; | ||
|
||
private final int[] targetMapOffsets, targetMap; | ||
protected final ByteBuffer buffer; | ||
|
||
protected BinaryDictionary( | ||
IOSupplier<InputStream> targetMapResource, | ||
IOSupplier<InputStream> dictResource, | ||
String targetMapCodecHeader, | ||
String dictCodecHeader, | ||
int dictCodecVersion) | ||
throws IOException { | ||
try (InputStream mapIS = new BufferedInputStream(targetMapResource.get())) { | ||
final DataInput in = new InputStreamDataInput(mapIS); | ||
CodecUtil.checkHeader(in, targetMapCodecHeader, dictCodecVersion, dictCodecVersion); | ||
this.targetMap = new int[in.readVInt()]; | ||
this.targetMapOffsets = new int[in.readVInt()]; | ||
populateTargetMap(in, this.targetMap, this.targetMapOffsets); | ||
} | ||
|
||
// no buffering here, as we load in one large buffer | ||
try (InputStream dictIS = dictResource.get()) { | ||
final DataInput in = new InputStreamDataInput(dictIS); | ||
CodecUtil.checkHeader(in, dictCodecHeader, dictCodecVersion, dictCodecVersion); | ||
final int size = in.readVInt(); | ||
final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size); | ||
final ReadableByteChannel channel = Channels.newChannel(dictIS); | ||
final int read = channel.read(tmpBuffer); | ||
if (read != size) { | ||
throw new EOFException("Cannot read whole dictionary"); | ||
} | ||
this.buffer = tmpBuffer.asReadOnlyBuffer(); | ||
} | ||
} | ||
|
||
private static void populateTargetMap(DataInput in, int[] targetMap, int[] targetMapOffsets) | ||
throws IOException { | ||
int accum = 0, sourceId = 0; | ||
for (int ofs = 0; ofs < targetMap.length; ofs++) { | ||
final int val = in.readVInt(); | ||
if ((val & 0x01) != 0) { | ||
targetMapOffsets[sourceId] = ofs; | ||
sourceId++; | ||
} | ||
accum += val >>> 1; | ||
targetMap[ofs] = accum; | ||
} | ||
if (sourceId + 1 != targetMapOffsets.length) | ||
throw new IOException( | ||
"targetMap file format broken; targetMap.length=" | ||
+ targetMap.length | ||
+ ", targetMapOffsets.length=" | ||
+ targetMapOffsets.length | ||
+ ", sourceId=" | ||
+ sourceId); | ||
targetMapOffsets[sourceId] = targetMap.length; | ||
} | ||
|
||
public void lookupWordIds(int sourceId, IntsRef ref) { | ||
ref.ints = targetMap; | ||
ref.offset = targetMapOffsets[sourceId]; | ||
// targetMapOffsets always has one more entry pointing behind last: | ||
ref.length = targetMapOffsets[sourceId + 1] - ref.offset; | ||
} | ||
} |
148 changes: 148 additions & 0 deletions
148
lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionaryWriter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.morph; | ||
|
||
import java.io.BufferedOutputStream; | ||
import java.io.IOException; | ||
import java.io.OutputStream; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import org.apache.lucene.codecs.CodecUtil; | ||
import org.apache.lucene.store.DataOutput; | ||
import org.apache.lucene.store.OutputStreamDataOutput; | ||
import org.apache.lucene.util.ArrayUtil; | ||
|
||
/** Abstract base dictionary writer class. */ | ||
public abstract class BinaryDictionaryWriter<T extends BinaryDictionary<? extends MorphData>> { | ||
private final Class<T> implClazz; | ||
private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1; | ||
private int[] targetMap = new int[8192]; | ||
private int[] targetMapOffsets = new int[8192]; | ||
protected final DictionaryEntryWriter entryWriter; | ||
|
||
protected BinaryDictionaryWriter(Class<T> implClazz, DictionaryEntryWriter entryWriter) { | ||
this.implClazz = implClazz; | ||
this.entryWriter = entryWriter; | ||
} | ||
|
||
/** | ||
* put the entry in map | ||
* | ||
* @return current position of buffer, which will be wordId of next entry | ||
*/ | ||
public int put(String[] entry) { | ||
return entryWriter.putEntry(entry); | ||
} | ||
|
||
/** | ||
* Write whole dictionary in a directory. | ||
* | ||
* @throws IOException if an I/O error occurs writing the dictionary files | ||
*/ | ||
public abstract void write(Path baseDir) throws IOException; | ||
|
||
protected void addMapping(int sourceId, int wordId) { | ||
if (wordId <= lastWordId) { | ||
throw new IllegalStateException( | ||
"words out of order: " + wordId + " vs lastID: " + lastWordId); | ||
} | ||
|
||
if (sourceId > lastSourceId) { | ||
targetMapOffsets = ArrayUtil.grow(targetMapOffsets, sourceId + 1); | ||
for (int i = lastSourceId + 1; i <= sourceId; i++) { | ||
targetMapOffsets[i] = targetMapEndOffset; | ||
} | ||
} else if (sourceId != lastSourceId) { | ||
throw new IllegalStateException( | ||
"source ids not in increasing order: lastSourceId=" | ||
+ lastSourceId | ||
+ " vs sourceId=" | ||
+ sourceId); | ||
} | ||
|
||
targetMap = ArrayUtil.grow(targetMap, targetMapEndOffset + 1); | ||
targetMap[targetMapEndOffset] = wordId; | ||
targetMapEndOffset++; | ||
|
||
lastSourceId = sourceId; | ||
lastWordId = wordId; | ||
} | ||
|
||
/** | ||
* Write dictionary in file Dictionary format is: [Size of dictionary(int)], [entry:{left | ||
* id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], | ||
* [entry...], [entry...]..... | ||
* | ||
* @throws IOException if an I/O error occurs writing the dictionary files | ||
*/ | ||
protected void write( | ||
Path baseDir, | ||
String targetMapCodecHeader, | ||
String posDictCodecHeader, | ||
String dictCodecHeader, | ||
int dictCodecVersion) | ||
throws IOException { | ||
final String baseName = getBaseFileName(); | ||
entryWriter.writeDictionary( | ||
baseDir.resolve(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX), | ||
dictCodecHeader, | ||
dictCodecVersion); | ||
entryWriter.writePosDict( | ||
baseDir.resolve(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX), | ||
posDictCodecHeader, | ||
dictCodecVersion); | ||
writeTargetMap( | ||
baseDir.resolve(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX), | ||
targetMapCodecHeader, | ||
dictCodecVersion); | ||
} | ||
|
||
protected final String getBaseFileName() { | ||
return implClazz.getName().replace('.', '/'); | ||
} | ||
|
||
// TODO: maybe this int[] should instead be the output to the FST... | ||
private void writeTargetMap(Path path, String targetMapCodecHeader, int dictCodecVersion) | ||
throws IOException { | ||
Files.createDirectories(path.getParent()); | ||
try (OutputStream os = Files.newOutputStream(path); | ||
OutputStream bos = new BufferedOutputStream(os)) { | ||
final DataOutput out = new OutputStreamDataOutput(bos); | ||
CodecUtil.writeHeader(out, targetMapCodecHeader, dictCodecVersion); | ||
|
||
final int numSourceIds = lastSourceId + 1; | ||
out.writeVInt(targetMapEndOffset); // <-- size of main array | ||
out.writeVInt(numSourceIds + 1); // <-- size of offset array (+ 1 more entry) | ||
int prev = 0, sourceId = 0; | ||
for (int ofs = 0; ofs < targetMapEndOffset; ofs++) { | ||
final int val = targetMap[ofs], delta = val - prev; | ||
assert delta >= 0; | ||
if (ofs == targetMapOffsets[sourceId]) { | ||
out.writeVInt((delta << 1) | 0x01); | ||
sourceId++; | ||
} else { | ||
out.writeVInt((delta << 1)); | ||
} | ||
prev += delta; | ||
} | ||
if (sourceId != numSourceIds) { | ||
throw new IllegalStateException( | ||
"sourceId:" + sourceId + " != numSourceIds:" + numSourceIds); | ||
} | ||
} | ||
} | ||
} |
73 changes: 73 additions & 0 deletions
73
lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/CharacterDefinition.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.morph; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import org.apache.lucene.codecs.CodecUtil; | ||
import org.apache.lucene.store.DataInput; | ||
import org.apache.lucene.store.InputStreamDataInput; | ||
import org.apache.lucene.util.IOSupplier; | ||
|
||
/** Character category data. */ | ||
public abstract class CharacterDefinition { | ||
|
||
public static final String FILENAME_SUFFIX = ".dat"; | ||
|
||
protected final byte[] characterCategoryMap = new byte[0x10000]; | ||
private final boolean[] invokeMap; | ||
private final boolean[] groupMap; | ||
|
||
protected CharacterDefinition( | ||
IOSupplier<InputStream> charDefResource, | ||
String charDefCodecHeader, | ||
int charDefCodecVersion, | ||
int classCount) | ||
throws IOException { | ||
try (InputStream is = charDefResource.get()) { | ||
final DataInput in = new InputStreamDataInput(is); | ||
CodecUtil.checkHeader(in, charDefCodecHeader, charDefCodecVersion, charDefCodecVersion); | ||
in.readBytes(characterCategoryMap, 0, characterCategoryMap.length); | ||
this.invokeMap = new boolean[classCount]; | ||
this.groupMap = new boolean[classCount]; | ||
for (int i = 0; i < classCount; i++) { | ||
final byte b = in.readByte(); | ||
invokeMap[i] = (b & 0x01) != 0; | ||
groupMap[i] = (b & 0x02) != 0; | ||
} | ||
} | ||
} | ||
|
||
public byte getCharacterClass(char c) { | ||
return characterCategoryMap[c]; | ||
} | ||
|
||
public boolean isInvoke(char c) { | ||
return invokeMap[characterCategoryMap[c]]; | ||
} | ||
|
||
public boolean isGroup(char c) { | ||
return groupMap[characterCategoryMap[c]]; | ||
} | ||
|
||
/** Functional interface to lookup character class */ | ||
@FunctionalInterface | ||
public interface LookupCharacterClass { | ||
/** looks up character class for given class name */ | ||
byte lookupCharacterClass(String characterClassName); | ||
} | ||
} |
Oops, something went wrong.