From bd22f199dec5ab767da7481f1ea9c1cd0195bb9f Mon Sep 17 00:00:00 2001 From: Tomoko Uchida Date: Fri, 25 Mar 2022 18:44:36 +0900 Subject: [PATCH] LUCENE-10393: Unify binary dictionary and dictionary writer in kuromoji and nori (#740) --- lucene/CHANGES.txt | 3 + .../analysis/common/src/java/module-info.java | 1 + .../analysis/morph/BinaryDictionary.java | 100 ++++++ .../morph/BinaryDictionaryWriter.java | 148 ++++++++ .../analysis/morph/CharacterDefinition.java | 73 ++++ .../morph}/CharacterDefinitionWriter.java | 52 ++- .../analysis/morph/ConnectionCosts.java | 66 ++++ .../morph}/ConnectionCostsWriter.java | 23 +- .../lucene/analysis/morph/Dictionary.java | 49 +++ .../analysis/morph/DictionaryEntryWriter.java | 79 +++++ .../lucene/analysis/morph/MorphData.java | 41 +++ .../lucene/analysis/morph/package-info.java | 19 + .../lucene/analysis/ja/GraphvizFormatter.java | 5 +- .../lucene/analysis/ja/JapaneseTokenizer.java | 55 +-- .../org/apache/lucene/analysis/ja/Token.java | 22 +- .../analysis/ja/dict/CharacterDefinition.java | 43 +-- .../analysis/ja/dict/ConnectionCosts.java | 40 +-- .../analysis/ja/dict/DictionaryConstants.java | 33 ++ .../{Dictionary.java => JaMorphData.java} | 50 +-- .../analysis/ja/dict/TokenInfoDictionary.java | 18 +- ...ictionary.java => TokenInfoMorphData.java} | 133 ++----- .../analysis/ja/dict/UnknownDictionary.java | 43 ++- .../analysis/ja/dict/UnknownMorphData.java | 44 +++ .../analysis/ja/dict/UserDictionary.java | 96 +---- .../analysis/ja/dict/UserMorphData.java | 110 ++++++ .../ja/util/BinaryDictionaryWriter.java | 334 ------------------ .../ja/util/CharacterDefinitionWriter.java | 83 ----- .../ja/util/ConnectionCostsBuilder.java | 7 +- .../analysis/ja/util/DictionaryBuilder.java | 4 +- .../util/TokenInfoDictionaryEntryWriter.java | 237 +++++++++++++ .../ja/util/TokenInfoDictionaryWriter.java | 18 +- .../ja/util/UnknownDictionaryWriter.java | 27 +- .../ja/dict/TestExternalDictionary.java | 6 +- .../ja/dict/TestTokenInfoDictionary.java | 
19 +- .../analysis/ja/dict/TestUserDictionary.java | 11 +- .../lucene/analysis/ko/DecompoundToken.java | 4 +- .../lucene/analysis/ko/DictionaryToken.java | 24 +- .../lucene/analysis/ko/GraphvizFormatter.java | 5 +- .../lucene/analysis/ko/KoreanTokenizer.java | 43 ++- .../org/apache/lucene/analysis/ko/Token.java | 9 +- .../analysis/ko/dict/BinaryDictionary.java | 220 ------------ .../analysis/ko/dict/CharacterDefinition.java | 43 +-- .../analysis/ko/dict/ConnectionCosts.java | 40 +-- .../analysis/ko/dict/DictionaryConstants.java | 33 ++ .../{Dictionary.java => KoMorphData.java} | 46 ++- .../analysis/ko/dict/TokenInfoDictionary.java | 17 +- .../analysis/ko/dict/TokenInfoMorphData.java | 155 ++++++++ .../analysis/ko/dict/UnknownDictionary.java | 32 +- .../analysis/ko/dict/UnknownMorphData.java | 39 ++ .../analysis/ko/dict/UserDictionary.java | 89 +---- .../analysis/ko/dict/UserMorphData.java | 90 +++++ .../PartOfSpeechAttribute.java | 9 +- .../PartOfSpeechAttributeImpl.java | 8 +- .../ko/util/ConnectionCostsBuilder.java | 7 +- .../ko/util/ConnectionCostsWriter.java | 68 ---- .../analysis/ko/util/DictionaryBuilder.java | 4 +- ...va => TokenInfoDictionaryEntryWriter.java} | 156 ++------ .../ko/util/TokenInfoDictionaryWriter.java | 18 +- .../ko/util/UnknownDictionaryWriter.java | 27 +- .../ko/dict/TestExternalDictionary.java | 6 +- .../ko/dict/TestTokenInfoDictionary.java | 19 +- .../analysis/ko/dict/TestUserDictionary.java | 10 +- 62 files changed, 1821 insertions(+), 1492 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionary.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionaryWriter.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/CharacterDefinition.java rename lucene/analysis/{nori/src/java/org/apache/lucene/analysis/ko/util => common/src/java/org/apache/lucene/analysis/morph}/CharacterDefinitionWriter.java (57%) 
create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/ConnectionCosts.java rename lucene/analysis/{kuromoji/src/java/org/apache/lucene/analysis/ja/util => common/src/java/org/apache/lucene/analysis/morph}/ConnectionCostsWriter.java (77%) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/Dictionary.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/DictionaryEntryWriter.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/MorphData.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/package-info.java create mode 100644 lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/DictionaryConstants.java rename lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/{Dictionary.java => JaMorphData.java} (60%) rename lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/{BinaryDictionary.java => TokenInfoMorphData.java} (56%) create mode 100644 lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownMorphData.java create mode 100644 lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserMorphData.java delete mode 100644 lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java delete mode 100644 lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/CharacterDefinitionWriter.java create mode 100644 lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryEntryWriter.java delete mode 100644 lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java create mode 100644 lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/DictionaryConstants.java rename lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/{Dictionary.java => KoMorphData.java} (52%) create mode 100644 
lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoMorphData.java create mode 100644 lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownMorphData.java create mode 100644 lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserMorphData.java delete mode 100644 lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java rename lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/{BinaryDictionaryWriter.java => TokenInfoDictionaryEntryWriter.java} (50%) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 435434aaf30b..cda9ad881cc9 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -49,6 +49,9 @@ Other * LUCENE-10253: The @BadApple annotation has been removed from the test framework. (Adrien Grand) +* LUCENE-10393: Unify binary dictionary and dictionary writer in Kuromoji and Nori. + (Tomoko Uchida, Robert Muir) + ======================= Lucene 9.2.0 ======================= API Changes --------------------- diff --git a/lucene/analysis/common/src/java/module-info.java b/lucene/analysis/common/src/java/module-info.java index ec837952e7b5..f8e7bf085546 100644 --- a/lucene/analysis/common/src/java/module-info.java +++ b/lucene/analysis/common/src/java/module-info.java @@ -60,6 +60,7 @@ exports org.apache.lucene.analysis.lv; exports org.apache.lucene.analysis.minhash; exports org.apache.lucene.analysis.miscellaneous; + exports org.apache.lucene.analysis.morph; exports org.apache.lucene.analysis.ne; exports org.apache.lucene.analysis.ngram; exports org.apache.lucene.analysis.nl; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionary.java new file mode 100644 index 000000000000..f72ed913d351 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionary.java @@ -0,0 +1,100 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.morph; + +import java.io.BufferedInputStream; +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.nio.channels.Channels; +import java.nio.channels.ReadableByteChannel; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.util.IOSupplier; +import org.apache.lucene.util.IntsRef; + +/** Abstract dictionary base class. 
*/ +public abstract class BinaryDictionary implements Dictionary { + public static final String DICT_FILENAME_SUFFIX = "$buffer.dat"; + public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat"; + public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat"; + + private final int[] targetMapOffsets, targetMap; + protected final ByteBuffer buffer; + + protected BinaryDictionary( + IOSupplier targetMapResource, + IOSupplier dictResource, + String targetMapCodecHeader, + String dictCodecHeader, + int dictCodecVersion) + throws IOException { + try (InputStream mapIS = new BufferedInputStream(targetMapResource.get())) { + final DataInput in = new InputStreamDataInput(mapIS); + CodecUtil.checkHeader(in, targetMapCodecHeader, dictCodecVersion, dictCodecVersion); + this.targetMap = new int[in.readVInt()]; + this.targetMapOffsets = new int[in.readVInt()]; + populateTargetMap(in, this.targetMap, this.targetMapOffsets); + } + + // no buffering here, as we load in one large buffer + try (InputStream dictIS = dictResource.get()) { + final DataInput in = new InputStreamDataInput(dictIS); + CodecUtil.checkHeader(in, dictCodecHeader, dictCodecVersion, dictCodecVersion); + final int size = in.readVInt(); + final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size); + final ReadableByteChannel channel = Channels.newChannel(dictIS); + final int read = channel.read(tmpBuffer); + if (read != size) { + throw new EOFException("Cannot read whole dictionary"); + } + this.buffer = tmpBuffer.asReadOnlyBuffer(); + } + } + + private static void populateTargetMap(DataInput in, int[] targetMap, int[] targetMapOffsets) + throws IOException { + int accum = 0, sourceId = 0; + for (int ofs = 0; ofs < targetMap.length; ofs++) { + final int val = in.readVInt(); + if ((val & 0x01) != 0) { + targetMapOffsets[sourceId] = ofs; + sourceId++; + } + accum += val >>> 1; + targetMap[ofs] = accum; + } + if (sourceId + 1 != targetMapOffsets.length) + throw new IOException( + "targetMap 
file format broken; targetMap.length=" + + targetMap.length + + ", targetMapOffsets.length=" + + targetMapOffsets.length + + ", sourceId=" + + sourceId); + targetMapOffsets[sourceId] = targetMap.length; + } + + public void lookupWordIds(int sourceId, IntsRef ref) { + ref.ints = targetMap; + ref.offset = targetMapOffsets[sourceId]; + // targetMapOffsets always has one more entry pointing behind last: + ref.length = targetMapOffsets[sourceId + 1] - ref.offset; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionaryWriter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionaryWriter.java new file mode 100644 index 000000000000..bb0a5c096b31 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/BinaryDictionaryWriter.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.morph; + +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.apache.lucene.util.ArrayUtil; + +/** Abstract base dictionary writer class. */ +public abstract class BinaryDictionaryWriter> { + private final Class implClazz; + private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1; + private int[] targetMap = new int[8192]; + private int[] targetMapOffsets = new int[8192]; + protected final DictionaryEntryWriter entryWriter; + + protected BinaryDictionaryWriter(Class implClazz, DictionaryEntryWriter entryWriter) { + this.implClazz = implClazz; + this.entryWriter = entryWriter; + } + + /** + * put the entry in map + * + * @return current position of buffer, which will be wordId of next entry + */ + public int put(String[] entry) { + return entryWriter.putEntry(entry); + } + + /** + * Write whole dictionary in a directory. 
+ * + * @throws IOException if an I/O error occurs writing the dictionary files + */ + public abstract void write(Path baseDir) throws IOException; + + protected void addMapping(int sourceId, int wordId) { + if (wordId <= lastWordId) { + throw new IllegalStateException( + "words out of order: " + wordId + " vs lastID: " + lastWordId); + } + + if (sourceId > lastSourceId) { + targetMapOffsets = ArrayUtil.grow(targetMapOffsets, sourceId + 1); + for (int i = lastSourceId + 1; i <= sourceId; i++) { + targetMapOffsets[i] = targetMapEndOffset; + } + } else if (sourceId != lastSourceId) { + throw new IllegalStateException( + "source ids not in increasing order: lastSourceId=" + + lastSourceId + + " vs sourceId=" + + sourceId); + } + + targetMap = ArrayUtil.grow(targetMap, targetMapEndOffset + 1); + targetMap[targetMapEndOffset] = wordId; + targetMapEndOffset++; + + lastSourceId = sourceId; + lastWordId = wordId; + } + + /** + * Write dictionary in file Dictionary format is: [Size of dictionary(int)], [entry:{left + * id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], + * [entry...], [entry...]..... 
+ * + * @throws IOException if an I/O error occurs writing the dictionary files + */ + protected void write( + Path baseDir, + String targetMapCodecHeader, + String posDictCodecHeader, + String dictCodecHeader, + int dictCodecVersion) + throws IOException { + final String baseName = getBaseFileName(); + entryWriter.writeDictionary( + baseDir.resolve(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX), + dictCodecHeader, + dictCodecVersion); + entryWriter.writePosDict( + baseDir.resolve(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX), + posDictCodecHeader, + dictCodecVersion); + writeTargetMap( + baseDir.resolve(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX), + targetMapCodecHeader, + dictCodecVersion); + } + + protected final String getBaseFileName() { + return implClazz.getName().replace('.', '/'); + } + + // TODO: maybe this int[] should instead be the output to the FST... + private void writeTargetMap(Path path, String targetMapCodecHeader, int dictCodecVersion) + throws IOException { + Files.createDirectories(path.getParent()); + try (OutputStream os = Files.newOutputStream(path); + OutputStream bos = new BufferedOutputStream(os)) { + final DataOutput out = new OutputStreamDataOutput(bos); + CodecUtil.writeHeader(out, targetMapCodecHeader, dictCodecVersion); + + final int numSourceIds = lastSourceId + 1; + out.writeVInt(targetMapEndOffset); // <-- size of main array + out.writeVInt(numSourceIds + 1); // <-- size of offset array (+ 1 more entry) + int prev = 0, sourceId = 0; + for (int ofs = 0; ofs < targetMapEndOffset; ofs++) { + final int val = targetMap[ofs], delta = val - prev; + assert delta >= 0; + if (ofs == targetMapOffsets[sourceId]) { + out.writeVInt((delta << 1) | 0x01); + sourceId++; + } else { + out.writeVInt((delta << 1)); + } + prev += delta; + } + if (sourceId != numSourceIds) { + throw new IllegalStateException( + "sourceId:" + sourceId + " != numSourceIds:" + numSourceIds); + } + } + } +} diff --git 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/CharacterDefinition.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/CharacterDefinition.java new file mode 100644 index 000000000000..29c1b0122132 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/CharacterDefinition.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.morph; + +import java.io.IOException; +import java.io.InputStream; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.util.IOSupplier; + +/** Character category data. 
*/ +public abstract class CharacterDefinition { + + public static final String FILENAME_SUFFIX = ".dat"; + + protected final byte[] characterCategoryMap = new byte[0x10000]; + private final boolean[] invokeMap; + private final boolean[] groupMap; + + protected CharacterDefinition( + IOSupplier charDefResource, + String charDefCodecHeader, + int charDefCodecVersion, + int classCount) + throws IOException { + try (InputStream is = charDefResource.get()) { + final DataInput in = new InputStreamDataInput(is); + CodecUtil.checkHeader(in, charDefCodecHeader, charDefCodecVersion, charDefCodecVersion); + in.readBytes(characterCategoryMap, 0, characterCategoryMap.length); + this.invokeMap = new boolean[classCount]; + this.groupMap = new boolean[classCount]; + for (int i = 0; i < classCount; i++) { + final byte b = in.readByte(); + invokeMap[i] = (b & 0x01) != 0; + groupMap[i] = (b & 0x02) != 0; + } + } + } + + public byte getCharacterClass(char c) { + return characterCategoryMap[c]; + } + + public boolean isInvoke(char c) { + return invokeMap[characterCategoryMap[c]]; + } + + public boolean isGroup(char c) { + return groupMap[characterCategoryMap[c]]; + } + + /** Functional interface to lookup character class */ + @FunctionalInterface + public interface LookupCharacterClass { + /** looks up character class for given class name */ + byte lookupCharacterClass(String characterClassName); + } +} diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/CharacterDefinitionWriter.java similarity index 57% rename from lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/CharacterDefinitionWriter.java index cbd3f7686857..1f4fc3d13a91 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java +++ 
b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/CharacterDefinitionWriter.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.analysis.ko.util; +package org.apache.lucene.analysis.morph; import java.io.BufferedOutputStream; import java.io.IOException; @@ -22,21 +22,33 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; -import org.apache.lucene.analysis.ko.dict.CharacterDefinition; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.OutputStreamDataOutput; -final class CharacterDefinitionWriter { +/** Writes character definition file */ +public final class CharacterDefinitionWriter { - private final byte[] characterCategoryMap = new byte[0x10000]; + private final Class implClazz; - private final boolean[] invokeMap = new boolean[CharacterDefinition.CLASS_COUNT]; - private final boolean[] groupMap = new boolean[CharacterDefinition.CLASS_COUNT]; + private final byte[] characterCategoryMap = new byte[0x10000]; + private final int classCount; + private final boolean[] invokeMap; + private final boolean[] groupMap; + private final CharacterDefinition.LookupCharacterClass lookupCharClass; /** Constructor for building. 
TODO: remove write access */ - CharacterDefinitionWriter() { - Arrays.fill(characterCategoryMap, CharacterDefinition.DEFAULT); + public CharacterDefinitionWriter( + Class implClazz, + byte defaultValue, + int classCount, + CharacterDefinition.LookupCharacterClass lookupCharClass) { + this.implClazz = implClazz; + Arrays.fill(characterCategoryMap, defaultValue); + this.invokeMap = new boolean[classCount]; + this.groupMap = new boolean[classCount]; + this.classCount = classCount; + this.lookupCharClass = lookupCharClass; } /** @@ -45,7 +57,7 @@ final class CharacterDefinitionWriter { * @param codePoint code point * @param characterClassName character class name */ - void putCharacterCategory(int codePoint, String characterClassName) { + public void putCharacterCategory(int codePoint, String characterClassName) { characterClassName = characterClassName.split(" ")[0]; // use first // category // class @@ -54,27 +66,29 @@ void putCharacterCategory(int codePoint, String characterClassName) { if (codePoint == 0x30FB) { characterClassName = "SYMBOL"; } - characterCategoryMap[codePoint] = CharacterDefinition.lookupCharacterClass(characterClassName); + characterCategoryMap[codePoint] = lookupCharClass.lookupCharacterClass(characterClassName); } - void putInvokeDefinition(String characterClassName, int invoke, int group, int length) { - final byte characterClass = CharacterDefinition.lookupCharacterClass(characterClassName); + public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) { + final byte characterClass = lookupCharClass.lookupCharacterClass(characterClassName); invokeMap[characterClass] = invoke == 1; groupMap[characterClass] = group == 1; // TODO: length def ignored } - public void write(Path baseDir) throws IOException { - Path path = - baseDir.resolve( - CharacterDefinition.class.getName().replace('.', '/') - + CharacterDefinition.FILENAME_SUFFIX); + private String getBaseFileName() { + return implClazz.getName().replace('.', 
'/'); + } + + public void write(Path baseDir, String charDefCodecHeader, int charDefCodecVersion) + throws IOException { + Path path = baseDir.resolve(getBaseFileName() + CharacterDefinition.FILENAME_SUFFIX); Files.createDirectories(path.getParent()); try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))) { final DataOutput out = new OutputStreamDataOutput(os); - CodecUtil.writeHeader(out, CharacterDefinition.HEADER, CharacterDefinition.VERSION); + CodecUtil.writeHeader(out, charDefCodecHeader, charDefCodecVersion); out.writeBytes(characterCategoryMap, 0, characterCategoryMap.length); - for (int i = 0; i < CharacterDefinition.CLASS_COUNT; i++) { + for (int i = 0; i < classCount; i++) { final byte b = (byte) ((invokeMap[i] ? 0x01 : 0x00) | (groupMap[i] ? 0x02 : 0x00)); out.writeByte(b); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/ConnectionCosts.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/ConnectionCosts.java new file mode 100644 index 000000000000..70c581611965 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/ConnectionCosts.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.morph; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.util.IOSupplier; + +/** n-gram connection cost data */ +public abstract class ConnectionCosts { + + public static final String FILENAME_SUFFIX = ".dat"; + + private final ByteBuffer buffer; + private final int forwardSize; + + protected ConnectionCosts( + IOSupplier connectionCostResource, + String connectionCostsCodecHeader, + int dictCodecVersion) + throws IOException { + try (InputStream is = new BufferedInputStream(connectionCostResource.get())) { + final DataInput in = new InputStreamDataInput(is); + CodecUtil.checkHeader(in, connectionCostsCodecHeader, dictCodecVersion, dictCodecVersion); + forwardSize = in.readVInt(); + int backwardSize = in.readVInt(); + int size = forwardSize * backwardSize; + + // copy the matrix into a direct byte buffer + final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size * 2); + int accum = 0; + for (int j = 0; j < backwardSize; j++) { + for (int i = 0; i < forwardSize; i++) { + accum += in.readZInt(); + tmpBuffer.putShort((short) accum); + } + } + buffer = tmpBuffer.asReadOnlyBuffer(); + } + } + + public int get(int forwardId, int backwardId) { + // map 2d matrix into a single dimension short array + int offset = (backwardId * forwardSize + forwardId) * 2; + return buffer.getShort(offset); + } +} diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/ConnectionCostsWriter.java similarity index 77% rename from lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java rename to 
lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/ConnectionCostsWriter.java index 8d081f77bfcd..f4f0a51c8a8e 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/ConnectionCostsWriter.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.analysis.ja.util; +package org.apache.lucene.analysis.morph; import java.io.BufferedOutputStream; import java.io.IOException; @@ -22,20 +22,23 @@ import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; -import org.apache.lucene.analysis.ja.dict.ConnectionCosts; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.OutputStreamDataOutput; -final class ConnectionCostsWriter { +/** Writes connection costs */ +public final class ConnectionCostsWriter { + private final Class implClazz; private final ByteBuffer costs; // array is backward IDs first since get is called using the same backward ID // consecutively. maybe doesn't matter. private final int forwardSize; private final int backwardSize; + /** Constructor for building. 
TODO: remove write access */ - ConnectionCostsWriter(int forwardSize, int backwardSize) { + public ConnectionCostsWriter(Class implClazz, int forwardSize, int backwardSize) { + this.implClazz = implClazz; this.forwardSize = forwardSize; this.backwardSize = backwardSize; this.costs = ByteBuffer.allocateDirect(2 * backwardSize * forwardSize); @@ -46,14 +49,18 @@ public void add(int forwardId, int backwardId, int cost) { costs.putShort(offset, (short) cost); } - public void write(Path baseDir) throws IOException { + private String getBaseFileName() { + return implClazz.getName().replace('.', '/'); + } + + public void write(Path baseDir, String connectionCostsCodecHeader, int dictCodecVersion) + throws IOException { Files.createDirectories(baseDir); - String fileName = - ConnectionCosts.class.getName().replace('.', '/') + ConnectionCosts.FILENAME_SUFFIX; + String fileName = getBaseFileName() + ConnectionCosts.FILENAME_SUFFIX; try (OutputStream os = Files.newOutputStream(baseDir.resolve(fileName)); OutputStream bos = new BufferedOutputStream(os)) { final DataOutput out = new OutputStreamDataOutput(bos); - CodecUtil.writeHeader(out, ConnectionCosts.HEADER, ConnectionCosts.VERSION); + CodecUtil.writeHeader(out, connectionCostsCodecHeader, dictCodecVersion); out.writeVInt(forwardSize); out.writeVInt(backwardSize); int last = 0; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/Dictionary.java new file mode 100644 index 000000000000..5ff714c63f7b --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/Dictionary.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.morph; + +/** High-level dictionary interface for morphological analyzers. */ +public interface Dictionary { + /** + * Get left id of specified word + * + * @return left id + */ + default int getLeftId(int morphId) { + return getMorphAttributes().getLeftId(morphId); + } + + /** + * Get right id of specified word + * + * @return right id + */ + default int getRightId(int morphId) { + return getMorphAttributes().getRightId(morphId); + } + + /** + * Get word cost of specified word + * + * @return word's cost + */ + default int getWordCost(int morphId) { + return getMorphAttributes().getWordCost(morphId); + } + + T getMorphAttributes(); +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/DictionaryEntryWriter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/DictionaryEntryWriter.java new file mode 100644 index 000000000000..76fb44ea603b --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/DictionaryEntryWriter.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.morph; + +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.channels.Channels; +import java.nio.channels.WritableByteChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.OutputStreamDataOutput; + +/** Abstract writer class to write dictionary entries. */ +public abstract class DictionaryEntryWriter { + + protected ByteBuffer buffer; + protected final List posDict; + + protected DictionaryEntryWriter(int size) { + this.buffer = ByteBuffer.allocate(size); + this.posDict = new ArrayList<>(); + } + + /** Writes an entry. */ + protected abstract int putEntry(String[] entry); + + /** Flush POS dictionary data. 
*/ + protected abstract void writePosDict(OutputStream bos, DataOutput out) throws IOException; + + void writePosDict(Path path, String posDictCodecHeader, int dictCodecVersion) throws IOException { + Files.createDirectories(path.getParent()); + try (OutputStream os = Files.newOutputStream(path); + OutputStream bos = new BufferedOutputStream(os)) { + final DataOutput out = new OutputStreamDataOutput(bos); + CodecUtil.writeHeader(out, posDictCodecHeader, dictCodecVersion); + writePosDict(bos, out); + } + } + + void writeDictionary(Path path, String dictCodecHeader, int dictCodecVersion) throws IOException { + Files.createDirectories(path.getParent()); + try (OutputStream os = Files.newOutputStream(path); + OutputStream bos = new BufferedOutputStream(os)) { + final DataOutput out = new OutputStreamDataOutput(bos); + CodecUtil.writeHeader(out, dictCodecHeader, dictCodecVersion); + out.writeVInt(buffer.position()); + final WritableByteChannel channel = Channels.newChannel(bos); + // Write Buffer + buffer.flip(); // set position to 0, set limit to current position + channel.write(buffer); + assert buffer.remaining() == 0L; + } + } + + /** Returns current word id. */ + public int currentPosition() { + return buffer.position(); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/MorphData.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/MorphData.java new file mode 100644 index 000000000000..1cafe9af71aa --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/MorphData.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.morph; + +/** High-level interface that represents morphological information in a dictionary */ +public interface MorphData { + /** + * Get left id of specified word + * + * @return left id + */ + int getLeftId(int morphId); + + /** + * Get right id of specified word + * + * @return right id + */ + int getRightId(int morphId); + + /** + * Get word cost of specified word + * + * @return word's cost + */ + int getWordCost(int morphId); +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/package-info.java new file mode 100644 index 000000000000..7e53020588f7 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Abstract classes for morphological analyzers. */ +package org.apache.lucene.analysis.morph; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/GraphvizFormatter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/GraphvizFormatter.java index de3748b26664..4538daa362f6 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/GraphvizFormatter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/GraphvizFormatter.java @@ -21,7 +21,8 @@ import org.apache.lucene.analysis.ja.JapaneseTokenizer.Position; import org.apache.lucene.analysis.ja.JapaneseTokenizer.WrappedPositionArray; import org.apache.lucene.analysis.ja.dict.ConnectionCosts; -import org.apache.lucene.analysis.ja.dict.Dictionary; +import org.apache.lucene.analysis.ja.dict.JaMorphData; +import org.apache.lucene.analysis.morph.Dictionary; // TODO: would be nice to show 2nd best path in a diff't // color... 
@@ -140,7 +141,7 @@ private String formatNodes( attrs = ""; } - final Dictionary dict = tok.getDict(posData.backType[idx]); + final Dictionary dict = tok.getDict(posData.backType[idx]); final int wordCost = dict.getWordCost(posData.backID[idx]); final int bgCost = costs.get( diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java index 47cb8d19297f..a933525b2f04 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java @@ -28,12 +28,13 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.ja.dict.CharacterDefinition; import org.apache.lucene.analysis.ja.dict.ConnectionCosts; -import org.apache.lucene.analysis.ja.dict.Dictionary; +import org.apache.lucene.analysis.ja.dict.JaMorphData; import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary; import org.apache.lucene.analysis.ja.dict.TokenInfoFST; import org.apache.lucene.analysis.ja.dict.UnknownDictionary; import org.apache.lucene.analysis.ja.dict.UserDictionary; import org.apache.lucene.analysis.ja.tokenattributes.*; +import org.apache.lucene.analysis.morph.Dictionary; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -116,7 +117,8 @@ public enum Type { private static final int MAX_UNKNOWN_WORD_LENGTH = 1024; private static final int MAX_BACKTRACE_GAP = 1024; - private final EnumMap dictionaryMap = new EnumMap<>(Type.class); + private final EnumMap> dictionaryMap = + new EnumMap<>(Type.class); private final TokenInfoFST fst; private final TokenInfoDictionary dictionary; @@ -494,10 +496,15 @@ public void reset() { } private void add( - Dictionary dict, 
Position fromPosData, int endPos, int wordID, Type type, boolean addPenalty) + JaMorphData morphAtts, + Position fromPosData, + int endPos, + int wordID, + Type type, + boolean addPenalty) throws IOException { - final int wordCost = dict.getWordCost(wordID); - final int leftID = dict.getLeftId(wordID); + final int wordCost = morphAtts.getWordCost(wordID); + final int leftID = morphAtts.getLeftId(wordID); int leastCost = Integer.MAX_VALUE; int leastIDX = -1; assert fromPosData.count > 0; @@ -560,7 +567,7 @@ private void add( // positions.get(endPos).add(leastCost, dict.getRightId(wordID), fromPosData.pos, leastIDX, // wordID, type); - assert leftID == dict.getRightId(wordID); + assert leftID == morphAtts.getRightId(wordID); positions.get(endPos).add(leastCost, leftID, fromPosData.pos, leastIDX, wordID, type); } @@ -895,7 +902,7 @@ private void parse() throws IOException { + (posAhead + 1)); } add( - userDictionary, + userDictionary.getMorphAttributes(), posData, posAhead + 1, output + arc.nextFinalOutput().intValue(), @@ -948,7 +955,7 @@ private void parse() throws IOException { } for (int ofs = 0; ofs < wordIdRef.length; ofs++) { add( - dictionary, + dictionary.getMorphAttributes(), posData, posAhead + 1, wordIdRef.ints[wordIdRef.offset + ofs], @@ -1004,7 +1011,7 @@ && isPunctuation((char) ch) == isPunct) { } for (int ofs = 0; ofs < wordIdRef.length; ofs++) { add( - unkDictionary, + unkDictionary.getMorphAttributes(), posData, posData.pos + unknownWordLength, wordIdRef.ints[wordIdRef.offset + ofs], @@ -1126,7 +1133,7 @@ private void pruneAndRescore(int startPos, int endPos, int bestStartIDX) throws final int pathCost = posData.costs[bestStartIDX]; for (int forwardArcIDX = 0; forwardArcIDX < posData.forwardCount; forwardArcIDX++) { final Type forwardType = posData.forwardType[forwardArcIDX]; - final Dictionary dict2 = getDict(forwardType); + final Dictionary dict2 = getDict(forwardType); final int wordID = posData.forwardID[forwardArcIDX]; final int toPos = 
posData.forwardPos[forwardArcIDX]; final int newCost = @@ -1169,7 +1176,7 @@ private void pruneAndRescore(int startPos, int endPos, int bestStartIDX) throws + toPos); } add( - getDict(forwardType), + getDict(forwardType).getMorphAttributes(), posData, toPos, posData.forwardID[forwardArcIDX], @@ -1184,7 +1191,7 @@ private void pruneAndRescore(int startPos, int endPos, int bestStartIDX) throws // yet another lattice data structure private static final class Lattice { char[] fragment; - EnumMap dictionaryMap; + EnumMap> dictionaryMap; boolean useEOS; int rootCapacity = 0; @@ -1296,7 +1303,7 @@ private int addNode(Type dicType, int wordID, int left, int right) { nodeLeftID[node] = 0; nodeRightID[node] = 0; } else { - Dictionary dic = dictionaryMap.get(dicType); + Dictionary dic = dictionaryMap.get(dicType); nodeWordCost[node] = dic.getWordCost(wordID); nodeLeftID[node] = dic.getLeftId(wordID); nodeRightID[node] = dic.getRightId(wordID); @@ -1338,7 +1345,7 @@ private int positionCount(WrappedPositionArray positions, int beg, int end) { void setup( char[] fragment, - EnumMap dictionaryMap, + EnumMap> dictionaryMap, WrappedPositionArray positions, int prevOffset, int endOffset, @@ -1565,7 +1572,7 @@ private void registerNode(int node, char[] fragment) { right - left, Type.USER, lattice.rootBase + left, - userDictionary)); + userDictionary.getMorphAttributes())); // Output compound int current = 0; for (int j = 1; j < wordIDAndLength.length; j++) { @@ -1579,7 +1586,7 @@ private void registerNode(int node, char[] fragment) { len, Type.USER, lattice.rootBase + current + left, - userDictionary)); + userDictionary.getMorphAttributes())); } current += len; } @@ -1592,7 +1599,7 @@ private void registerNode(int node, char[] fragment) { right - left, type, lattice.rootBase + left, - getDict(type))); + getDict(type).getMorphAttributes())); } } } @@ -1921,7 +1928,7 @@ private void backtrace(final Position endPosData, final int fromIDX) throws IOEx length, backType, backPos, - 
getDict(backType)); + getDict(backType).getMorphAttributes()); // Redirect our backtrace to 2nd best: bestIDX = leastIDX; @@ -1980,7 +1987,7 @@ private void backtrace(final Position endPosData, final int fromIDX) throws IOEx altToken = null; } - final Dictionary dict = getDict(backType); + final Dictionary dict = getDict(backType); if (backType == Type.USER) { @@ -2000,7 +2007,7 @@ private void backtrace(final Position endPosData, final int fromIDX) throws IOEx len, Type.USER, current + backPos, - dict)); + dict.getMorphAttributes())); if (VERBOSE) { System.out.println(" add USER token=" + pending.get(pending.size() - 1)); } @@ -2037,14 +2044,16 @@ private void backtrace(final Position endPosData, final int fromIDX) throws IOEx charLen, Type.UNKNOWN, backPos + i, - unkDictionary)); + unkDictionary.getMorphAttributes())); unigramTokenCount++; } } backCount += unigramTokenCount; } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) { - pending.add(new Token(backID, fragment, offset, length, backType, backPos, dict)); + pending.add( + new Token( + backID, fragment, offset, length, backType, backPos, dict.getMorphAttributes())); if (VERBOSE) { System.out.println(" add token=" + pending.get(pending.size() - 1)); } @@ -2073,7 +2082,7 @@ private void backtrace(final Position endPosData, final int fromIDX) throws IOEx positions.freeBefore(endPos); } - Dictionary getDict(Type type) { + Dictionary getDict(Type type) { return dictionaryMap.get(type); } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/Token.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/Token.java index 5a10b50817b4..e9ae9490125e 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/Token.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/Token.java @@ -17,11 +17,11 @@ package org.apache.lucene.analysis.ja; import org.apache.lucene.analysis.ja.JapaneseTokenizer.Type; -import 
org.apache.lucene.analysis.ja.dict.Dictionary; +import org.apache.lucene.analysis.ja.dict.JaMorphData; /** Analyzed token with morphological data from its dictionary. */ public class Token { - private final Dictionary dictionary; + private final JaMorphData morphAtts; private final int wordId; @@ -41,14 +41,14 @@ public Token( int length, Type type, int position, - Dictionary dictionary) { + JaMorphData morphAtts) { this.wordId = wordId; this.surfaceForm = surfaceForm; this.offset = offset; this.length = length; this.type = type; this.position = position; - this.dictionary = dictionary; + this.morphAtts = morphAtts; } @Override @@ -66,7 +66,7 @@ public String toString() { + " wordId=" + wordId + " leftID=" - + dictionary.getLeftId(wordId) + + morphAtts.getLeftId(wordId) + ")"; } @@ -92,32 +92,32 @@ public String getSurfaceFormString() { /** @return reading. null if token doesn't have reading. */ public String getReading() { - return dictionary.getReading(wordId, surfaceForm, offset, length); + return morphAtts.getReading(wordId, surfaceForm, offset, length); } /** @return pronunciation. null if token doesn't have pronunciation. */ public String getPronunciation() { - return dictionary.getPronunciation(wordId, surfaceForm, offset, length); + return morphAtts.getPronunciation(wordId, surfaceForm, offset, length); } /** @return part of speech. 
*/ public String getPartOfSpeech() { - return dictionary.getPartOfSpeech(wordId); + return morphAtts.getPartOfSpeech(wordId); } /** @return inflection type or null */ public String getInflectionType() { - return dictionary.getInflectionType(wordId); + return morphAtts.getInflectionType(wordId); } /** @return inflection form or null */ public String getInflectionForm() { - return dictionary.getInflectionForm(wordId); + return morphAtts.getInflectionForm(wordId); } /** @return base form or null if token is not inflected */ public String getBaseForm() { - return dictionary.getBaseForm(wordId, surfaceForm, offset, length); + return morphAtts.getBaseForm(wordId, surfaceForm, offset, length); } /** diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/CharacterDefinition.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/CharacterDefinition.java index 36bb825bc2d2..be29ebbe3674 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/CharacterDefinition.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/CharacterDefinition.java @@ -16,20 +16,13 @@ */ package org.apache.lucene.analysis.ja.dict; -import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.util.IOUtils; /** Character category data. 
*/ -public final class CharacterDefinition { - - public static final String FILENAME_SUFFIX = ".dat"; - public static final String HEADER = "kuromoji_cd"; - public static final int VERSION = 1; +public final class CharacterDefinition + extends org.apache.lucene.analysis.morph.CharacterDefinition { public static final int CLASS_COUNT = CharacterClass.values().length; @@ -49,11 +42,6 @@ private enum CharacterClass { KANJINUMERIC; } - private final byte[] characterCategoryMap = new byte[0x10000]; - - private final boolean[] invokeMap = new boolean[CLASS_COUNT]; - private final boolean[] groupMap = new boolean[CLASS_COUNT]; - // the classes: public static final byte NGRAM = (byte) CharacterClass.NGRAM.ordinal(); public static final byte DEFAULT = (byte) CharacterClass.DEFAULT.ordinal(); @@ -69,16 +57,11 @@ private enum CharacterClass { public static final byte KANJINUMERIC = (byte) CharacterClass.KANJINUMERIC.ordinal(); private CharacterDefinition() throws IOException { - try (InputStream is = new BufferedInputStream(getClassResource())) { - final DataInput in = new InputStreamDataInput(is); - CodecUtil.checkHeader(in, HEADER, VERSION, VERSION); - in.readBytes(characterCategoryMap, 0, characterCategoryMap.length); - for (int i = 0; i < CLASS_COUNT; i++) { - final byte b = in.readByte(); - invokeMap[i] = (b & 0x01) != 0; - groupMap[i] = (b & 0x02) != 0; - } - } + super( + CharacterDefinition::getClassResource, + DictionaryConstants.CHARDEF_HEADER, + DictionaryConstants.VERSION, + CharacterClass.values().length); } private static InputStream getClassResource() throws IOException { @@ -87,18 +70,6 @@ private static InputStream getClassResource() throws IOException { CharacterDefinition.class.getResourceAsStream(resourcePath), resourcePath); } - public byte getCharacterClass(char c) { - return characterCategoryMap[c]; - } - - public boolean isInvoke(char c) { - return invokeMap[characterCategoryMap[c]]; - } - - public boolean isGroup(char c) { - return 
groupMap[characterCategoryMap[c]]; - } - public boolean isKanji(char c) { final byte characterClass = characterCategoryMap[c]; return characterClass == KANJI || characterClass == KANJINUMERIC; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/ConnectionCosts.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/ConnectionCosts.java index dc5fabe6c544..c11b9ee716a7 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/ConnectionCosts.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/ConnectionCosts.java @@ -16,27 +16,15 @@ */ package org.apache.lucene.analysis.ja.dict; -import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; -import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.util.IOSupplier; import org.apache.lucene.util.IOUtils; /** n-gram connection cost data */ -public final class ConnectionCosts { - - public static final String FILENAME_SUFFIX = ".dat"; - public static final String HEADER = "kuromoji_cc"; - public static final int VERSION = 1; - - private final ByteBuffer buffer; - private final int forwardSize; +public final class ConnectionCosts extends org.apache.lucene.analysis.morph.ConnectionCosts { /** * Create a {@link ConnectionCosts} from an external resource path. 
@@ -53,24 +41,8 @@ private ConnectionCosts() throws IOException { } private ConnectionCosts(IOSupplier connectionCostResource) throws IOException { - try (InputStream is = new BufferedInputStream(connectionCostResource.get())) { - final DataInput in = new InputStreamDataInput(is); - CodecUtil.checkHeader(in, HEADER, VERSION, VERSION); - forwardSize = in.readVInt(); - int backwardSize = in.readVInt(); - int size = forwardSize * backwardSize; - - // copy the matrix into a direct byte buffer - final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size * 2); - int accum = 0; - for (int j = 0; j < backwardSize; j++) { - for (int i = 0; i < forwardSize; i++) { - accum += in.readZInt(); - tmpBuffer.putShort((short) accum); - } - } - buffer = tmpBuffer.asReadOnlyBuffer(); - } + super( + connectionCostResource, DictionaryConstants.CONN_COSTS_HEADER, DictionaryConstants.VERSION); } private static InputStream getClassResource() throws IOException { @@ -79,12 +51,6 @@ private static InputStream getClassResource() throws IOException { ConnectionCosts.class.getResourceAsStream(resourcePath), resourcePath); } - public int get(int forwardId, int backwardId) { - // map 2d matrix into a single dimension short array - int offset = (backwardId * forwardSize + forwardId) * 2; - return buffer.getShort(offset); - } - public static ConnectionCosts getInstance() { return SingletonHolder.INSTANCE; } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/DictionaryConstants.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/DictionaryConstants.java new file mode 100644 index 000000000000..3046beef3e63 --- /dev/null +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/DictionaryConstants.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja.dict; + +/** Dictionary constants */ +public final class DictionaryConstants { + /** Codec header of the dictionary file. */ + public static final String DICT_HEADER = "kuromoji_dict"; + /** Codec header of the dictionary mapping file. */ + public static final String TARGETMAP_HEADER = "kuromoji_dict_map"; + /** Codec header of the POS dictionary file. */ + public static final String POSDICT_HEADER = "kuromoji_dict_pos"; + /** Codec header of the connection costs. */ + public static final String CONN_COSTS_HEADER = "kuromoji_cc"; + /** Codec header of the character definition file. 
*/ + public static final String CHARDEF_HEADER = "kuromoji_cd"; + /** Codec version of the binary dictionary */ + public static final int VERSION = 1; +} diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/Dictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/JaMorphData.java similarity index 60% rename from lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/Dictionary.java rename to lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/JaMorphData.java index cfe11b30cd90..8865f6ad09f5 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/Dictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/JaMorphData.java @@ -16,71 +16,49 @@ */ package org.apache.lucene.analysis.ja.dict; -/** Dictionary interface for retrieving morphological data by id. */ -public interface Dictionary { - - public static final String INTERNAL_SEPARATOR = "\u0000"; - - /** - * Get left id of specified word - * - * @return left id - */ - public int getLeftId(int wordId); - - /** - * Get right id of specified word - * - * @return right id - */ - public int getRightId(int wordId); - - /** - * Get word cost of specified word - * - * @return word's cost - */ - public int getWordCost(int wordId); +import org.apache.lucene.analysis.morph.MorphData; +/** Represents Japanese morphological information. 
*/ +public interface JaMorphData extends MorphData { /** * Get Part-Of-Speech of tokens * - * @param wordId word ID of token + * @param morphId word ID of token * @return Part-Of-Speech of the token */ - public String getPartOfSpeech(int wordId); + String getPartOfSpeech(int morphId); /** * Get reading of tokens * - * @param wordId word ID of token + * @param morphId word ID of token * @return Reading of the token */ - public String getReading(int wordId, char[] surface, int off, int len); + String getReading(int morphId, char[] surface, int off, int len); /** * Get base form of word * - * @param wordId word ID of token + * @param morphId word ID of token * @return Base form (only different for inflected words, otherwise null) */ - public String getBaseForm(int wordId, char[] surface, int off, int len); + String getBaseForm(int morphId, char[] surface, int off, int len); /** * Get pronunciation of tokens * - * @param wordId word ID of token + * @param morphId word ID of token * @return Pronunciation of the token */ - public String getPronunciation(int wordId, char[] surface, int off, int len); + String getPronunciation(int morphId, char[] surface, int off, int len); /** * Get inflection type of tokens * - * @param wordId word ID of token + * @param morphId word ID of token * @return inflection type, or null */ - public String getInflectionType(int wordId); + String getInflectionType(int morphId); /** * Get inflection form of tokens @@ -88,7 +66,7 @@ public interface Dictionary { * @param wordId word ID of token * @return inflection form, or null */ - public String getInflectionForm(int wordId); + String getInflectionForm(int wordId); // TODO: maybe we should have a optimal method, a non-typesafe // 'getAdditionalData' if other dictionaries like unidic have additional data } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java 
b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java index d9e50f86a759..c769587829b6 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java @@ -21,6 +21,7 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import org.apache.lucene.analysis.morph.BinaryDictionary; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.util.IOSupplier; @@ -32,11 +33,12 @@ * Binary dictionary implementation for a known-word dictionary model: Words are encoded into an FST * mapping to a list of wordIDs. */ -public final class TokenInfoDictionary extends BinaryDictionary { +public final class TokenInfoDictionary extends BinaryDictionary { public static final String FST_FILENAME_SUFFIX = "$fst.dat"; private final TokenInfoFST fst; + private final TokenInfoMorphData morphAtts; /** * Create a {@link TokenInfoDictionary} from an external resource path. 
@@ -70,7 +72,14 @@ private TokenInfoDictionary( IOSupplier dictResource, IOSupplier fstResource) throws IOException { - super(targetMapResource, posResource, dictResource); + super( + targetMapResource, + dictResource, + DictionaryConstants.TARGETMAP_HEADER, + DictionaryConstants.DICT_HEADER, + DictionaryConstants.VERSION); + this.morphAtts = new TokenInfoMorphData(buffer, posResource); + FST fst; try (InputStream is = new BufferedInputStream(fstResource.get())) { DataInput in = new InputStreamDataInput(is); @@ -86,6 +95,11 @@ private static InputStream getClassResource(String suffix) throws IOException { TokenInfoDictionary.class.getResourceAsStream(resourcePath), resourcePath); } + @Override + public TokenInfoMorphData getMorphAttributes() { + return morphAtts; + } + public TokenInfoFST getFST() { return fst; } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/BinaryDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoMorphData.java similarity index 56% rename from lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/BinaryDictionary.java rename to lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoMorphData.java index 9ad0d8137783..d743a7faca67 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/BinaryDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoMorphData.java @@ -17,95 +17,37 @@ package org.apache.lucene.analysis.ja.dict; import java.io.BufferedInputStream; -import java.io.EOFException; import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; -import java.nio.channels.Channels; -import java.nio.channels.ReadableByteChannel; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.util.IOSupplier; -import 
org.apache.lucene.util.IntsRef; -/** Base class for a binary-encoded in-memory dictionary. */ -public abstract class BinaryDictionary implements Dictionary { - - public static final String DICT_FILENAME_SUFFIX = "$buffer.dat"; - public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat"; - public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat"; - - public static final String DICT_HEADER = "kuromoji_dict"; - public static final String TARGETMAP_HEADER = "kuromoji_dict_map"; - public static final String POSDICT_HEADER = "kuromoji_dict_pos"; - public static final int VERSION = 1; +/** Morphological information for system dictionary. */ +public class TokenInfoMorphData implements JaMorphData { private final ByteBuffer buffer; - private final int[] targetMapOffsets, targetMap; private final String[] posDict; private final String[] inflTypeDict; private final String[] inflFormDict; - protected BinaryDictionary( - IOSupplier targetMapResource, - IOSupplier posResource, - IOSupplier dictResource) - throws IOException { - try (InputStream mapIS = new BufferedInputStream(targetMapResource.get())) { - final DataInput in = new InputStreamDataInput(mapIS); - CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION); - this.targetMap = new int[in.readVInt()]; - this.targetMapOffsets = new int[in.readVInt()]; - populateTargetMap(in, this.targetMap, this.targetMapOffsets); - } - + TokenInfoMorphData(ByteBuffer buffer, IOSupplier posResource) throws IOException { + this.buffer = buffer; try (InputStream posIS = new BufferedInputStream(posResource.get())) { final DataInput in = new InputStreamDataInput(posIS); - CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION); + CodecUtil.checkHeader( + in, + DictionaryConstants.POSDICT_HEADER, + DictionaryConstants.VERSION, + DictionaryConstants.VERSION); final int posSize = in.readVInt(); this.posDict = new String[posSize]; this.inflTypeDict = new String[posSize]; this.inflFormDict = new String[posSize]; - 
populatePosDict(in, posSize, this.posDict, this.inflTypeDict, this.inflFormDict); - } - - // no buffering here, as we load in one large buffer - try (InputStream dictIS = dictResource.get()) { - final DataInput in = new InputStreamDataInput(dictIS); - CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION); - final int size = in.readVInt(); - final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size); - final ReadableByteChannel channel = Channels.newChannel(dictIS); - final int read = channel.read(tmpBuffer); - if (read != size) { - throw new EOFException("Cannot read whole dictionary"); - } - this.buffer = tmpBuffer.asReadOnlyBuffer(); - } - } - - private static void populateTargetMap(DataInput in, int[] targetMap, int[] targetMapOffsets) - throws IOException { - int accum = 0, sourceId = 0; - for (int ofs = 0; ofs < targetMap.length; ofs++) { - final int val = in.readVInt(); - if ((val & 0x01) != 0) { - targetMapOffsets[sourceId] = ofs; - sourceId++; - } - accum += val >>> 1; - targetMap[ofs] = accum; + populatePosDict(in, posSize, posDict, inflTypeDict, inflFormDict); } - if (sourceId + 1 != targetMapOffsets.length) - throw new IOException( - "targetMap file format broken; targetMap.length=" - + targetMap.length - + ", targetMapOffsets.length=" - + targetMapOffsets.length - + ", sourceId=" - + sourceId); - targetMapOffsets[sourceId] = targetMap.length; } private static void populatePosDict( @@ -125,32 +67,25 @@ private static void populatePosDict( } } - public void lookupWordIds(int sourceId, IntsRef ref) { - ref.ints = targetMap; - ref.offset = targetMapOffsets[sourceId]; - // targetMapOffsets always has one more entry pointing behind last: - ref.length = targetMapOffsets[sourceId + 1] - ref.offset; - } - @Override - public int getLeftId(int wordId) { - return (buffer.getShort(wordId) & 0xffff) >>> 3; + public int getLeftId(int morphId) { + return (buffer.getShort(morphId) & 0xffff) >>> 3; } @Override - public int getRightId(int wordId) { - return 
(buffer.getShort(wordId) & 0xffff) >>> 3; + public int getRightId(int morphId) { + return (buffer.getShort(morphId) & 0xffff) >>> 3; } @Override - public int getWordCost(int wordId) { - return buffer.getShort(wordId + 2); // Skip id + public int getWordCost(int morphId) { + return buffer.getShort(morphId + 2); // Skip id } @Override - public String getBaseForm(int wordId, char[] surfaceForm, int off, int len) { - if (hasBaseFormData(wordId)) { - int offset = baseFormOffset(wordId); + public String getBaseForm(int morphId, char[] surfaceForm, int off, int len) { + if (hasBaseFormData(morphId)) { + int offset = baseFormOffset(morphId); int data = buffer.get(offset++) & 0xff; int prefix = data >>> 4; int suffix = data & 0xF; @@ -166,9 +101,9 @@ public String getBaseForm(int wordId, char[] surfaceForm, int off, int len) { } @Override - public String getReading(int wordId, char[] surface, int off, int len) { - if (hasReadingData(wordId)) { - int offset = readingOffset(wordId); + public String getReading(int morphId, char[] surface, int off, int len) { + if (hasReadingData(morphId)) { + int offset = readingOffset(morphId); int readingData = buffer.get(offset++) & 0xff; return readString(offset, readingData >>> 1, (readingData & 1) == 1); } else { @@ -187,24 +122,24 @@ public String getReading(int wordId, char[] surface, int off, int len) { } @Override - public String getPartOfSpeech(int wordId) { - return posDict[getLeftId(wordId)]; + public String getPartOfSpeech(int morphId) { + return posDict[getLeftId(morphId)]; } @Override - public String getPronunciation(int wordId, char[] surface, int off, int len) { - if (hasPronunciationData(wordId)) { - int offset = pronunciationOffset(wordId); + public String getPronunciation(int morphId, char[] surface, int off, int len) { + if (hasPronunciationData(morphId)) { + int offset = pronunciationOffset(morphId); int pronunciationData = buffer.get(offset++) & 0xff; return readString(offset, pronunciationData >>> 1, (pronunciationData 
& 1) == 1); } else { - return getReading(wordId, surface, off, len); // same as the reading + return getReading(morphId, surface, off, len); // same as the reading } } @Override - public String getInflectionType(int wordId) { - return inflTypeDict[getLeftId(wordId)]; + public String getInflectionType(int morphId) { + return inflTypeDict[getLeftId(morphId)]; } @Override @@ -212,10 +147,6 @@ public String getInflectionForm(int wordId) { return inflFormDict[getLeftId(wordId)]; } - private static int baseFormOffset(int wordId) { - return wordId + 4; - } - private int readingOffset(int wordId) { int offset = baseFormOffset(wordId); if (hasBaseFormData(wordId)) { @@ -242,6 +173,10 @@ private int pronunciationOffset(int wordId) { } } + private static int baseFormOffset(int wordId) { + return wordId + 4; + } + private boolean hasBaseFormData(int wordId) { return (buffer.getShort(wordId) & HAS_BASEFORM) != 0; } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionary.java index f9bdc62f49e4..b4a9012e3ee6 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionary.java @@ -20,12 +20,15 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import org.apache.lucene.analysis.morph.BinaryDictionary; +import org.apache.lucene.util.IOSupplier; import org.apache.lucene.util.IOUtils; /** Dictionary for unknown-word handling. */ -public final class UnknownDictionary extends BinaryDictionary { +public final class UnknownDictionary extends BinaryDictionary { private final CharacterDefinition characterDefinition = CharacterDefinition.getInstance(); + private final UnknownMorphData morphAtts; /** * Create a {@link UnknownDictionary} from an external resource path. 
@@ -36,25 +39,44 @@ public final class UnknownDictionary extends BinaryDictionary { * @throws IOException if resource was not found or broken */ public UnknownDictionary(Path targetMapFile, Path posDictFile, Path dictFile) throws IOException { - super( + this( () -> Files.newInputStream(targetMapFile), () -> Files.newInputStream(posDictFile), () -> Files.newInputStream(dictFile)); } private UnknownDictionary() throws IOException { - super( + this( () -> getClassResource(TARGETMAP_FILENAME_SUFFIX), () -> getClassResource(POSDICT_FILENAME_SUFFIX), () -> getClassResource(DICT_FILENAME_SUFFIX)); } + private UnknownDictionary( + IOSupplier targetMapResource, + IOSupplier posResource, + IOSupplier dictResource) + throws IOException { + super( + targetMapResource, + dictResource, + DictionaryConstants.TARGETMAP_HEADER, + DictionaryConstants.DICT_HEADER, + DictionaryConstants.VERSION); + this.morphAtts = new UnknownMorphData(buffer, posResource); + } + private static InputStream getClassResource(String suffix) throws IOException { final String resourcePath = UnknownDictionary.class.getSimpleName() + suffix; return IOUtils.requireResourceNonNull( UnknownDictionary.class.getResourceAsStream(resourcePath), resourcePath); } + @Override + public UnknownMorphData getMorphAttributes() { + return morphAtts; + } + public int lookup(char[] text, int offset, int len) { if (!characterDefinition.isGroup(text[offset])) { return 1; @@ -79,21 +101,6 @@ public CharacterDefinition getCharacterDefinition() { return characterDefinition; } - @Override - public String getReading(int wordId, char[] surface, int off, int len) { - return null; - } - - @Override - public String getInflectionType(int wordId) { - return null; - } - - @Override - public String getInflectionForm(int wordId) { - return null; - } - public static UnknownDictionary getInstance() { return SingletonHolder.INSTANCE; } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownMorphData.java 
b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownMorphData.java new file mode 100644 index 000000000000..0810346e2183 --- /dev/null +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownMorphData.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja.dict; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import org.apache.lucene.util.IOSupplier; + +/** Morphological information for unk dictionary. 
*/ +final class UnknownMorphData extends TokenInfoMorphData { + UnknownMorphData(ByteBuffer buffer, IOSupplier posResource) throws IOException { + super(buffer, posResource); + } + + @Override + public String getReading(int morphId, char[] surface, int off, int len) { + return null; + } + + @Override + public String getInflectionType(int morphId) { + return null; + } + + @Override + public String getInflectionForm(int wordId) { + return null; + } +} diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java index 859806e5ef00..12dd27d95845 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java @@ -26,13 +26,16 @@ import java.util.Map; import java.util.TreeMap; import org.apache.lucene.analysis.ja.util.CSVUtil; +import org.apache.lucene.analysis.morph.Dictionary; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.PositiveIntOutputs; /** Class for building a User Dictionary. This class allows for custom segmentation of phrases. 
*/ -public final class UserDictionary implements Dictionary { +public final class UserDictionary implements Dictionary { + + public static final String INTERNAL_SEPARATOR = "\u0000"; // phrase text -> phrase ID private final TokenInfoFST fst; @@ -41,15 +44,9 @@ public final class UserDictionary implements Dictionary { private final int[][] segmentations; // holds readings and POS, indexed by wordid - private final String[] data; - - private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000; - - public static final int WORD_COST = -100000; + private final UserMorphData morphAtts; - public static final int LEFT_ID = 5; - - public static final int RIGHT_ID = 5; + static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000; public static UserDictionary open(Reader reader) throws IOException { @@ -150,10 +147,15 @@ public int compare(String[] left, String[] right) { ord++; } this.fst = new TokenInfoFST(fstCompiler.compile(), false); - this.data = data.toArray(new String[data.size()]); + this.morphAtts = new UserMorphData(data.toArray(new String[0])); this.segmentations = segmentations.toArray(new int[segmentations.size()][]); } + @Override + public UserMorphData getMorphAttributes() { + return morphAtts; + } + /** * Lookup words in text * @@ -222,78 +224,4 @@ private int[][] toIndexArray(Map input) { public int[] lookupSegmentation(int phraseID) { return segmentations[phraseID]; } - - @Override - public int getLeftId(int wordId) { - return LEFT_ID; - } - - @Override - public int getRightId(int wordId) { - return RIGHT_ID; - } - - @Override - public int getWordCost(int wordId) { - return WORD_COST; - } - - @Override - public String getReading(int wordId, char[] surface, int off, int len) { - return getFeature(wordId, 0); - } - - @Override - public String getPartOfSpeech(int wordId) { - return getFeature(wordId, 1); - } - - @Override - public String getBaseForm(int wordId, char[] surface, int off, int len) { - return null; // TODO: add support? 
- } - - @Override - public String getPronunciation(int wordId, char[] surface, int off, int len) { - return null; // TODO: add support? - } - - @Override - public String getInflectionType(int wordId) { - return null; // TODO: add support? - } - - @Override - public String getInflectionForm(int wordId) { - return null; // TODO: add support? - } - - private String[] getAllFeaturesArray(int wordId) { - String allFeatures = data[wordId - CUSTOM_DICTIONARY_WORD_ID_OFFSET]; - if (allFeatures == null) { - return null; - } - - return allFeatures.split(INTERNAL_SEPARATOR); - } - - private String getFeature(int wordId, int... fields) { - String[] allFeatures = getAllFeaturesArray(wordId); - if (allFeatures == null) { - return null; - } - StringBuilder sb = new StringBuilder(); - if (fields.length == 0) { // All features - for (String feature : allFeatures) { - sb.append(CSVUtil.quoteEscape(feature)).append(","); - } - } else if (fields.length == 1) { // One feature doesn't need to escape value - sb.append(allFeatures[fields[0]]).append(","); - } else { - for (int field : fields) { - sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(","); - } - } - return sb.deleteCharAt(sb.length() - 1).toString(); - } } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserMorphData.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserMorphData.java new file mode 100644 index 000000000000..1f6907052e0f --- /dev/null +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserMorphData.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja.dict; + +import static org.apache.lucene.analysis.ja.dict.UserDictionary.CUSTOM_DICTIONARY_WORD_ID_OFFSET; +import static org.apache.lucene.analysis.ja.dict.UserDictionary.INTERNAL_SEPARATOR; + +import org.apache.lucene.analysis.ja.util.CSVUtil; + +/** Morphological information for user dictionary. */ +final class UserMorphData implements JaMorphData { + public static final int WORD_COST = -100000; + public static final int LEFT_ID = 5; + public static final int RIGHT_ID = 5; + + // holds readings and POS, indexed by wordid + private final String[] data; + + UserMorphData(String[] data) { + this.data = data; + } + + @Override + public int getLeftId(int wordId) { + return LEFT_ID; + } + + @Override + public int getRightId(int wordId) { + return RIGHT_ID; + } + + @Override + public int getWordCost(int wordId) { + return WORD_COST; + } + + @Override + public String getReading(int morphId, char[] surface, int off, int len) { + return getFeature(morphId, 0); + } + + @Override + public String getPartOfSpeech(int morphId) { + return getFeature(morphId, 1); + } + + @Override + public String getBaseForm(int morphId, char[] surface, int off, int len) { + return null; // TODO: add support? + } + + @Override + public String getPronunciation(int morphId, char[] surface, int off, int len) { + return null; // TODO: add support? + } + + @Override + public String getInflectionType(int morphId) { + return null; // TODO: add support? 
+ } + + @Override + public String getInflectionForm(int wordId) { + return null; // TODO: add support? + } + + private String[] getAllFeaturesArray(int wordId) { + String allFeatures = data[wordId - CUSTOM_DICTIONARY_WORD_ID_OFFSET]; + if (allFeatures == null) { + return null; + } + + return allFeatures.split(INTERNAL_SEPARATOR); + } + + private String getFeature(int wordId, int... fields) { + String[] allFeatures = getAllFeaturesArray(wordId); + if (allFeatures == null) { + return null; + } + StringBuilder sb = new StringBuilder(); + if (fields.length == 0) { // All features + for (String feature : allFeatures) { + sb.append(CSVUtil.quoteEscape(feature)).append(","); + } + } else if (fields.length == 1) { // One feature doesn't need to escape value + sb.append(allFeatures[fields[0]]).append(","); + } else { + for (int field : fields) { + sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(","); + } + } + return sb.deleteCharAt(sb.length() - 1).toString(); + } +} diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java deleted file mode 100644 index bf157b9abb25..000000000000 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java +++ /dev/null @@ -1,334 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.analysis.ja.util; - -import java.io.BufferedOutputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.nio.ByteBuffer; -import java.nio.channels.Channels; -import java.nio.channels.WritableByteChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import org.apache.lucene.analysis.ja.dict.BinaryDictionary; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.store.DataOutput; -import org.apache.lucene.store.OutputStreamDataOutput; -import org.apache.lucene.util.ArrayUtil; - -abstract class BinaryDictionaryWriter { - private static final int ID_LIMIT = 8192; - - private final Class implClazz; - protected ByteBuffer buffer; - private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1; - private int[] targetMap = new int[8192]; - private int[] targetMapOffsets = new int[8192]; - private final ArrayList posDict = new ArrayList<>(); - - BinaryDictionaryWriter(Class implClazz, int size) { - this.implClazz = implClazz; - buffer = ByteBuffer.allocateDirect(size); - } - - /** - * put the entry in map - * - * @return current position of buffer, which will be wordId of next entry - */ - public int put(String[] entry) { - short leftId = Short.parseShort(entry[1]); - short rightId = Short.parseShort(entry[2]); - short wordCost = Short.parseShort(entry[3]); - - StringBuilder sb = new StringBuilder(); - - // build up the POS string - for (int i = 4; i < 8; i++) { - String part = entry[i]; - assert part.length() > 0; - if (!"*".equals(part)) 
{ - if (sb.length() > 0) { - sb.append('-'); - } - sb.append(part); - } - } - - String posData = sb.toString(); - if (posData.isEmpty()) { - throw new IllegalArgumentException("POS fields are empty"); - } - sb.setLength(0); - sb.append(CSVUtil.quoteEscape(posData)); - sb.append(','); - if (!"*".equals(entry[8])) { - sb.append(CSVUtil.quoteEscape(entry[8])); - } - sb.append(','); - if (!"*".equals(entry[9])) { - sb.append(CSVUtil.quoteEscape(entry[9])); - } - String fullPOSData = sb.toString(); - - String baseForm = entry[10]; - String reading = entry[11]; - String pronunciation = entry[12]; - - // extend buffer if necessary - int left = buffer.remaining(); - // worst case: two short, 3 bytes, and features (all as utf-16) - int worstCase = 4 + 3 + 2 * (baseForm.length() + reading.length() + pronunciation.length()); - if (worstCase > left) { - ByteBuffer newBuffer = - ByteBuffer.allocateDirect(ArrayUtil.oversize(buffer.limit() + worstCase - left, 1)); - buffer.flip(); - newBuffer.put(buffer); - buffer = newBuffer; - } - - int flags = 0; - if (baseForm.isEmpty()) { - throw new IllegalArgumentException("base form is empty"); - } - if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) { - flags |= BinaryDictionary.HAS_BASEFORM; - } - if (!reading.equals(toKatakana(entry[0]))) { - flags |= BinaryDictionary.HAS_READING; - } - if (!pronunciation.equals(reading)) { - flags |= BinaryDictionary.HAS_PRONUNCIATION; - } - - if (leftId != rightId) { - throw new IllegalArgumentException("rightId != leftId: " + rightId + " " + leftId); - } - if (leftId >= ID_LIMIT) { - throw new IllegalArgumentException("leftId >= " + ID_LIMIT + ": " + leftId); - } - // add pos mapping - int toFill = 1 + leftId - posDict.size(); - for (int i = 0; i < toFill; i++) { - posDict.add(null); - } - - String existing = posDict.get(leftId); - if (existing != null && existing.equals(fullPOSData) == false) { - // TODO: test me - throw new IllegalArgumentException("Multiple entries found for leftID=" + 
leftId); - } - posDict.set(leftId, fullPOSData); - - buffer.putShort((short) (leftId << 3 | flags)); - buffer.putShort(wordCost); - - if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) { - if (baseForm.length() >= 16) { - throw new IllegalArgumentException("Length of base form " + baseForm + " is >= 16"); - } - int shared = sharedPrefix(entry[0], baseForm); - int suffix = baseForm.length() - shared; - buffer.put((byte) (shared << 4 | suffix)); - for (int i = shared; i < baseForm.length(); i++) { - buffer.putChar(baseForm.charAt(i)); - } - } - - if ((flags & BinaryDictionary.HAS_READING) != 0) { - if (isKatakana(reading)) { - buffer.put((byte) (reading.length() << 1 | 1)); - writeKatakana(reading); - } else { - buffer.put((byte) (reading.length() << 1)); - for (int i = 0; i < reading.length(); i++) { - buffer.putChar(reading.charAt(i)); - } - } - } - - if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0) { - // we can save 150KB here, but it makes the reader a little complicated. - // int shared = sharedPrefix(reading, pronunciation); - // buffer.put((byte) shared); - // pronunciation = pronunciation.substring(shared); - if (isKatakana(pronunciation)) { - buffer.put((byte) (pronunciation.length() << 1 | 1)); - writeKatakana(pronunciation); - } else { - buffer.put((byte) (pronunciation.length() << 1)); - for (int i = 0; i < pronunciation.length(); i++) { - buffer.putChar(pronunciation.charAt(i)); - } - } - } - - return buffer.position(); - } - - private boolean isKatakana(String s) { - for (int i = 0; i < s.length(); i++) { - char ch = s.charAt(i); - if (ch < 0x30A0 || ch > 0x30FF) { - return false; - } - } - return true; - } - - private void writeKatakana(String s) { - for (int i = 0; i < s.length(); i++) { - buffer.put((byte) (s.charAt(i) - 0x30A0)); - } - } - - private String toKatakana(String s) { - char[] text = new char[s.length()]; - for (int i = 0; i < s.length(); i++) { - char ch = s.charAt(i); - if (ch > 0x3040 && ch < 0x3097) { - text[i] = (char) (ch + 
0x60); - } else { - text[i] = ch; - } - } - return new String(text); - } - - private static int sharedPrefix(String left, String right) { - int len = left.length() < right.length() ? left.length() : right.length(); - for (int i = 0; i < len; i++) if (left.charAt(i) != right.charAt(i)) return i; - return len; - } - - void addMapping(int sourceId, int wordId) { - if (wordId <= lastWordId) { - throw new IllegalStateException( - "words out of order: " + wordId + " vs lastID: " + lastWordId); - } - - if (sourceId > lastSourceId) { - targetMapOffsets = ArrayUtil.grow(targetMapOffsets, sourceId + 1); - for (int i = lastSourceId + 1; i <= sourceId; i++) { - targetMapOffsets[i] = targetMapEndOffset; - } - } else if (sourceId != lastSourceId) { - throw new IllegalStateException( - "source ids not in increasing order: lastSourceId=" - + lastSourceId - + " vs sourceId=" - + sourceId); - } - - targetMap = ArrayUtil.grow(targetMap, targetMapEndOffset + 1); - targetMap[targetMapEndOffset] = wordId; - targetMapEndOffset++; - - lastSourceId = sourceId; - lastWordId = wordId; - } - - final String getBaseFileName() { - return implClazz.getName().replace('.', '/'); - } - - /** - * Write dictionary in file Dictionary format is: [Size of dictionary(int)], [entry:{left - * id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], - * [entry...], [entry...]..... - * - * @throws IOException if an I/O error occurs writing the dictionary files - */ - public void write(Path baseDir) throws IOException { - final String baseName = getBaseFileName(); - writeDictionary(baseDir.resolve(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX)); - writeTargetMap(baseDir.resolve(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX)); - writePosDict(baseDir.resolve(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX)); - } - - // TODO: maybe this int[] should instead be the output to the FST... 
- private void writeTargetMap(Path path) throws IOException { - Files.createDirectories(path.getParent()); - try (OutputStream os = Files.newOutputStream(path); - OutputStream bos = new BufferedOutputStream(os)) { - final DataOutput out = new OutputStreamDataOutput(bos); - CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION); - - final int numSourceIds = lastSourceId + 1; - out.writeVInt(targetMapEndOffset); // <-- size of main array - out.writeVInt(numSourceIds + 1); // <-- size of offset array (+ 1 more entry) - int prev = 0, sourceId = 0; - for (int ofs = 0; ofs < targetMapEndOffset; ofs++) { - final int val = targetMap[ofs], delta = val - prev; - assert delta >= 0; - if (ofs == targetMapOffsets[sourceId]) { - out.writeVInt((delta << 1) | 0x01); - sourceId++; - } else { - out.writeVInt((delta << 1)); - } - prev += delta; - } - if (sourceId != numSourceIds) { - throw new IllegalStateException( - "sourceId:" + sourceId + " != numSourceIds:" + numSourceIds); - } - } - } - - private void writePosDict(Path path) throws IOException { - Files.createDirectories(path.getParent()); - try (OutputStream os = Files.newOutputStream(path); - OutputStream bos = new BufferedOutputStream(os)) { - final DataOutput out = new OutputStreamDataOutput(bos); - CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION); - out.writeVInt(posDict.size()); - for (String s : posDict) { - if (s == null) { - out.writeByte((byte) 0); - out.writeByte((byte) 0); - out.writeByte((byte) 0); - } else { - String[] data = CSVUtil.parse(s); - if (data.length != 3) { - throw new IllegalArgumentException( - "Malformed pos/inflection: " + s + "; expected 3 characters"); - } - out.writeString(data[0]); - out.writeString(data[1]); - out.writeString(data[2]); - } - } - } - } - - private void writeDictionary(Path path) throws IOException { - Files.createDirectories(path.getParent()); - try (OutputStream os = Files.newOutputStream(path); - 
OutputStream bos = new BufferedOutputStream(os)) { - final DataOutput out = new OutputStreamDataOutput(bos); - CodecUtil.writeHeader(out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION); - out.writeVInt(buffer.position()); - final WritableByteChannel channel = Channels.newChannel(bos); - // Write Buffer - buffer.flip(); // set position to 0, set limit to current position - channel.write(buffer); - assert buffer.remaining() == 0L; - } - } -} diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/CharacterDefinitionWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/CharacterDefinitionWriter.java deleted file mode 100644 index 0afadeda83df..000000000000 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/CharacterDefinitionWriter.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.analysis.ja.util; - -import java.io.BufferedOutputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import org.apache.lucene.analysis.ja.dict.CharacterDefinition; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.store.DataOutput; -import org.apache.lucene.store.OutputStreamDataOutput; - -final class CharacterDefinitionWriter { - - private final byte[] characterCategoryMap = new byte[0x10000]; - - private final boolean[] invokeMap = new boolean[CharacterDefinition.CLASS_COUNT]; - private final boolean[] groupMap = new boolean[CharacterDefinition.CLASS_COUNT]; - - /** Constructor for building. TODO: remove write access */ - CharacterDefinitionWriter() { - Arrays.fill(characterCategoryMap, CharacterDefinition.DEFAULT); - } - - /** - * Put mapping from unicode code point to character class. - * - * @param codePoint code point - * @param characterClassName character class name - */ - void putCharacterCategory(int codePoint, String characterClassName) { - characterClassName = characterClassName.split(" ")[0]; // use first - // category - // class - - // Override Nakaguro - if (codePoint == 0x30FB) { - characterClassName = "SYMBOL"; - } - characterCategoryMap[codePoint] = CharacterDefinition.lookupCharacterClass(characterClassName); - } - - void putInvokeDefinition(String characterClassName, int invoke, int group, int length) { - final byte characterClass = CharacterDefinition.lookupCharacterClass(characterClassName); - invokeMap[characterClass] = invoke == 1; - groupMap[characterClass] = group == 1; - // TODO: length def ignored - } - - public void write(Path baseDir) throws IOException { - Path path = - baseDir.resolve( - CharacterDefinition.class.getName().replace('.', '/') - + CharacterDefinition.FILENAME_SUFFIX); - Files.createDirectories(path.getParent()); - try (OutputStream os = new 
BufferedOutputStream(Files.newOutputStream(path))) { - final DataOutput out = new OutputStreamDataOutput(os); - CodecUtil.writeHeader(out, CharacterDefinition.HEADER, CharacterDefinition.VERSION); - out.writeBytes(characterCategoryMap, 0, characterCategoryMap.length); - for (int i = 0; i < CharacterDefinition.CLASS_COUNT; i++) { - final byte b = (byte) ((invokeMap[i] ? 0x01 : 0x00) | (groupMap[i] ? 0x02 : 0x00)); - out.writeByte(b); - } - } - } -} diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ConnectionCostsBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ConnectionCostsBuilder.java index 8b518ee991a0..bfd8dd0d17c9 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ConnectionCostsBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ConnectionCostsBuilder.java @@ -22,12 +22,14 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import org.apache.lucene.analysis.ja.dict.ConnectionCosts; +import org.apache.lucene.analysis.morph.ConnectionCostsWriter; class ConnectionCostsBuilder { private ConnectionCostsBuilder() {} - public static ConnectionCostsWriter build(Path path) throws IOException { + public static ConnectionCostsWriter build(Path path) throws IOException { try (Reader reader = Files.newBufferedReader(path, StandardCharsets.US_ASCII); LineNumberReader lineReader = new LineNumberReader(reader)) { @@ -41,7 +43,8 @@ public static ConnectionCostsWriter build(Path path) throws IOException { assert forwardSize > 0 && backwardSize > 0; - ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize); + ConnectionCostsWriter costs = + new ConnectionCostsWriter<>(ConnectionCosts.class, forwardSize, backwardSize); while ((line = lineReader.readLine()) != null) { String[] fields = line.split("\\s+"); diff --git 
a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java index 72920baf9840..d547dcb122d9 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java @@ -20,6 +20,7 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.Locale; +import org.apache.lucene.analysis.ja.dict.DictionaryConstants; /** * Tool to build dictionaries. Usage: @@ -68,7 +69,8 @@ public static void build( new UnknownDictionaryBuilder(encoding).build(inputDir).write(outputDir); - ConnectionCostsBuilder.build(inputDir.resolve("matrix.def")).write(outputDir); + ConnectionCostsBuilder.build(inputDir.resolve("matrix.def")) + .write(outputDir, DictionaryConstants.CONN_COSTS_HEADER, DictionaryConstants.VERSION); } public static void main(String[] args) throws IOException { diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryEntryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryEntryWriter.java new file mode 100644 index 000000000000..618047825f2e --- /dev/null +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryEntryWriter.java @@ -0,0 +1,237 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja.util; + +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import org.apache.lucene.analysis.ja.dict.TokenInfoMorphData; +import org.apache.lucene.analysis.morph.DictionaryEntryWriter; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.ArrayUtil; + +/** Writes system dictionary entries */ +class TokenInfoDictionaryEntryWriter extends DictionaryEntryWriter { + private static final int ID_LIMIT = 8192; + + TokenInfoDictionaryEntryWriter(int size) { + super(size); + } + + /** + * put the entry in map + * + *

mecab-ipadic features + * + *

+   * 0   - surface
+   * 1   - left cost
+   * 2   - right cost
+   * 3   - word cost
+   * 4-9 - pos
+   * 10  - base form
+   * 11  - reading
+   * 12  - pronunciation
+   * 
+ */ + @Override + protected int putEntry(String[] entry) { + short leftId = Short.parseShort(entry[1]); + short rightId = Short.parseShort(entry[2]); + short wordCost = Short.parseShort(entry[3]); + + StringBuilder sb = new StringBuilder(); + + // build up the POS string + for (int i = 4; i < 8; i++) { + String part = entry[i]; + assert part.length() > 0; + if (!"*".equals(part)) { + if (sb.length() > 0) { + sb.append('-'); + } + sb.append(part); + } + } + + String posData = sb.toString(); + if (posData.isEmpty()) { + throw new IllegalArgumentException("POS fields are empty"); + } + sb.setLength(0); + sb.append(CSVUtil.quoteEscape(posData)); + sb.append(','); + if (!"*".equals(entry[8])) { + sb.append(CSVUtil.quoteEscape(entry[8])); + } + sb.append(','); + if (!"*".equals(entry[9])) { + sb.append(CSVUtil.quoteEscape(entry[9])); + } + String fullPOSData = sb.toString(); + + String baseForm = entry[10]; + String reading = entry[11]; + String pronunciation = entry[12]; + + // extend buffer if necessary + int left = buffer.remaining(); + // worst case: two short, 3 bytes, and features (all as utf-16) + int worstCase = 4 + 3 + 2 * (baseForm.length() + reading.length() + pronunciation.length()); + if (worstCase > left) { + ByteBuffer newBuffer = + ByteBuffer.allocateDirect(ArrayUtil.oversize(buffer.limit() + worstCase - left, 1)); + buffer.flip(); + newBuffer.put(buffer); + buffer = newBuffer; + } + + int flags = 0; + if (baseForm.isEmpty()) { + throw new IllegalArgumentException("base form is empty"); + } + if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) { + flags |= TokenInfoMorphData.HAS_BASEFORM; + } + if (!reading.equals(toKatakana(entry[0]))) { + flags |= TokenInfoMorphData.HAS_READING; + } + if (!pronunciation.equals(reading)) { + flags |= TokenInfoMorphData.HAS_PRONUNCIATION; + } + + if (leftId != rightId) { + throw new IllegalArgumentException("rightId != leftId: " + rightId + " " + leftId); + } + if (leftId >= ID_LIMIT) { + throw new 
IllegalArgumentException("leftId >= " + ID_LIMIT + ": " + leftId); + } + // add pos mapping + int toFill = 1 + leftId - posDict.size(); + for (int i = 0; i < toFill; i++) { + posDict.add(null); + } + + String existing = posDict.get(leftId); + if (existing != null && existing.equals(fullPOSData) == false) { + // TODO: test me + throw new IllegalArgumentException("Multiple entries found for leftID=" + leftId); + } + posDict.set(leftId, fullPOSData); + + buffer.putShort((short) (leftId << 3 | flags)); + buffer.putShort(wordCost); + + if ((flags & TokenInfoMorphData.HAS_BASEFORM) != 0) { + if (baseForm.length() >= 16) { + throw new IllegalArgumentException("Length of base form " + baseForm + " is >= 16"); + } + int shared = sharedPrefix(entry[0], baseForm); + int suffix = baseForm.length() - shared; + buffer.put((byte) (shared << 4 | suffix)); + for (int i = shared; i < baseForm.length(); i++) { + buffer.putChar(baseForm.charAt(i)); + } + } + + if ((flags & TokenInfoMorphData.HAS_READING) != 0) { + if (isKatakana(reading)) { + buffer.put((byte) (reading.length() << 1 | 1)); + writeKatakana(reading, buffer); + } else { + buffer.put((byte) (reading.length() << 1)); + for (int i = 0; i < reading.length(); i++) { + buffer.putChar(reading.charAt(i)); + } + } + } + + if ((flags & TokenInfoMorphData.HAS_PRONUNCIATION) != 0) { + // we can save 150KB here, but it makes the reader a little complicated. 
+ // int shared = sharedPrefix(reading, pronunciation); + // buffer.put((byte) shared); + // pronunciation = pronunciation.substring(shared); + if (isKatakana(pronunciation)) { + buffer.put((byte) (pronunciation.length() << 1 | 1)); + writeKatakana(pronunciation, buffer); + } else { + buffer.put((byte) (pronunciation.length() << 1)); + for (int i = 0; i < pronunciation.length(); i++) { + buffer.putChar(pronunciation.charAt(i)); + } + } + } + + return buffer.position(); + } + + private boolean isKatakana(String s) { + for (int i = 0; i < s.length(); i++) { + char ch = s.charAt(i); + if (ch < 0x30A0 || ch > 0x30FF) { + return false; + } + } + return true; + } + + private void writeKatakana(String s, ByteBuffer buffer) { + for (int i = 0; i < s.length(); i++) { + buffer.put((byte) (s.charAt(i) - 0x30A0)); + } + } + + private String toKatakana(String s) { + char[] text = new char[s.length()]; + for (int i = 0; i < s.length(); i++) { + char ch = s.charAt(i); + if (ch > 0x3040 && ch < 0x3097) { + text[i] = (char) (ch + 0x60); + } else { + text[i] = ch; + } + } + return new String(text); + } + + private static int sharedPrefix(String left, String right) { + int len = left.length() < right.length() ? 
left.length() : right.length(); + for (int i = 0; i < len; i++) if (left.charAt(i) != right.charAt(i)) return i; + return len; + } + + @Override + protected void writePosDict(OutputStream bos, DataOutput out) throws IOException { + out.writeVInt(posDict.size()); + for (String s : posDict) { + if (s == null) { + out.writeByte((byte) 0); + out.writeByte((byte) 0); + out.writeByte((byte) 0); + } else { + String[] data = CSVUtil.parse(s); + if (data.length != 3) { + throw new IllegalArgumentException( + "Malformed pos/inflection: " + s + "; expected 3 characters"); + } + out.writeString(data[0]); + out.writeString(data[1]); + out.writeString(data[2]); + } + } + } +} diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryWriter.java index 400c834cadfc..fbdf65f34ce7 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryWriter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryWriter.java @@ -20,14 +20,16 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.Objects; +import org.apache.lucene.analysis.ja.dict.DictionaryConstants; import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary; import org.apache.lucene.util.fst.FST; -class TokenInfoDictionaryWriter extends BinaryDictionaryWriter { +class TokenInfoDictionaryWriter + extends org.apache.lucene.analysis.morph.BinaryDictionaryWriter { private FST fst; TokenInfoDictionaryWriter(int size) { - super(TokenInfoDictionary.class, size); + super(TokenInfoDictionary.class, new TokenInfoDictionaryEntryWriter(size)); } public void setFST(FST fst) { @@ -35,9 +37,19 @@ public void setFST(FST fst) { this.fst = fst; } + @Override + protected void addMapping(int sourceId, int wordId) { + super.addMapping(sourceId, wordId); + } + @Override public void write(Path baseDir) 
throws IOException { - super.write(baseDir); + super.write( + baseDir, + DictionaryConstants.TARGETMAP_HEADER, + DictionaryConstants.POSDICT_HEADER, + DictionaryConstants.DICT_HEADER, + DictionaryConstants.VERSION); writeFST(baseDir.resolve(getBaseFileName() + TokenInfoDictionary.FST_FILENAME_SUFFIX)); } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryWriter.java index 6d80f513feca..84000fdf0de3 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryWriter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryWriter.java @@ -19,19 +19,28 @@ import java.io.IOException; import java.nio.file.Path; import org.apache.lucene.analysis.ja.dict.CharacterDefinition; +import org.apache.lucene.analysis.ja.dict.DictionaryConstants; import org.apache.lucene.analysis.ja.dict.UnknownDictionary; +import org.apache.lucene.analysis.morph.BinaryDictionaryWriter; +import org.apache.lucene.analysis.morph.CharacterDefinitionWriter; -class UnknownDictionaryWriter extends BinaryDictionaryWriter { - private final CharacterDefinitionWriter characterDefinition = new CharacterDefinitionWriter(); +class UnknownDictionaryWriter extends BinaryDictionaryWriter { + private final CharacterDefinitionWriter characterDefinition = + new CharacterDefinitionWriter<>( + CharacterDefinition.class, + CharacterDefinition.DEFAULT, + CharacterDefinition.CLASS_COUNT, + CharacterDefinition::lookupCharacterClass); public UnknownDictionaryWriter(int size) { - super(UnknownDictionary.class, size); + super(UnknownDictionary.class, new TokenInfoDictionaryEntryWriter(size)); } @Override public int put(String[] entry) { // Get wordId of current entry - int wordId = buffer.position(); + // int wordId = buffer.position(); + int wordId = entryWriter.currentPosition(); // Put entry int 
result = super.put(entry); @@ -58,7 +67,13 @@ public void putInvokeDefinition(String characterClassName, int invoke, int group @Override public void write(Path baseDir) throws IOException { - super.write(baseDir); - characterDefinition.write(baseDir); + super.write( + baseDir, + DictionaryConstants.TARGETMAP_HEADER, + DictionaryConstants.POSDICT_HEADER, + DictionaryConstants.DICT_HEADER, + DictionaryConstants.VERSION); + characterDefinition.write( + baseDir, DictionaryConstants.CHARDEF_HEADER, DictionaryConstants.VERSION); } } diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestExternalDictionary.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestExternalDictionary.java index bc44723996ac..5423d7a76cbf 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestExternalDictionary.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestExternalDictionary.java @@ -16,10 +16,10 @@ */ package org.apache.lucene.analysis.ja.dict; -import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.DICT_FILENAME_SUFFIX; -import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.POSDICT_FILENAME_SUFFIX; -import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX; import static org.apache.lucene.analysis.ja.dict.TokenInfoDictionary.FST_FILENAME_SUFFIX; +import static org.apache.lucene.analysis.morph.BinaryDictionary.DICT_FILENAME_SUFFIX; +import static org.apache.lucene.analysis.morph.BinaryDictionary.POSDICT_FILENAME_SUFFIX; +import static org.apache.lucene.analysis.morph.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX; import java.io.BufferedWriter; import java.nio.charset.StandardCharsets; diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestTokenInfoDictionary.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestTokenInfoDictionary.java index 
11de60c05fb4..0320d6e5f13b 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestTokenInfoDictionary.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestTokenInfoDictionary.java @@ -16,10 +16,10 @@ */ package org.apache.lucene.analysis.ja.dict; -import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.DICT_FILENAME_SUFFIX; -import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.POSDICT_FILENAME_SUFFIX; -import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX; import static org.apache.lucene.analysis.ja.dict.TokenInfoDictionary.FST_FILENAME_SUFFIX; +import static org.apache.lucene.analysis.morph.BinaryDictionary.DICT_FILENAME_SUFFIX; +import static org.apache.lucene.analysis.morph.BinaryDictionary.POSDICT_FILENAME_SUFFIX; +import static org.apache.lucene.analysis.morph.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX; import java.io.OutputStream; import java.io.OutputStreamWriter; @@ -137,17 +137,17 @@ public void testEnumerateAll() throws Exception { assertTrue(wordId > lastWordId); lastWordId = wordId; - String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length); + String baseForm = tid.getMorphAttributes().getBaseForm(wordId, chars, 0, chars.length); assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm)); - String inflectionForm = tid.getInflectionForm(wordId); + String inflectionForm = tid.getMorphAttributes().getInflectionForm(wordId); assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm)); if (inflectionForm != null) { // check that it's actually an ipadic inflection form assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm)); } - String inflectionType = tid.getInflectionType(wordId); + String inflectionType = tid.getMorphAttributes().getInflectionType(wordId); assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType)); if (inflectionType != null) { // check 
that it's actually an ipadic inflection type @@ -161,17 +161,18 @@ public void testEnumerateAll() throws Exception { tid.getWordCost(wordId); - String pos = tid.getPartOfSpeech(wordId); + String pos = tid.getMorphAttributes().getPartOfSpeech(wordId); assertNotNull(pos); assertTrue(UnicodeUtil.validUTF16String(pos)); // check that it's actually an ipadic pos tag assertNotNull(ToStringUtil.getPOSTranslation(pos)); - String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length); + String pronunciation = + tid.getMorphAttributes().getPronunciation(wordId, chars, 0, chars.length); assertNotNull(pronunciation); assertTrue(UnicodeUtil.validUTF16String(pronunciation)); - String reading = tid.getReading(wordId, chars, 0, chars.length); + String reading = tid.getMorphAttributes().getReading(wordId, chars, 0, chars.length); assertNotNull(reading); assertTrue(UnicodeUtil.validUTF16String(reading)); } diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUserDictionary.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUserDictionary.java index b08d75344f0f..537d621016e6 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUserDictionary.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUserDictionary.java @@ -54,12 +54,15 @@ public void testReadings() throws IOException { int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6); assertEquals(3, result.length); int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞 - assertEquals("ニホン", dictionary.getReading(wordIdNihon, "日本".toCharArray(), 0, 2)); + assertEquals( + "ニホン", dictionary.getMorphAttributes().getReading(wordIdNihon, "日本".toCharArray(), 0, 2)); result = dictionary.lookup("朝青龍".toCharArray(), 0, 3); assertEquals(1, result.length); int wordIdAsashoryu = result[0][0]; // wordId for 朝青龍 - assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu, 
"朝青龍".toCharArray(), 0, 3)); + assertEquals( + "アサショウリュウ", + dictionary.getMorphAttributes().getReading(wordIdAsashoryu, "朝青龍".toCharArray(), 0, 3)); } @Test @@ -68,7 +71,7 @@ public void testPartOfSpeech() throws IOException { int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6); assertEquals(3, result.length); int wordIdKeizai = result[1][0]; // wordId of 経済 in 日本経済新聞 - assertEquals("カスタム名詞", dictionary.getPartOfSpeech(wordIdKeizai)); + assertEquals("カスタム名詞", dictionary.getMorphAttributes().getPartOfSpeech(wordIdKeizai)); } @Test @@ -109,7 +112,7 @@ public void testSharp() throws IOException { for (String input : inputs) { System.out.println(input); int[][] result = dictionary.lookup(input.toCharArray(), 0, input.length()); - assertEquals("カスタム名刺", dictionary.getPartOfSpeech(result[0][0])); + assertEquals("カスタム名刺", dictionary.getMorphAttributes().getPartOfSpeech(result[0][0])); } } } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DecompoundToken.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DecompoundToken.java index 731378c9c4ae..a6af55f482e4 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DecompoundToken.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DecompoundToken.java @@ -16,7 +16,7 @@ */ package org.apache.lucene.analysis.ko; -import org.apache.lucene.analysis.ko.dict.Dictionary; +import org.apache.lucene.analysis.ko.dict.KoMorphData; /** A token that was generated from a compound. 
*/ public class DecompoundToken extends Token { @@ -71,7 +71,7 @@ public String getReading() { } @Override - public Dictionary.Morpheme[] getMorphemes() { + public KoMorphData.Morpheme[] getMorphemes() { return null; } } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DictionaryToken.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DictionaryToken.java index 82fe981c24df..f548f8007d30 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DictionaryToken.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DictionaryToken.java @@ -16,17 +16,17 @@ */ package org.apache.lucene.analysis.ko; -import org.apache.lucene.analysis.ko.dict.Dictionary; +import org.apache.lucene.analysis.ko.dict.KoMorphData; -/** A token stored in a {@link Dictionary}. */ +/** A token stored in a {@link KoMorphData}. */ public class DictionaryToken extends Token { private final int wordId; private final KoreanTokenizer.Type type; - private final Dictionary dictionary; + private final KoMorphData morphAtts; public DictionaryToken( KoreanTokenizer.Type type, - Dictionary dictionary, + KoMorphData morphAtts, int wordId, char[] surfaceForm, int offset, @@ -35,7 +35,7 @@ public DictionaryToken( int endOffset) { super(surfaceForm, offset, length, startOffset, endOffset); this.type = type; - this.dictionary = dictionary; + this.morphAtts = morphAtts; this.wordId = wordId; } @@ -54,7 +54,7 @@ public String toString() { + " wordId=" + wordId + " leftID=" - + dictionary.getLeftId(wordId) + + morphAtts.getLeftId(wordId) + ")"; } @@ -96,26 +96,26 @@ public boolean isUser() { @Override public POS.Type getPOSType() { - return dictionary.getPOSType(wordId); + return morphAtts.getPOSType(wordId); } @Override public POS.Tag getLeftPOS() { - return dictionary.getLeftPOS(wordId); + return morphAtts.getLeftPOS(wordId); } @Override public POS.Tag getRightPOS() { - return dictionary.getRightPOS(wordId); + return 
morphAtts.getRightPOS(wordId); } @Override public String getReading() { - return dictionary.getReading(wordId); + return morphAtts.getReading(wordId); } @Override - public Dictionary.Morpheme[] getMorphemes() { - return dictionary.getMorphemes(wordId, getSurfaceForm(), getOffset(), getLength()); + public KoMorphData.Morpheme[] getMorphemes() { + return morphAtts.getMorphemes(wordId, getSurfaceForm(), getOffset(), getLength()); } } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/GraphvizFormatter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/GraphvizFormatter.java index d8e3f7a9fafc..9beaf2e5b9b5 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/GraphvizFormatter.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/GraphvizFormatter.java @@ -21,7 +21,8 @@ import org.apache.lucene.analysis.ko.KoreanTokenizer.Position; import org.apache.lucene.analysis.ko.KoreanTokenizer.WrappedPositionArray; import org.apache.lucene.analysis.ko.dict.ConnectionCosts; -import org.apache.lucene.analysis.ko.dict.Dictionary; +import org.apache.lucene.analysis.ko.dict.KoMorphData; +import org.apache.lucene.analysis.morph.Dictionary; // TODO: would be nice to show 2nd best path in a diff't // color... 
@@ -140,7 +141,7 @@ private String formatNodes( attrs = ""; } - final Dictionary dict = tok.getDict(posData.backType[idx]); + final Dictionary dict = tok.getDict(posData.backType[idx]); final int wordCost = dict.getWordCost(posData.backID[idx]); final int bgCost = costs.get( diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java index 325fae710b90..028d9dd54d1f 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java @@ -26,13 +26,14 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.ko.dict.CharacterDefinition; import org.apache.lucene.analysis.ko.dict.ConnectionCosts; -import org.apache.lucene.analysis.ko.dict.Dictionary; +import org.apache.lucene.analysis.ko.dict.KoMorphData; import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary; import org.apache.lucene.analysis.ko.dict.TokenInfoFST; import org.apache.lucene.analysis.ko.dict.UnknownDictionary; import org.apache.lucene.analysis.ko.dict.UserDictionary; import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute; import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute; +import org.apache.lucene.analysis.morph.Dictionary; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -97,7 +98,8 @@ public enum DecompoundMode { private static final int MAX_UNKNOWN_WORD_LENGTH = 1024; private static final int MAX_BACKTRACE_GAP = 1024; - private final EnumMap dictionaryMap = new EnumMap<>(Type.class); + private final EnumMap> dictionaryMap = + new EnumMap<>(Type.class); private final TokenInfoFST fst; private final TokenInfoDictionary dictionary; @@ -406,10 
+408,10 @@ private int computeSpacePenalty(POS.Tag leftPOS, int numSpaces) { } private void add( - Dictionary dict, Position fromPosData, int wordPos, int endPos, int wordID, Type type) { - final POS.Tag leftPOS = dict.getLeftPOS(wordID); - final int wordCost = dict.getWordCost(wordID); - final int leftID = dict.getLeftId(wordID); + KoMorphData morphAtts, Position fromPosData, int wordPos, int endPos, int wordID, Type type) { + final POS.Tag leftPOS = morphAtts.getLeftPOS(wordID); + final int wordCost = morphAtts.getWordCost(wordID); + final int leftID = morphAtts.getLeftId(wordID); int leastCost = Integer.MAX_VALUE; int leastIDX = -1; assert fromPosData.count > 0; @@ -472,7 +474,14 @@ private void add( positions .get(endPos) - .add(leastCost, dict.getRightId(wordID), fromPosData.pos, wordPos, leastIDX, wordID, type); + .add( + leastCost, + morphAtts.getRightId(wordID), + fromPosData.pos, + wordPos, + leastIDX, + wordID, + type); } @Override @@ -796,7 +805,7 @@ private void parse() throws IOException { + (maxPosAhead + 1)); } add( - userDictionary, + userDictionary.getMorphAttributes(), posData, pos, maxPosAhead + 1, @@ -848,7 +857,7 @@ private void parse() throws IOException { } for (int ofs = 0; ofs < wordIdRef.length; ofs++) { add( - dictionary, + dictionary.getMorphAttributes(), posData, pos, posAhead + 1, @@ -922,7 +931,7 @@ && isPunctuation(ch, chType) == isPunct } for (int ofs = 0; ofs < wordIdRef.length; ofs++) { add( - unkDictionary, + unkDictionary.getMorphAttributes(), posData, pos, pos + unknownWordLength, @@ -1016,7 +1025,7 @@ private void backtrace(final Position endPosData, final int fromIDX) { final int fragmentOffset = backWordPos - lastBackTracePos; assert fragmentOffset >= 0; - final Dictionary dict = getDict(backType); + final Dictionary dict = getDict(backType); if (outputUnknownUnigrams && backType == Type.UNKNOWN) { // outputUnknownUnigrams converts unknown word into unigrams: @@ -1029,7 +1038,7 @@ private void backtrace(final Position 
endPosData, final int fromIDX) { final DictionaryToken token = new DictionaryToken( Type.UNKNOWN, - unkDictionary, + unkDictionary.getMorphAttributes(), CharacterDefinition.NGRAM, fragment, fragmentOffset + i, @@ -1045,7 +1054,7 @@ private void backtrace(final Position endPosData, final int fromIDX) { final DictionaryToken token = new DictionaryToken( backType, - dict, + dict.getMorphAttributes(), backID, fragment, fragmentOffset, @@ -1060,7 +1069,7 @@ private void backtrace(final Position endPosData, final int fromIDX) { } } } else { - Dictionary.Morpheme[] morphemes = token.getMorphemes(); + KoMorphData.Morpheme[] morphemes = token.getMorphemes(); if (morphemes == null) { pending.add(token); if (VERBOSE) { @@ -1071,7 +1080,7 @@ private void backtrace(final Position endPosData, final int fromIDX) { int posLen = 0; // decompose the compound for (int i = morphemes.length - 1; i >= 0; i--) { - final Dictionary.Morpheme morpheme = morphemes[i]; + final KoMorphData.Morpheme morpheme = morphemes[i]; final Token compoundToken; if (token.getPOSType() == POS.Type.COMPOUND) { assert endOffset - morpheme.surfaceForm.length() >= 0; @@ -1119,7 +1128,7 @@ private void backtrace(final Position endPosData, final int fromIDX) { DictionaryToken spaceToken = new DictionaryToken( Type.UNKNOWN, - unkDictionary, + unkDictionary.getMorphAttributes(), wordIdRef.ints[wordIdRef.offset], fragment, offset, @@ -1144,7 +1153,7 @@ private void backtrace(final Position endPosData, final int fromIDX) { positions.freeBefore(endPos); } - Dictionary getDict(Type type) { + Dictionary getDict(Type type) { return dictionaryMap.get(type); } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Token.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Token.java index 730262907d89..ed2d7ff6ba73 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Token.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Token.java @@ -16,7 +16,7 @@ 
*/ package org.apache.lucene.analysis.ko; -import org.apache.lucene.analysis.ko.dict.Dictionary.Morpheme; +import org.apache.lucene.analysis.ko.dict.KoMorphData; /** Analyzed token with morphological data. */ public abstract class Token { @@ -70,8 +70,11 @@ public String getSurfaceFormString() { /** Get the reading of the token. */ public abstract String getReading(); - /** Get the {@link Morpheme} decomposition of the token. */ - public abstract Morpheme[] getMorphemes(); + /** + * Get the {@link org.apache.lucene.analysis.ko.dict.KoMorphData.Morpheme} decomposition of the + * token. + */ + public abstract KoMorphData.Morpheme[] getMorphemes(); /** Get the start offset of the term in the analyzed text. */ public int getStartOffset() { diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java deleted file mode 100644 index 4d10e5200556..000000000000 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.analysis.ko.dict; - -import java.io.BufferedInputStream; -import java.io.EOFException; -import java.io.IOException; -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.nio.channels.Channels; -import java.nio.channels.ReadableByteChannel; -import org.apache.lucene.analysis.ko.POS; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.InputStreamDataInput; -import org.apache.lucene.util.IOSupplier; -import org.apache.lucene.util.IntsRef; - -/** Base class for a binary-encoded in-memory dictionary. */ -public abstract class BinaryDictionary implements Dictionary { - - public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat"; - public static final String DICT_FILENAME_SUFFIX = "$buffer.dat"; - public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat"; - - public static final String DICT_HEADER = "ko_dict"; - public static final String TARGETMAP_HEADER = "ko_dict_map"; - public static final String POSDICT_HEADER = "ko_dict_pos"; - public static final int VERSION = 1; - - private final ByteBuffer buffer; - private final int[] targetMapOffsets, targetMap; - private final POS.Tag[] posDict; - - protected BinaryDictionary( - IOSupplier targetMapResource, - IOSupplier posResource, - IOSupplier dictResource) - throws IOException { - try (InputStream mapIS = new BufferedInputStream(targetMapResource.get())) { - DataInput in = new InputStreamDataInput(mapIS); - CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION); - this.targetMap = new int[in.readVInt()]; - this.targetMapOffsets = new int[in.readVInt()]; - populateTargetMap(in, this.targetMap, this.targetMapOffsets); - } - - try (InputStream posIS = new BufferedInputStream(posResource.get())) { - DataInput in = new InputStreamDataInput(posIS); - CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION); - int posSize = in.readVInt(); - this.posDict = new POS.Tag[posSize]; - for 
(int j = 0; j < posSize; j++) { - posDict[j] = POS.resolveTag(in.readByte()); - } - } - - // no buffering here, as we load in one large buffer - try (InputStream dictIS = dictResource.get()) { - DataInput in = new InputStreamDataInput(dictIS); - CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION); - final int size = in.readVInt(); - final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size); - final ReadableByteChannel channel = Channels.newChannel(dictIS); - final int read = channel.read(tmpBuffer); - if (read != size) { - throw new EOFException("Cannot read whole dictionary"); - } - this.buffer = tmpBuffer.asReadOnlyBuffer(); - } - } - - private static void populateTargetMap(DataInput in, int[] targetMap, int[] targetMapOffsets) - throws IOException { - int accum = 0, sourceId = 0; - for (int ofs = 0; ofs < targetMap.length; ofs++) { - final int val = in.readVInt(); - if ((val & 0x01) != 0) { - targetMapOffsets[sourceId] = ofs; - sourceId++; - } - accum += val >>> 1; - targetMap[ofs] = accum; - } - if (sourceId + 1 != targetMapOffsets.length) - throw new IOException( - "targetMap file format broken; targetMap.length=" - + targetMap.length - + ", targetMapOffsets.length=" - + targetMapOffsets.length - + ", sourceId=" - + sourceId); - targetMapOffsets[sourceId] = targetMap.length; - } - - public void lookupWordIds(int sourceId, IntsRef ref) { - ref.ints = targetMap; - ref.offset = targetMapOffsets[sourceId]; - // targetMapOffsets always has one more entry pointing behind last: - ref.length = targetMapOffsets[sourceId + 1] - ref.offset; - } - - @Override - public int getLeftId(int wordId) { - return buffer.getShort(wordId) >>> 2; - } - - @Override - public int getRightId(int wordId) { - return buffer.getShort(wordId + 2) >>> 2; // Skip left id - } - - @Override - public int getWordCost(int wordId) { - return buffer.getShort(wordId + 4); // Skip left and right id - } - - @Override - public POS.Type getPOSType(int wordId) { - byte value = (byte) 
(buffer.getShort(wordId) & 3); - return POS.resolveType(value); - } - - @Override - public POS.Tag getLeftPOS(int wordId) { - return posDict[getLeftId(wordId)]; - } - - @Override - public POS.Tag getRightPOS(int wordId) { - POS.Type type = getPOSType(wordId); - if (type == POS.Type.MORPHEME || type == POS.Type.COMPOUND || hasSinglePOS(wordId)) { - return getLeftPOS(wordId); - } else { - byte value = buffer.get(wordId + 6); - return POS.resolveTag(value); - } - } - - @Override - public String getReading(int wordId) { - if (hasReadingData(wordId)) { - int offset = wordId + 6; - return readString(offset); - } - return null; - } - - @Override - public Morpheme[] getMorphemes(int wordId, char[] surfaceForm, int off, int len) { - POS.Type posType = getPOSType(wordId); - if (posType == POS.Type.MORPHEME) { - return null; - } - int offset = wordId + 6; - boolean hasSinglePos = hasSinglePOS(wordId); - if (hasSinglePos == false) { - offset++; // skip rightPOS - } - int length = buffer.get(offset++); - if (length == 0) { - return null; - } - Morpheme[] morphemes = new Morpheme[length]; - int surfaceOffset = 0; - final POS.Tag leftPOS = getLeftPOS(wordId); - for (int i = 0; i < length; i++) { - final String form; - final POS.Tag tag = hasSinglePos ? 
leftPOS : POS.resolveTag(buffer.get(offset++)); - if (posType == POS.Type.INFLECT) { - form = readString(offset); - offset += form.length() * 2 + 1; - } else { - int formLen = buffer.get(offset++); - form = new String(surfaceForm, off + surfaceOffset, formLen); - surfaceOffset += formLen; - } - morphemes[i] = new Morpheme(tag, form); - } - return morphemes; - } - - private String readString(int offset) { - int strOffset = offset; - int len = buffer.get(strOffset++); - char[] text = new char[len]; - for (int i = 0; i < len; i++) { - text[i] = buffer.getChar(strOffset + (i << 1)); - } - return new String(text); - } - - private boolean hasSinglePOS(int wordId) { - return (buffer.getShort(wordId + 2) & HAS_SINGLE_POS) != 0; - } - - private boolean hasReadingData(int wordId) { - return (buffer.getShort(wordId + 2) & HAS_READING) != 0; - } - - /** flag that the entry has a single part of speech (leftPOS) */ - public static final int HAS_SINGLE_POS = 1; - - /** flag that the entry has reading data. otherwise reading is surface form */ - public static final int HAS_READING = 2; -} diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java index 5e2e48932bab..bcdcfde2f064 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java @@ -16,20 +16,13 @@ */ package org.apache.lucene.analysis.ko.dict; -import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.util.IOUtils; /** Character category data. 
*/ -public final class CharacterDefinition { - - public static final String FILENAME_SUFFIX = ".dat"; - public static final String HEADER = "ko_cd"; - public static final int VERSION = 1; +public final class CharacterDefinition + extends org.apache.lucene.analysis.morph.CharacterDefinition { public static final int CLASS_COUNT = CharacterClass.values().length; @@ -51,11 +44,6 @@ enum CharacterClass { HANJANUMERIC } - private final byte[] characterCategoryMap = new byte[0x10000]; - - private final boolean[] invokeMap = new boolean[CLASS_COUNT]; - private final boolean[] groupMap = new boolean[CLASS_COUNT]; - // the classes: public static final byte NGRAM = (byte) CharacterClass.NGRAM.ordinal(); public static final byte DEFAULT = (byte) CharacterClass.DEFAULT.ordinal(); @@ -73,16 +61,11 @@ enum CharacterClass { public static final byte HANJANUMERIC = (byte) CharacterClass.HANJANUMERIC.ordinal(); private CharacterDefinition() throws IOException { - try (InputStream is = new BufferedInputStream(getClassResource())) { - final DataInput in = new InputStreamDataInput(is); - CodecUtil.checkHeader(in, HEADER, VERSION, VERSION); - in.readBytes(characterCategoryMap, 0, characterCategoryMap.length); - for (int i = 0; i < CLASS_COUNT; i++) { - final byte b = in.readByte(); - invokeMap[i] = (b & 0x01) != 0; - groupMap[i] = (b & 0x02) != 0; - } - } + super( + CharacterDefinition::getClassResource, + DictionaryConstants.CHARDEF_HEADER, + DictionaryConstants.VERSION, + CharacterDefinition.CLASS_COUNT); } private static InputStream getClassResource() throws IOException { @@ -91,18 +74,6 @@ private static InputStream getClassResource() throws IOException { CharacterDefinition.class.getResourceAsStream(resourcePath), resourcePath); } - public byte getCharacterClass(char c) { - return characterCategoryMap[c]; - } - - public boolean isInvoke(char c) { - return invokeMap[characterCategoryMap[c]]; - } - - public boolean isGroup(char c) { - return groupMap[characterCategoryMap[c]]; - } - 
public boolean isHanja(char c) { final byte characterClass = getCharacterClass(c); return characterClass == HANJA || characterClass == HANJANUMERIC; diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java index 61579aef8450..3b13a86b90a7 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java @@ -16,27 +16,15 @@ */ package org.apache.lucene.analysis.ko.dict; -import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; -import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.util.IOSupplier; import org.apache.lucene.util.IOUtils; /** n-gram connection cost data */ -public final class ConnectionCosts { - - public static final String FILENAME_SUFFIX = ".dat"; - public static final String HEADER = "ko_cc"; - public static final int VERSION = 1; - - private final ByteBuffer buffer; - private final int forwardSize; +public final class ConnectionCosts extends org.apache.lucene.analysis.morph.ConnectionCosts { /** * Create a {@link ConnectionCosts} from an external resource path. 
@@ -53,24 +41,8 @@ private ConnectionCosts() throws IOException { } private ConnectionCosts(IOSupplier connectionCostResource) throws IOException { - try (InputStream is = new BufferedInputStream(connectionCostResource.get())) { - final DataInput in = new InputStreamDataInput(is); - CodecUtil.checkHeader(in, HEADER, VERSION, VERSION); - this.forwardSize = in.readVInt(); - int backwardSize = in.readVInt(); - int size = forwardSize * backwardSize; - - // copy the matrix into a direct byte buffer - final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size * 2); - int accum = 0; - for (int j = 0; j < backwardSize; j++) { - for (int i = 0; i < forwardSize; i++) { - accum += in.readZInt(); - tmpBuffer.putShort((short) accum); - } - } - buffer = tmpBuffer.asReadOnlyBuffer(); - } + super( + connectionCostResource, DictionaryConstants.CONN_COSTS_HEADER, DictionaryConstants.VERSION); } private static InputStream getClassResource() throws IOException { @@ -79,12 +51,6 @@ private static InputStream getClassResource() throws IOException { ConnectionCosts.class.getResourceAsStream(resourcePath), resourcePath); } - public int get(int forwardId, int backwardId) { - // map 2d matrix into a single dimension short array - int offset = (backwardId * forwardSize + forwardId) * 2; - return buffer.getShort(offset); - } - public static ConnectionCosts getInstance() { return SingletonHolder.INSTANCE; } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/DictionaryConstants.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/DictionaryConstants.java new file mode 100644 index 000000000000..4f8761e59e7e --- /dev/null +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/DictionaryConstants.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ko.dict; + +/** Dictionary constants */ +public final class DictionaryConstants { + /** Codec header of the dictionary file. */ + public static final String DICT_HEADER = "ko_dict"; + /** Codec header of the dictionary mapping file. */ + public static final String TARGETMAP_HEADER = "ko_dict_map"; + /** Codec header of the POS dictionary file. */ + public static final String POSDICT_HEADER = "ko_dict_pos"; + /** Codec header of the connection costs file. 
*/ + public static final String CONN_COSTS_HEADER = "ko_cc"; + /** Codec header of the character definition file */ + public static final String CHARDEF_HEADER = "ko_cd"; + /** Codec version of the binary dictionary */ + public static final int VERSION = 1; +} diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/Dictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/KoMorphData.java similarity index 52% rename from lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/Dictionary.java rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/KoMorphData.java index b25e1b5ff5ed..0887a7f0c428 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/Dictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/KoMorphData.java @@ -16,51 +16,47 @@ */ package org.apache.lucene.analysis.ko.dict; -import org.apache.lucene.analysis.ko.POS.Tag; -import org.apache.lucene.analysis.ko.POS.Type; +import org.apache.lucene.analysis.ko.POS; +import org.apache.lucene.analysis.morph.MorphData; -/** Dictionary interface for retrieving morphological data by id. */ -public interface Dictionary { +/** Represents Korean morphological information. */ +public interface KoMorphData extends MorphData { /** A morpheme extracted from a compound token. 
*/ class Morpheme { - public final Tag posTag; + public final POS.Tag posTag; public final String surfaceForm; - public Morpheme(Tag posTag, String surfaceForm) { + public Morpheme(POS.Tag posTag, String surfaceForm) { this.posTag = posTag; this.surfaceForm = surfaceForm; } } - /** Get left id of specified word */ - int getLeftId(int wordId); - - /** Get right id of specified word */ - int getRightId(int wordId); - - /** Get word cost of specified word */ - int getWordCost(int wordId); - - /** Get the {@link Type} of specified word (morpheme, compound, inflect or pre-analysis) */ - Type getPOSType(int wordId); + /** + * Get the {@link org.apache.lucene.analysis.ko.POS.Type} of specified word (morpheme, compound, + * inflect or pre-analysis) + */ + POS.Type getPOSType(int morphId); /** - * Get the left {@link Tag} of specfied word. + * Get the left {@link org.apache.lucene.analysis.ko.POS.Tag} of specified word. * - *

For {@link Type#MORPHEME} and {@link Type#COMPOUND} the left and right POS are the same. + *

For {@link org.apache.lucene.analysis.ko.POS.Type#MORPHEME} and {@link + * org.apache.lucene.analysis.ko.POS.Type#COMPOUND} the left and right POS are the same. */ - Tag getLeftPOS(int wordId); + POS.Tag getLeftPOS(int morphId); /** - * Get the right {@link Tag} of specfied word. + * Get the right {@link org.apache.lucene.analysis.ko.POS.Tag} of specified word. * - *

For {@link Type#MORPHEME} and {@link Type#COMPOUND} the left and right POS are the same. + *

For {@link org.apache.lucene.analysis.ko.POS.Type#MORPHEME} and {@link + * org.apache.lucene.analysis.ko.POS.Type#COMPOUND} the left and right POS are the same. */ - Tag getRightPOS(int wordId); + POS.Tag getRightPOS(int morphId); /** Get the reading of specified word (mainly used for Hanja to Hangul conversion). */ - String getReading(int wordId); + String getReading(int morphId); /** Get the morphemes of specified word (e.g. 가깝으나: 가깝 + 으나). */ - Morpheme[] getMorphemes(int wordId, char[] surfaceForm, int off, int len); + Morpheme[] getMorphemes(int morphId, char[] surfaceForm, int off, int len); } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java index 1f936b44724b..b8132cea61b5 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java @@ -21,6 +21,7 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import org.apache.lucene.analysis.morph.BinaryDictionary; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.util.IOSupplier; @@ -32,11 +33,12 @@ * Binary dictionary implementation for a known-word dictionary model: Words are encoded into an FST * mapping to a list of wordIDs. 
*/ -public final class TokenInfoDictionary extends BinaryDictionary { +public final class TokenInfoDictionary extends BinaryDictionary { public static final String FST_FILENAME_SUFFIX = "$fst.dat"; private final TokenInfoFST fst; + private final TokenInfoMorphData morphAtts; private TokenInfoDictionary() throws IOException { this( @@ -70,7 +72,13 @@ private TokenInfoDictionary( IOSupplier dictResource, IOSupplier fstResource) throws IOException { - super(targetMapResource, posResource, dictResource); + super( + targetMapResource, + dictResource, + DictionaryConstants.TARGETMAP_HEADER, + DictionaryConstants.DICT_HEADER, + DictionaryConstants.VERSION); + this.morphAtts = new TokenInfoMorphData(buffer, posResource); FST fst; try (InputStream is = new BufferedInputStream(fstResource.get())) { DataInput in = new InputStreamDataInput(is); @@ -93,6 +101,11 @@ public static TokenInfoDictionary getInstance() { return SingletonHolder.INSTANCE; } + @Override + public TokenInfoMorphData getMorphAttributes() { + return morphAtts; + } + private static class SingletonHolder { static final TokenInfoDictionary INSTANCE; diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoMorphData.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoMorphData.java new file mode 100644 index 000000000000..c8046b2e6041 --- /dev/null +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoMorphData.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ko.dict; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import org.apache.lucene.analysis.ko.POS; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.util.IOSupplier; + +/** Morphological information for system dictionary. */ +public class TokenInfoMorphData implements KoMorphData { + + private final ByteBuffer buffer; + private final POS.Tag[] posDict; + + TokenInfoMorphData(ByteBuffer buffer, IOSupplier posResource) throws IOException { + this.buffer = buffer; + try (InputStream posIS = new BufferedInputStream(posResource.get())) { + DataInput in = new InputStreamDataInput(posIS); + CodecUtil.checkHeader( + in, + DictionaryConstants.POSDICT_HEADER, + DictionaryConstants.VERSION, + DictionaryConstants.VERSION); + int posSize = in.readVInt(); + this.posDict = new POS.Tag[posSize]; + for (int j = 0; j < posSize; j++) { + posDict[j] = POS.resolveTag(in.readByte()); + } + } + } + + @Override + public int getLeftId(int morphId) { + return buffer.getShort(morphId) >>> 2; + } + + @Override + public int getRightId(int morphId) { + return buffer.getShort(morphId + 2) >>> 2; // Skip left id + } + + @Override + public int getWordCost(int morphId) { + return buffer.getShort(morphId + 4); // Skip left and right id + } + + @Override + public POS.Type getPOSType(int morphId) { + byte value = (byte) (buffer.getShort(morphId) & 3); + 
return POS.resolveType(value); + } + + @Override + public POS.Tag getLeftPOS(int morphId) { + return posDict[getLeftId(morphId)]; + } + + @Override + public POS.Tag getRightPOS(int morphId) { + POS.Type type = getPOSType(morphId); + if (type == POS.Type.MORPHEME || type == POS.Type.COMPOUND || hasSinglePOS(morphId)) { + return getLeftPOS(morphId); + } else { + byte value = buffer.get(morphId + 6); + return POS.resolveTag(value); + } + } + + @Override + public String getReading(int morphId) { + if (hasReadingData(morphId)) { + int offset = morphId + 6; + return readString(offset); + } + return null; + } + + @Override + public Morpheme[] getMorphemes(int morphId, char[] surfaceForm, int off, int len) { + POS.Type posType = getPOSType(morphId); + if (posType == POS.Type.MORPHEME) { + return null; + } + int offset = morphId + 6; + boolean hasSinglePos = hasSinglePOS(morphId); + if (hasSinglePos == false) { + offset++; // skip rightPOS + } + int length = buffer.get(offset++); + if (length == 0) { + return null; + } + Morpheme[] morphemes = new Morpheme[length]; + int surfaceOffset = 0; + final POS.Tag leftPOS = getLeftPOS(morphId); + for (int i = 0; i < length; i++) { + final String form; + final POS.Tag tag = hasSinglePos ? 
leftPOS : POS.resolveTag(buffer.get(offset++)); + if (posType == POS.Type.INFLECT) { + form = readString(offset); + offset += form.length() * 2 + 1; + } else { + int formLen = buffer.get(offset++); + form = new String(surfaceForm, off + surfaceOffset, formLen); + surfaceOffset += formLen; + } + morphemes[i] = new Morpheme(tag, form); + } + return morphemes; + } + + private String readString(int offset) { + int strOffset = offset; + int len = buffer.get(strOffset++); + char[] text = new char[len]; + for (int i = 0; i < len; i++) { + text[i] = buffer.getChar(strOffset + (i << 1)); + } + return new String(text); + } + + private boolean hasSinglePOS(int wordId) { + return (buffer.getShort(wordId + 2) & HAS_SINGLE_POS) != 0; + } + + private boolean hasReadingData(int wordId) { + return (buffer.getShort(wordId + 2) & HAS_READING) != 0; + } + + /** flag that the entry has a single part of speech (leftPOS) */ + public static final int HAS_SINGLE_POS = 1; + + /** flag that the entry has reading data. otherwise reading is surface form */ + public static final int HAS_READING = 2; +} diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java index f2aa89ee8c54..4b45fd332585 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java @@ -20,11 +20,14 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import org.apache.lucene.analysis.morph.BinaryDictionary; +import org.apache.lucene.util.IOSupplier; import org.apache.lucene.util.IOUtils; /** Dictionary for unknown-word handling. 
*/ -public final class UnknownDictionary extends BinaryDictionary { +public final class UnknownDictionary extends BinaryDictionary { private final CharacterDefinition characterDefinition = CharacterDefinition.getInstance(); + private final UnknownMorphData morphAtts; /** * Create a {@link UnknownDictionary} from an external resource path. @@ -35,19 +38,33 @@ public final class UnknownDictionary extends BinaryDictionary { * @throws IOException if resource was not found or broken */ public UnknownDictionary(Path targetMapFile, Path posDictFile, Path dictFile) throws IOException { - super( + this( () -> Files.newInputStream(targetMapFile), () -> Files.newInputStream(posDictFile), () -> Files.newInputStream(dictFile)); } private UnknownDictionary() throws IOException { - super( + this( () -> getClassResource(TARGETMAP_FILENAME_SUFFIX), () -> getClassResource(POSDICT_FILENAME_SUFFIX), () -> getClassResource(DICT_FILENAME_SUFFIX)); } + private UnknownDictionary( + IOSupplier targetMapResource, + IOSupplier posResource, + IOSupplier dictResource) + throws IOException { + super( + targetMapResource, + dictResource, + DictionaryConstants.TARGETMAP_HEADER, + DictionaryConstants.DICT_HEADER, + DictionaryConstants.VERSION); + this.morphAtts = new UnknownMorphData(buffer, posResource); + } + private static InputStream getClassResource(String suffix) throws IOException { final String resourcePath = UnknownDictionary.class.getSimpleName() + suffix; return IOUtils.requireResourceNonNull( @@ -63,13 +80,8 @@ public static UnknownDictionary getInstance() { } @Override - public String getReading(int wordId) { - return null; - } - - @Override - public Morpheme[] getMorphemes(int wordId, char[] surfaceForm, int off, int len) { - return null; + public UnknownMorphData getMorphAttributes() { + return morphAtts; } private static class SingletonHolder { diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownMorphData.java 
b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownMorphData.java new file mode 100644 index 000000000000..dafb174f8d55 --- /dev/null +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownMorphData.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ko.dict; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import org.apache.lucene.util.IOSupplier; + +/** Morphological information for unk dictionary. 
*/ +final class UnknownMorphData extends TokenInfoMorphData { + UnknownMorphData(ByteBuffer buffer, IOSupplier posResource) throws IOException { + super(buffer, posResource); + } + + @Override + public String getReading(int wordId) { + return null; + } + + @Override + public Morpheme[] getMorphemes(int wordId, char[] surfaceForm, int off, int len) { + return null; + } +} diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java index f8fcefdd295b..58a233112aa5 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java @@ -22,7 +22,7 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.List; -import org.apache.lucene.analysis.ko.POS; +import org.apache.lucene.analysis.morph.Dictionary; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; @@ -32,15 +32,10 @@ * Class for building a User Dictionary. This class allows for adding custom nouns (세종) or compounds * (세종시 세종 시). */ -public final class UserDictionary implements Dictionary { +public final class UserDictionary implements Dictionary { // text -> wordID private final TokenInfoFST fst; - private static final int WORD_COST = -100000; - - // NNG left - private static final short LEFT_ID = 1781; - // NNG right private static final short RIGHT_ID = 3533; // NNG right with hangul and a coda on the last char @@ -48,9 +43,7 @@ public final class UserDictionary implements Dictionary { // NNG right with hangul and no coda on the last char private static final short RIGHT_ID_F = 3534; - // length, length... 
indexed by compound ID or null for simple noun - private final int[][] segmentations; - private final short[] rightIds; + private UserMorphData morphAtts; public static UserDictionary open(Reader reader) throws IOException { @@ -86,8 +79,8 @@ private UserDictionary(List entries) throws IOException { IntsRefBuilder scratch = new IntsRefBuilder(); String lastToken = null; - List segmentations = new ArrayList<>(entries.size()); - List rightIds = new ArrayList<>(entries.size()); + List _segmentations = new ArrayList<>(entries.size()); + List _rightIds = new ArrayList<>(entries.size()); long ord = 0; for (String entry : entries) { String[] splits = entry.split("\\s+"); @@ -98,16 +91,16 @@ private UserDictionary(List entries) throws IOException { char lastChar = entry.charAt(entry.length() - 1); if (charDef.isHangul(lastChar)) { if (charDef.hasCoda(lastChar)) { - rightIds.add(RIGHT_ID_T); + _rightIds.add(RIGHT_ID_T); } else { - rightIds.add(RIGHT_ID_F); + _rightIds.add(RIGHT_ID_F); } } else { - rightIds.add(RIGHT_ID); + _rightIds.add(RIGHT_ID); } if (splits.length == 1) { - segmentations.add(null); + _segmentations.add(null); } else { int[] length = new int[splits.length - 1]; int offset = 0; @@ -123,7 +116,7 @@ private UserDictionary(List entries) throws IOException { + token + ")"); } - segmentations.add(length); + _segmentations.add(length); } // add mapping to FST @@ -137,11 +130,12 @@ private UserDictionary(List entries) throws IOException { ord++; } this.fst = new TokenInfoFST(fstCompiler.compile()); - this.segmentations = segmentations.toArray(new int[segmentations.size()][]); - this.rightIds = new short[rightIds.size()]; - for (int i = 0; i < rightIds.size(); i++) { - this.rightIds[i] = rightIds.get(i); + int[][] segmentations = _segmentations.toArray(new int[_segmentations.size()][]); + short[] rightIds = new short[_rightIds.size()]; + for (int i = 0; i < _rightIds.size(); i++) { + rightIds[i] = _rightIds.get(i); } + this.morphAtts = new 
UserMorphData(segmentations, rightIds); } public TokenInfoFST getFST() { @@ -149,57 +143,8 @@ public TokenInfoFST getFST() { } @Override - public int getLeftId(int wordId) { - return LEFT_ID; - } - - @Override - public int getRightId(int wordId) { - return rightIds[wordId]; - } - - @Override - public int getWordCost(int wordId) { - return WORD_COST; - } - - @Override - public POS.Type getPOSType(int wordId) { - if (segmentations[wordId] == null) { - return POS.Type.MORPHEME; - } else { - return POS.Type.COMPOUND; - } - } - - @Override - public POS.Tag getLeftPOS(int wordId) { - return POS.Tag.NNG; - } - - @Override - public POS.Tag getRightPOS(int wordId) { - return POS.Tag.NNG; - } - - @Override - public String getReading(int wordId) { - return null; - } - - @Override - public Morpheme[] getMorphemes(int wordId, char[] surfaceForm, int off, int len) { - int[] segs = segmentations[wordId]; - if (segs == null) { - return null; - } - int offset = 0; - Morpheme[] morphemes = new Morpheme[segs.length]; - for (int i = 0; i < segs.length; i++) { - morphemes[i] = new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i])); - offset += segs[i]; - } - return morphemes; + public UserMorphData getMorphAttributes() { + return morphAtts; } /** diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserMorphData.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserMorphData.java new file mode 100644 index 000000000000..2056bd0829c9 --- /dev/null +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserMorphData.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ko.dict; + +import org.apache.lucene.analysis.ko.POS; + +/** Morphological information for user dictionary. */ +final class UserMorphData implements KoMorphData { + private static final int WORD_COST = -100000; + + // NNG left + private static final short LEFT_ID = 1781; + + // length, length... indexed by compound ID or null for simple noun + private final int[][] segmentations; + private final short[] rightIds; + + UserMorphData(int[][] segmentations, short[] rightIds) { + this.segmentations = segmentations; + this.rightIds = rightIds; + } + + @Override + public int getLeftId(int morphId) { + return LEFT_ID; + } + + @Override + public int getRightId(int morphId) { + return rightIds[morphId]; + } + + @Override + public int getWordCost(int morphId) { + return WORD_COST; + } + + @Override + public POS.Type getPOSType(int morphId) { + if (segmentations[morphId] == null) { + return POS.Type.MORPHEME; + } else { + return POS.Type.COMPOUND; + } + } + + @Override + public POS.Tag getLeftPOS(int morphId) { + return POS.Tag.NNG; + } + + @Override + public POS.Tag getRightPOS(int morphId) { + return POS.Tag.NNG; + } + + @Override + public String getReading(int morphId) { + return null; + } + + @Override + public Morpheme[] getMorphemes(int morphId, char[] surfaceForm, int off, int len) { + int[] segs = segmentations[morphId]; + if (segs == null) { + return 
null; + } + int offset = 0; + Morpheme[] morphemes = new Morpheme[segs.length]; + for (int i = 0; i < segs.length; i++) { + morphemes[i] = new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i])); + offset += segs[i]; + } + return morphemes; + } +} diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/PartOfSpeechAttribute.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/PartOfSpeechAttribute.java index c9fb33b17c2d..1e28155e667e 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/PartOfSpeechAttribute.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/PartOfSpeechAttribute.java @@ -19,7 +19,7 @@ import org.apache.lucene.analysis.ko.POS.Tag; import org.apache.lucene.analysis.ko.POS.Type; import org.apache.lucene.analysis.ko.Token; -import org.apache.lucene.analysis.ko.dict.Dictionary.Morpheme; +import org.apache.lucene.analysis.ko.dict.KoMorphData; import org.apache.lucene.util.Attribute; /** @@ -37,8 +37,11 @@ public interface PartOfSpeechAttribute extends Attribute { /** Get the right part of speech of the token. */ Tag getRightPOS(); - /** Get the {@link Morpheme} decomposition of the token. */ - Morpheme[] getMorphemes(); + /** + * Get the {@link org.apache.lucene.analysis.ko.dict.KoMorphData.Morpheme} decomposition of the + * token. + */ + KoMorphData.Morpheme[] getMorphemes(); /** Set the current token. 
*/ void setToken(Token token); diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/PartOfSpeechAttributeImpl.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/PartOfSpeechAttributeImpl.java index a1b04cb7f057..a59999db557f 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/PartOfSpeechAttributeImpl.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/PartOfSpeechAttributeImpl.java @@ -19,7 +19,7 @@ import org.apache.lucene.analysis.ko.POS.Tag; import org.apache.lucene.analysis.ko.POS.Type; import org.apache.lucene.analysis.ko.Token; -import org.apache.lucene.analysis.ko.dict.Dictionary.Morpheme; +import org.apache.lucene.analysis.ko.dict.KoMorphData; import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeReflector; @@ -47,7 +47,7 @@ public Tag getRightPOS() { } @Override - public Morpheme[] getMorphemes() { + public KoMorphData.Morpheme[] getMorphemes() { return token == null ? 
null : token.getMorphemes(); } @@ -76,12 +76,12 @@ public void reflectWith(AttributeReflector reflector) { reflector.reflect(PartOfSpeechAttribute.class, "morphemes", displayMorphemes(getMorphemes())); } - private String displayMorphemes(Morpheme[] morphemes) { + private String displayMorphemes(KoMorphData.Morpheme[] morphemes) { if (morphemes == null) { return null; } StringBuilder builder = new StringBuilder(); - for (Morpheme morpheme : morphemes) { + for (KoMorphData.Morpheme morpheme : morphemes) { if (builder.length() > 0) { builder.append("+"); } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java index 4a6fd6d353c7..31505fb45c8b 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java @@ -22,12 +22,14 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import org.apache.lucene.analysis.ko.dict.ConnectionCosts; +import org.apache.lucene.analysis.morph.ConnectionCostsWriter; class ConnectionCostsBuilder { private ConnectionCostsBuilder() {} - public static ConnectionCostsWriter build(Path path) throws IOException { + public static ConnectionCostsWriter build(Path path) throws IOException { try (Reader reader = Files.newBufferedReader(path, StandardCharsets.US_ASCII); LineNumberReader lineReader = new LineNumberReader(reader)) { @@ -41,7 +43,8 @@ public static ConnectionCostsWriter build(Path path) throws IOException { assert forwardSize > 0 && backwardSize > 0; - ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize); + ConnectionCostsWriter costs = + new ConnectionCostsWriter<>(ConnectionCosts.class, forwardSize, backwardSize); while ((line = lineReader.readLine()) != null) { String[] fields 
= line.split("\\s+"); diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java deleted file mode 100644 index 7265ef84dbba..000000000000 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.analysis.ko.util; - -import java.io.BufferedOutputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.nio.ByteBuffer; -import java.nio.file.Files; -import java.nio.file.Path; -import org.apache.lucene.analysis.ko.dict.ConnectionCosts; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.store.DataOutput; -import org.apache.lucene.store.OutputStreamDataOutput; - -final class ConnectionCostsWriter { - - private final ByteBuffer - costs; // array is backward IDs first since get is called using the same backward ID - // consecutively. maybe doesn't matter. - private final int forwardSize; - private final int backwardSize; - /** Constructor for building. 
TODO: remove write access */ - ConnectionCostsWriter(int forwardSize, int backwardSize) { - this.forwardSize = forwardSize; - this.backwardSize = backwardSize; - this.costs = ByteBuffer.allocateDirect(2 * backwardSize * forwardSize); - } - - public void add(int forwardId, int backwardId, int cost) { - int offset = (backwardId * forwardSize + forwardId) * 2; - costs.putShort(offset, (short) cost); - } - - public void write(Path baseDir) throws IOException { - Files.createDirectories(baseDir); - String fileName = - ConnectionCosts.class.getName().replace('.', '/') + ConnectionCosts.FILENAME_SUFFIX; - try (OutputStream os = Files.newOutputStream(baseDir.resolve(fileName)); - OutputStream bos = new BufferedOutputStream(os)) { - final DataOutput out = new OutputStreamDataOutput(bos); - CodecUtil.writeHeader(out, ConnectionCosts.HEADER, ConnectionCosts.VERSION); - out.writeVInt(forwardSize); - out.writeVInt(backwardSize); - int last = 0; - for (int i = 0; i < costs.limit() / 2; i++) { - short cost = costs.getShort(i * 2); - int delta = (int) cost - last; - out.writeZInt(delta); - last = cost; - } - } - } -} diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java index e4c3b20c9b6b..0e8f11f0cf8e 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; +import org.apache.lucene.analysis.ko.dict.DictionaryConstants; /** Tool to build dictionaries. 
*/ public class DictionaryBuilder { @@ -34,7 +35,8 @@ public static void build(Path inputDir, Path outputDir, String encoding, boolean new UnknownDictionaryBuilder(encoding).build(inputDir).write(outputDir); // Build Connection Cost - ConnectionCostsBuilder.build(inputDir.resolve("matrix.def")).write(outputDir); + ConnectionCostsBuilder.build(inputDir.resolve("matrix.def")) + .write(outputDir, DictionaryConstants.CONN_COSTS_HEADER, DictionaryConstants.VERSION); } public static void main(String[] args) throws IOException { diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryEntryWriter.java similarity index 50% rename from lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryEntryWriter.java index 37ed157f8c8a..e18cae357d34 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryEntryWriter.java @@ -16,38 +16,25 @@ */ package org.apache.lucene.analysis.ko.util; -import java.io.BufferedOutputStream; import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; -import java.nio.channels.Channels; -import java.nio.channels.WritableByteChannel; -import java.nio.file.Files; -import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import org.apache.lucene.analysis.ko.POS; -import org.apache.lucene.analysis.ko.dict.BinaryDictionary; -import org.apache.lucene.analysis.ko.dict.Dictionary; -import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.analysis.ko.dict.KoMorphData; +import org.apache.lucene.analysis.ko.dict.TokenInfoMorphData; +import 
org.apache.lucene.analysis.morph.DictionaryEntryWriter; import org.apache.lucene.store.DataOutput; -import org.apache.lucene.store.OutputStreamDataOutput; import org.apache.lucene.util.ArrayUtil; -abstract class BinaryDictionaryWriter { +/** Writes system dictionary entries. */ +class TokenInfoDictionaryEntryWriter extends DictionaryEntryWriter { private static final int ID_LIMIT = 8192; - private final Class implClazz; - protected ByteBuffer buffer; - private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1; - private int[] targetMap = new int[8192]; - private int[] targetMapOffsets = new int[8192]; - private final ArrayList posDict = new ArrayList<>(); - - BinaryDictionaryWriter(Class implClazz, int size) { - this.implClazz = implClazz; - buffer = ByteBuffer.allocateDirect(size); + TokenInfoDictionaryEntryWriter(int size) { + super(size); } /** @@ -72,7 +59,8 @@ abstract class BinaryDictionaryWriter { * * @return current position of buffer, which will be wordId of next entry */ - public int put(String[] entry) { + @Override + protected int putEntry(String[] entry) { short leftId = Short.parseShort(entry[1]); short rightId = Short.parseShort(entry[2]); short wordCost = Short.parseShort(entry[3]); @@ -113,7 +101,7 @@ public int put(String[] entry) { assert existing == null || existing.equals(fullPOSData); posDict.set(leftId, fullPOSData); - final List morphemes = new ArrayList<>(); + final List morphemes = new ArrayList<>(); // true if the POS and decompounds of the token are all the same. 
boolean hasSinglePOS = (leftPOS == rightPOS); if (posType != POS.Type.MORPHEME && expression.length() > 0) { @@ -124,7 +112,7 @@ public int put(String[] entry) { String surfaceForm = tokenSplit[0].trim(); if (surfaceForm.isEmpty() == false) { POS.Tag exprTag = POS.resolveTag(tokenSplit[1]); - morphemes.add(new Dictionary.Morpheme(exprTag, tokenSplit[0])); + morphemes.add(new KoMorphData.Morpheme(exprTag, tokenSplit[0])); if (leftPOS != exprTag) { hasSinglePOS = false; } @@ -134,10 +122,10 @@ public int put(String[] entry) { int flags = 0; if (hasSinglePOS) { - flags |= BinaryDictionary.HAS_SINGLE_POS; + flags |= TokenInfoMorphData.HAS_SINGLE_POS; } if (posType == POS.Type.MORPHEME && reading.length() > 0) { - flags |= BinaryDictionary.HAS_READING; + flags |= TokenInfoMorphData.HAS_READING; } if (leftId >= ID_LIMIT) { @@ -161,7 +149,7 @@ public int put(String[] entry) { } buffer.put((byte) morphemes.size()); int compoundOffset = 0; - for (Dictionary.Morpheme morpheme : morphemes) { + for (KoMorphData.Morpheme morpheme : morphemes) { if (hasSinglePOS == false) { buffer.put((byte) morpheme.posTag.ordinal()); } @@ -184,112 +172,20 @@ private void writeString(String s) { } } - void addMapping(int sourceId, int wordId) { - if (wordId <= lastWordId) { - throw new IllegalStateException( - "words out of order: " + wordId + " vs lastID: " + lastWordId); - } - - if (sourceId > lastSourceId) { - targetMapOffsets = ArrayUtil.grow(targetMapOffsets, sourceId + 1); - for (int i = lastSourceId + 1; i <= sourceId; i++) { - targetMapOffsets[i] = targetMapEndOffset; - } - } else if (sourceId != lastSourceId) { - throw new IllegalStateException( - "source ids not in increasing order: lastSourceId=" - + lastSourceId - + " vs sourceId=" - + sourceId); - } - - targetMap = ArrayUtil.grow(targetMap, targetMapEndOffset + 1); - targetMap[targetMapEndOffset] = wordId; - targetMapEndOffset++; - - lastSourceId = sourceId; - lastWordId = wordId; - } - - final String getBaseFileName() { - return 
implClazz.getName().replace('.', '/'); - } - - /** - * Write dictionary in file - * - * @throws IOException if an I/O error occurs writing the dictionary files - */ - public void write(Path baseDir) throws IOException { - final String baseName = getBaseFileName(); - writeDictionary(baseDir.resolve(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX)); - writeTargetMap(baseDir.resolve(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX)); - writePosDict(baseDir.resolve(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX)); - } - - private void writeTargetMap(Path path) throws IOException { - Files.createDirectories(path.getParent()); - try (OutputStream os = Files.newOutputStream(path); - OutputStream bos = new BufferedOutputStream(os)) { - final DataOutput out = new OutputStreamDataOutput(bos); - CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION); - - final int numSourceIds = lastSourceId + 1; - out.writeVInt(targetMapEndOffset); // <-- size of main array - out.writeVInt(numSourceIds + 1); // <-- size of offset array (+ 1 more entry) - int prev = 0, sourceId = 0; - for (int ofs = 0; ofs < targetMapEndOffset; ofs++) { - final int val = targetMap[ofs], delta = val - prev; - assert delta >= 0; - if (ofs == targetMapOffsets[sourceId]) { - out.writeVInt((delta << 1) | 0x01); - sourceId++; - } else { - out.writeVInt((delta << 1)); - } - prev += delta; - } - if (sourceId != numSourceIds) { - throw new IllegalStateException( - "sourceId:" + sourceId + " != numSourceIds:" + numSourceIds); - } - } - } - - private void writePosDict(Path path) throws IOException { - Files.createDirectories(path.getParent()); - try (OutputStream os = Files.newOutputStream(path); - OutputStream bos = new BufferedOutputStream(os)) { - final DataOutput out = new OutputStreamDataOutput(bos); - CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION); - out.writeVInt(posDict.size()); - for (String s : posDict) { - if (s == null) { - 
out.writeByte((byte) POS.Tag.UNKNOWN.ordinal()); - } else { - String[] data = CSVUtil.parse(s); - if (data.length != 2) { - throw new IllegalArgumentException( - "Malformed pos/inflection: " + s + "; expected 2 characters"); - } - out.writeByte((byte) POS.Tag.valueOf(data[0]).ordinal()); + @Override + protected void writePosDict(OutputStream bos, DataOutput out) throws IOException { + out.writeVInt(posDict.size()); + for (String s : posDict) { + if (s == null) { + out.writeByte((byte) POS.Tag.UNKNOWN.ordinal()); + } else { + String[] data = CSVUtil.parse(s); + if (data.length != 2) { + throw new IllegalArgumentException( + "Malformed pos/inflection: " + s + "; expected 2 characters"); } + out.writeByte((byte) POS.Tag.valueOf(data[0]).ordinal()); } } } - - private void writeDictionary(Path path) throws IOException { - Files.createDirectories(path.getParent()); - try (OutputStream os = Files.newOutputStream(path); - OutputStream bos = new BufferedOutputStream(os)) { - final DataOutput out = new OutputStreamDataOutput(bos); - CodecUtil.writeHeader(out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION); - out.writeVInt(buffer.position()); - final WritableByteChannel channel = Channels.newChannel(bos); - // Write Buffer - buffer.flip(); // set position to 0, set limit to current position - channel.write(buffer); - assert buffer.remaining() == 0L; - } - } } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java index 316e1b6232ab..78a9acaff8a3 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java @@ -20,14 +20,16 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.Objects; +import org.apache.lucene.analysis.ko.dict.DictionaryConstants; 
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary; +import org.apache.lucene.analysis.morph.BinaryDictionaryWriter; import org.apache.lucene.util.fst.FST; -class TokenInfoDictionaryWriter extends BinaryDictionaryWriter { +class TokenInfoDictionaryWriter extends BinaryDictionaryWriter { private FST fst; TokenInfoDictionaryWriter(int size) { - super(TokenInfoDictionary.class, size); + super(TokenInfoDictionary.class, new TokenInfoDictionaryEntryWriter(size)); } public void setFST(FST fst) { @@ -35,9 +37,19 @@ public void setFST(FST fst) { this.fst = fst; } + @Override + protected void addMapping(int sourceId, int wordId) { + super.addMapping(sourceId, wordId); + } + @Override public void write(Path baseDir) throws IOException { - super.write(baseDir); + super.write( + baseDir, + DictionaryConstants.TARGETMAP_HEADER, + DictionaryConstants.POSDICT_HEADER, + DictionaryConstants.DICT_HEADER, + DictionaryConstants.VERSION); writeFST(baseDir.resolve(getBaseFileName() + TokenInfoDictionary.FST_FILENAME_SUFFIX)); } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java index 97b525d60f0a..82285e1c78b3 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java @@ -19,20 +19,29 @@ import java.io.IOException; import java.nio.file.Path; import org.apache.lucene.analysis.ko.dict.CharacterDefinition; +import org.apache.lucene.analysis.ko.dict.DictionaryConstants; import org.apache.lucene.analysis.ko.dict.UnknownDictionary; +import org.apache.lucene.analysis.morph.BinaryDictionaryWriter; +import org.apache.lucene.analysis.morph.CharacterDefinitionWriter; -class UnknownDictionaryWriter extends BinaryDictionaryWriter { +class UnknownDictionaryWriter extends BinaryDictionaryWriter { 
- private final CharacterDefinitionWriter characterDefinition = new CharacterDefinitionWriter(); + private final org.apache.lucene.analysis.morph.CharacterDefinitionWriter + characterDefinition = + new CharacterDefinitionWriter<>( + CharacterDefinition.class, + CharacterDefinition.DEFAULT, + CharacterDefinition.CLASS_COUNT, + CharacterDefinition::lookupCharacterClass); public UnknownDictionaryWriter(int size) { - super(UnknownDictionary.class, size); + super(UnknownDictionary.class, new TokenInfoDictionaryEntryWriter(size)); } @Override public int put(String[] entry) { // Get wordId of current entry - int wordId = buffer.position(); + int wordId = entryWriter.currentPosition(); // Put entry int result = super.put(entry); @@ -59,7 +68,13 @@ public void putInvokeDefinition(String characterClassName, int invoke, int group @Override public void write(Path baseDir) throws IOException { - super.write(baseDir); - characterDefinition.write(baseDir); + super.write( + baseDir, + DictionaryConstants.TARGETMAP_HEADER, + DictionaryConstants.POSDICT_HEADER, + DictionaryConstants.DICT_HEADER, + DictionaryConstants.VERSION); + characterDefinition.write( + baseDir, DictionaryConstants.CHARDEF_HEADER, DictionaryConstants.VERSION); } } diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestExternalDictionary.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestExternalDictionary.java index 5f8edab8934b..2ba7cee6a37c 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestExternalDictionary.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestExternalDictionary.java @@ -16,10 +16,10 @@ */ package org.apache.lucene.analysis.ko.dict; -import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.DICT_FILENAME_SUFFIX; -import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.POSDICT_FILENAME_SUFFIX; -import static 
org.apache.lucene.analysis.ko.dict.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX; import static org.apache.lucene.analysis.ko.dict.TokenInfoDictionary.FST_FILENAME_SUFFIX; +import static org.apache.lucene.analysis.morph.BinaryDictionary.DICT_FILENAME_SUFFIX; +import static org.apache.lucene.analysis.morph.BinaryDictionary.POSDICT_FILENAME_SUFFIX; +import static org.apache.lucene.analysis.morph.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX; import java.io.BufferedWriter; import java.nio.charset.StandardCharsets; diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java index 39fc55065d96..6948c4aa851e 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java @@ -16,10 +16,10 @@ */ package org.apache.lucene.analysis.ko.dict; -import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.DICT_FILENAME_SUFFIX; -import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.POSDICT_FILENAME_SUFFIX; -import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX; import static org.apache.lucene.analysis.ko.dict.TokenInfoDictionary.FST_FILENAME_SUFFIX; +import static org.apache.lucene.analysis.morph.BinaryDictionary.DICT_FILENAME_SUFFIX; +import static org.apache.lucene.analysis.morph.BinaryDictionary.POSDICT_FILENAME_SUFFIX; +import static org.apache.lucene.analysis.morph.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX; import java.io.OutputStream; import java.io.OutputStreamWriter; @@ -141,13 +141,13 @@ public void testEnumerateAll() throws Exception { tid.getWordCost(wordId); - POS.Type type = tid.getPOSType(wordId); - POS.Tag leftPOS = tid.getLeftPOS(wordId); - POS.Tag rightPOS = tid.getRightPOS(wordId); + POS.Type type = 
tid.getMorphAttributes().getPOSType(wordId); + POS.Tag leftPOS = tid.getMorphAttributes().getLeftPOS(wordId); + POS.Tag rightPOS = tid.getMorphAttributes().getRightPOS(wordId); if (type == POS.Type.MORPHEME) { assertSame(leftPOS, rightPOS); - String reading = tid.getReading(wordId); + String reading = tid.getMorphAttributes().getReading(wordId); boolean isHanja = charDef.isHanja(surfaceForm.charAt(0)); if (isHanja) { assertNotNull(reading); @@ -163,10 +163,11 @@ public void testEnumerateAll() throws Exception { assertSame(leftPOS, rightPOS); assertTrue(leftPOS == POS.Tag.NNG || rightPOS == POS.Tag.NNP); } - Dictionary.Morpheme[] decompound = tid.getMorphemes(wordId, chars, 0, chars.length); + KoMorphData.Morpheme[] decompound = + tid.getMorphAttributes().getMorphemes(wordId, chars, 0, chars.length); if (decompound != null) { int offset = 0; - for (Dictionary.Morpheme morph : decompound) { + for (KoMorphData.Morpheme morph : decompound) { assertTrue(UnicodeUtil.validUTF16String(morph.surfaceForm)); assertFalse(morph.surfaceForm.isEmpty()); assertEquals(morph.surfaceForm.trim(), morph.surfaceForm); diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUserDictionary.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUserDictionary.java index 7f97d0520e8e..8fb2827a336b 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUserDictionary.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUserDictionary.java @@ -31,16 +31,16 @@ public void testLookup() throws IOException { char[] sArray = s.toCharArray(); List wordIds = dictionary.lookup(sArray, 0, s.length()); assertEquals(1, wordIds.size()); - assertNull(dictionary.getMorphemes(wordIds.get(0), sArray, 0, s.length())); + assertNull(dictionary.getMorphAttributes().getMorphemes(wordIds.get(0), sArray, 0, s.length())); s = "세종시"; sArray = s.toCharArray(); wordIds = dictionary.lookup(sArray, 0, s.length()); 
assertEquals(2, wordIds.size()); - assertNull(dictionary.getMorphemes(wordIds.get(0), sArray, 0, s.length())); + assertNull(dictionary.getMorphAttributes().getMorphemes(wordIds.get(0), sArray, 0, s.length())); - Dictionary.Morpheme[] decompound = - dictionary.getMorphemes(wordIds.get(1), sArray, 0, s.length()); + KoMorphData.Morpheme[] decompound = + dictionary.getMorphAttributes().getMorphemes(wordIds.get(1), sArray, 0, s.length()); assertNotNull(decompound); assertEquals(2, decompound.length); assertEquals(decompound[0].posTag, POS.Tag.NNG); @@ -52,7 +52,7 @@ public void testLookup() throws IOException { sArray = s.toCharArray(); wordIds = dictionary.lookup(sArray, 0, s.length()); assertEquals(1, wordIds.size()); - assertNull(dictionary.getMorphemes(wordIds.get(0), sArray, 0, s.length())); + assertNull(dictionary.getMorphAttributes().getMorphemes(wordIds.get(0), sArray, 0, s.length())); } @Test