Skip to content

Commit

Permalink
LUCENE-10558: Implement URL ctor to support classpath/module usage in…
Browse files Browse the repository at this point in the history
… Kuromoji and Nori dictionaries (#868)
  • Loading branch information
uschindler authored May 6, 2022
1 parent ad9f968 commit 749bfb8
Show file tree
Hide file tree
Showing 12 changed files with 289 additions and 72 deletions.
9 changes: 9 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ API Changes
taxoEpoch decide. Add a test case that demonstrates the inconsistencies caused when you reuse taxoArrays on older
checkpoints. (Gautam Worah)

* LUCENE-10558: Add new constructors to Kuromoji and Nori dictionary classes to support classpath /
module system usage. It is now possible to use JDK's Class/ClassLoader/Module#getResource(...) apis
and pass their returned URL to dictionary constructors to load resources from Classpath or Module
resources. (Uwe Schindler, Tomoko Uchida, Mike Sokolov)

New Features
---------------------

Expand Down Expand Up @@ -110,6 +115,10 @@ Bug Fixes

* LUCENE-10518: Relax field consistency check for old indices (Nhat Nguyen)

* LUCENE-10558: Restore behaviour of deprecated Kuromoji and Nori dictionary constructors for
custom dictionary support. Please also use new URL-based constructors for classpath/module
system ressources. (Uwe Schindler, Tomoko Uchida, Mike Sokolov)

Build
---------------------

Expand Down
13 changes: 13 additions & 0 deletions lucene/MIGRATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,19 @@ the [Log4j JDK Logging Adapter](https://logging.apache.org/log4j/2.x/log4j-jul/i
in combination with the corresponding system property:
`java.util.logging.manager=org.apache.logging.log4j.jul.LogManager`.

### Kuromoji and Nori analysis component constructors for custom dictionaries

The Kuromoji and Nori analysis modules had some way to customize the backing dictionaries
by passing a path to file or classpath resources using some inconsistently implemented
APIs. This was buggy from the beginning, but some users made use of it. Due to move to Java
module system, especially the resource lookup on classpath stopped to work correctly.
The Lucene team therefore implemented new APIs to create dictionary implementations
with custom data files. Unfortunately there were some shortcomings in the 9.1 version,
also when using the now deprecated ctors, so users are advised to upgrade to
Lucene 9.2 or stay with 9.0.

See LUCENE-10558 for more details and workarounds.

## Migration from Lucene 8.x to Lucene 9.0

### Rename of binary artifacts from '**-analyzers-**' to '**-analysis-**' (LUCENE-9562)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.nio.channels.ReadableByteChannel;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Objects;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
Expand Down Expand Up @@ -140,19 +141,18 @@ public static final InputStream getResource(ResourceScheme scheme, String path)
throws IOException {
switch (scheme) {
case CLASSPATH:
return getClassResource(path);
Objects.requireNonNull(
path,
"Deprecated API no longer works with null paths, to load default resources use default ctors.");
return IOUtils.requireResourceNonNull(
BinaryDictionary.class.getClassLoader().getResourceAsStream(path), path);
case FILE:
return Files.newInputStream(Paths.get(path));
default:
throw new IllegalStateException("unknown resource scheme " + scheme);
}
}

@Deprecated(forRemoval = true, since = "9.1")
private static InputStream getClassResource(String path) throws IOException {
return IOUtils.requireResourceNonNull(BinaryDictionary.class.getResourceAsStream(path), path);
}

public void lookupWordIds(int sourceId, IntsRef ref) {
ref.ints = targetMap;
ref.offset = targetMapOffsets[sourceId];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
Expand All @@ -42,15 +42,13 @@ public final class ConnectionCosts {
/**
* @param scheme - scheme for loading resources (FILE or CLASSPATH).
* @param path - where to load resources from, without the ".dat" suffix
* @deprecated replaced by {@link #ConnectionCosts(Path)}
* @deprecated replaced by {@link #ConnectionCosts(Path)} for files and {@link
* #ConnectionCosts(URL)} for classpath/module resources.
*/
@Deprecated(forRemoval = true, since = "9.1")
@SuppressWarnings("removal")
public ConnectionCosts(BinaryDictionary.ResourceScheme scheme, String path) throws IOException {
this(
scheme == BinaryDictionary.ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(path + FILENAME_SUFFIX))
: ConnectionCosts::getClassResource);
this(() -> BinaryDictionary.getResource(scheme, path.replace('.', '/') + FILENAME_SUFFIX));
}

/**
Expand All @@ -63,6 +61,17 @@ public ConnectionCosts(Path connectionCostsFile) throws IOException {
this(() -> Files.newInputStream(connectionCostsFile));
}

/**
* Create a {@link ConnectionCosts} from an external resource URL (e.g. from Classpath with {@link
* ClassLoader#getResource(String)}).
*
* @param connectionCostsUrl where to load connection costs resource
* @throws IOException if resource was not found or broken
*/
public ConnectionCosts(URL connectionCostsUrl) throws IOException {
this(() -> connectionCostsUrl.openStream());
}

private ConnectionCosts() throws IOException {
this(ConnectionCosts::getClassResource);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOSupplier;
Expand All @@ -41,27 +41,20 @@ public final class TokenInfoDictionary extends BinaryDictionary {

/**
* @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
* @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH
* scheme only, use this class's name as the path.
* @deprecated replaced by {@link #TokenInfoDictionary(Path, Path, Path, Path)}
* @param resourcePath - where to load resources (dictionaries) from.
* @deprecated replaced by {@link #TokenInfoDictionary(Path, Path, Path, Path)} for files and
* {@link #TokenInfoDictionary(URL, URL, URL, URL)} for classpath/module resources
*/
@Deprecated(forRemoval = true, since = "9.1")
@SuppressWarnings("removal")
public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath)
throws IOException {
this(
resourceScheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(resourcePath + TARGETMAP_FILENAME_SUFFIX))
: () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
resourceScheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(resourcePath + POSDICT_FILENAME_SUFFIX))
: () -> getClassResource(POSDICT_FILENAME_SUFFIX),
resourceScheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(resourcePath + DICT_FILENAME_SUFFIX))
: () -> getClassResource(DICT_FILENAME_SUFFIX),
resourceScheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(resourcePath + FST_FILENAME_SUFFIX))
: () -> getClassResource(FST_FILENAME_SUFFIX));
() ->
BinaryDictionary.getResource(resourceScheme, resourcePath + TARGETMAP_FILENAME_SUFFIX),
() -> BinaryDictionary.getResource(resourceScheme, resourcePath + POSDICT_FILENAME_SUFFIX),
() -> BinaryDictionary.getResource(resourceScheme, resourcePath + DICT_FILENAME_SUFFIX),
() -> BinaryDictionary.getResource(resourceScheme, resourcePath + FST_FILENAME_SUFFIX));
}

/**
Expand All @@ -82,6 +75,25 @@ public TokenInfoDictionary(Path targetMapFile, Path posDictFile, Path dictFile,
() -> Files.newInputStream(fstFile));
}

/**
* Create a {@link TokenInfoDictionary} from an external resource URL (e.g. from Classpath with
* {@link ClassLoader#getResource(String)}).
*
* @param targetMapUrl where to load target map resource
* @param posDictUrl where to load POS dictionary resource
* @param dictUrl where to load dictionary entries resource
* @param fstUrl where to load encoded FST data resource
* @throws IOException if resource was not found or broken
*/
public TokenInfoDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl, URL fstUrl)
throws IOException {
this(
() -> targetMapUrl.openStream(),
() -> posDictUrl.openStream(),
() -> dictUrl.openStream(),
() -> fstUrl.openStream());
}

private TokenInfoDictionary() throws IOException {
this(
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.util.IOUtils;

/** Dictionary for unknown-word handling. */
Expand All @@ -32,21 +32,16 @@ public final class UnknownDictionary extends BinaryDictionary {
* @param scheme scheme for loading resources (FILE or CLASSPATH).
* @param path where to load resources from; a path, including the file base name without
* extension; this is used to match multiple files with the same base name.
* @deprecated replaced by {@link #UnknownDictionary(Path, Path, Path)}
* @deprecated replaced by {@link #UnknownDictionary(Path, Path, Path)} for files and {@link
* #UnknownDictionary(URL, URL, URL)} for classpath/module resources
*/
@Deprecated(forRemoval = true, since = "9.1")
@SuppressWarnings("removal")
public UnknownDictionary(ResourceScheme scheme, String path) throws IOException {
super(
scheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(path + TARGETMAP_FILENAME_SUFFIX))
: () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
scheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(path + POSDICT_FILENAME_SUFFIX))
: () -> getClassResource(POSDICT_FILENAME_SUFFIX),
scheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(path + DICT_FILENAME_SUFFIX))
: () -> getClassResource(DICT_FILENAME_SUFFIX));
() -> BinaryDictionary.getResource(scheme, path + TARGETMAP_FILENAME_SUFFIX),
() -> BinaryDictionary.getResource(scheme, path + POSDICT_FILENAME_SUFFIX),
() -> BinaryDictionary.getResource(scheme, path + DICT_FILENAME_SUFFIX));
}

/**
Expand All @@ -64,6 +59,20 @@ public UnknownDictionary(Path targetMapFile, Path posDictFile, Path dictFile) th
() -> Files.newInputStream(dictFile));
}

/**
* Create a {@link UnknownDictionary} from an external resource URL (e.g. from Classpath with
* {@link ClassLoader#getResource(String)}).
*
* @param targetMapUrl where to load target map resource
* @param posDictUrl where to load POS dictionary resource
* @param dictUrl where to load dictionary entries resource
* @throws IOException if resource was not found or broken
*/
public UnknownDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl) throws IOException {
super(
() -> targetMapUrl.openStream(), () -> posDictUrl.openStream(), () -> dictUrl.openStream());
}

private UnknownDictionary() throws IOException {
super(
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
public class TestExternalDictionary extends LuceneTestCase {

private Path dir;
private ClassLoader loader = getClass().getClassLoader();

@Override
@Before
Expand Down Expand Up @@ -98,4 +99,59 @@ public void testLoadExternalConnectionCosts() throws Exception {
new ConnectionCosts(dir.resolve(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX));
assertEquals(1, cc.get(0, 1));
}

public void testLoadExternalUrlTokenInfoDictionary() throws Exception {
String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
TokenInfoDictionary dict =
new TokenInfoDictionary(
loader.getResource(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
loader.getResource(dictionaryPath + POSDICT_FILENAME_SUFFIX),
loader.getResource(dictionaryPath + DICT_FILENAME_SUFFIX),
loader.getResource(dictionaryPath + FST_FILENAME_SUFFIX));
assertNotNull(dict.getFST());
}

public void testLoadExternalUrlUnknownDictionary() throws Exception {
String dictionaryPath = UnknownDictionary.class.getName().replace('.', '/');
UnknownDictionary dict =
new UnknownDictionary(
loader.getResource(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
loader.getResource(dictionaryPath + POSDICT_FILENAME_SUFFIX),
loader.getResource(dictionaryPath + DICT_FILENAME_SUFFIX));
assertNotNull(dict.getCharacterDefinition());
}

public void testLoadExternalUrlConnectionCosts() throws Exception {
String dictionaryPath = ConnectionCosts.class.getName().replace('.', '/');
ConnectionCosts cc =
new ConnectionCosts(loader.getResource(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX));
assertEquals(1, cc.get(0, 1));
}

@Deprecated(forRemoval = true, since = "9.1")
@SuppressWarnings("removal")
public void testDeprecatedLoadExternalTokenInfoDictionary() throws Exception {
String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
TokenInfoDictionary dict =
new TokenInfoDictionary(BinaryDictionary.ResourceScheme.CLASSPATH, dictionaryPath);
assertNotNull(dict.getFST());
}

@Deprecated(forRemoval = true, since = "9.1")
@SuppressWarnings("removal")
public void testDeprecatedLoadExternalUnknownDictionary() throws Exception {
String dictionaryPath = UnknownDictionary.class.getName().replace('.', '/');
UnknownDictionary dict =
new UnknownDictionary(BinaryDictionary.ResourceScheme.CLASSPATH, dictionaryPath);
assertNotNull(dict.getCharacterDefinition());
}

@Deprecated(forRemoval = true, since = "9.1")
@SuppressWarnings("removal")
public void testDeprecatedLoadExternalConnectionCosts() throws Exception {
String dictionaryPath = ConnectionCosts.class.getName().replace('.', '/');
ConnectionCosts cc =
new ConnectionCosts(BinaryDictionary.ResourceScheme.CLASSPATH, dictionaryPath);
assertEquals(1, cc.get(0, 1));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,15 @@
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Objects;
import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;

/** Base class for a binary-encoded in-memory dictionary. */
Expand Down Expand Up @@ -114,6 +118,23 @@ private static void populateTargetMap(DataInput in, int[] targetMap, int[] targe
targetMapOffsets[sourceId] = targetMap.length;
}

@Deprecated(forRemoval = true, since = "9.1")
public static final InputStream getResource(ResourceScheme scheme, String path)
throws IOException {
switch (scheme) {
case CLASSPATH:
Objects.requireNonNull(
path,
"Deprecated API no longer works with null paths, to load default resources use default ctors.");
return IOUtils.requireResourceNonNull(
BinaryDictionary.class.getClassLoader().getResourceAsStream(path), path);
case FILE:
return Files.newInputStream(Paths.get(path));
default:
throw new IllegalStateException("unknown resource scheme " + scheme);
}
}

public void lookupWordIds(int sourceId, IntsRef ref) {
ref.ints = targetMap;
ref.offset = targetMapOffsets[sourceId];
Expand Down
Loading

0 comments on commit 749bfb8

Please sign in to comment.