diff --git a/src/main/java/io/anserini/analysis/AnalyzerMap.java b/src/main/java/io/anserini/analysis/AnalyzerMap.java index f1ffa3f176..e6a889c3bf 100644 --- a/src/main/java/io/anserini/analysis/AnalyzerMap.java +++ b/src/main/java/io/anserini/analysis/AnalyzerMap.java @@ -16,12 +16,18 @@ package io.anserini.analysis; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; + +import java.lang.reflect.InvocationTargetException; import java.util.HashMap; import java.util.Map; public class AnalyzerMap { - public static final Map analyzerMap = new HashMap() { + private static final Logger LOG = LogManager.getLogger(AnalyzerMap.class); + + public static final Map analyzerMap = new HashMap<>() { { put("ar", "org.apache.lucene.analysis.ar.ArabicAnalyzer"); put("bn", "org.apache.lucene.analysis.bn.BengaliAnalyzer"); @@ -51,8 +57,24 @@ public class AnalyzerMap { } }; - public static Analyzer getLanguageSpecificAnalyzer(String language) throws Exception { - String analyzerClazz = analyzerMap.get(language); - return (Analyzer) Class.forName(analyzerClazz).getDeclaredConstructor().newInstance(); + public static Analyzer getLanguageSpecificAnalyzer(String language) { + String analyzerClazz = analyzerMap.get(language); + + try { + return (Analyzer) Class.forName(analyzerClazz).getDeclaredConstructor().newInstance(); + } catch (InstantiationException e) { + LOG.error(e); + } catch (IllegalAccessException e) { + LOG.error(e); + } catch (InvocationTargetException e) { + LOG.error(e); + } catch (NoSuchMethodException e) { + LOG.error(e); + } catch (ClassNotFoundException e) { + LOG.error(e); + } + + // If we have any issues, eat the exception and return null. + return null; } } diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index fddc75bc2d..13b14cb3db 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -105,17 +105,12 @@ public static class Args { usage = "Location of input collection.") public String input; - @Option(name = "-threads", metaVar = "[num]", required = true, - usage = "Number of indexing threads.") - public int threads; - @Option(name = "-collection", metaVar = "[class]", required = true, usage = "Collection class in package 'io.anserini.collection'.") public String collectionClass; - @Option(name = "-generator", metaVar = "[class]", - usage = "Document generator class in package 'io.anserini.index.generator'.") - public String generatorClass = "DefaultLuceneDocumentGenerator"; + @Option(name = "-index", metaVar = "[path]", usage = "Index path.", required = true) + public String index; // optional general arguments @@ -129,8 +124,15 @@ public static class Args { // optional arguments - @Option(name = "-index", metaVar = "[path]", usage = "Index path.") - public String index; + @Option(name = "-threads", metaVar = "[num]", usage = "Number of indexing threads.") + public int threads = 8; + + @Option(name = "-append", usage = "Append documents.") + public boolean append = false; + + @Option(name = "-generator", metaVar = "[class]", + usage = "Document generator class in package 'io.anserini.index.generator'.") + public String generatorClass = "DefaultLuceneDocumentGenerator"; @Option(name = "-fields", handler = StringArrayOptionHandler.class, usage = "List of fields to index (space separated), in addition to the default 'contents' field.") diff --git a/src/main/java/io/anserini/index/SimpleIndexer.java b/src/main/java/io/anserini/index/SimpleIndexer.java index 2840e3d48e..e785559070 100644 --- a/src/main/java/io/anserini/index/SimpleIndexer.java +++ b/src/main/java/io/anserini/index/SimpleIndexer.java @@ -16,20 +16,19 @@ package io.anserini.index; -import io.anserini.analysis.DefaultEnglishAnalyzer; import io.anserini.analysis.AnalyzerMap; +import io.anserini.analysis.DefaultEnglishAnalyzer; import io.anserini.analysis.HuggingFaceTokenizerAnalyzer; -import io.anserini.index.IndexCollection.Args; -import io.anserini.index.generator.DefaultLuceneDocumentGenerator; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import io.anserini.collection.FileSegment; import io.anserini.collection.JsonCollection; +import io.anserini.index.IndexCollection.Args; import io.anserini.index.generator.GeneratorException; import io.anserini.index.generator.LuceneDocumentGenerator; import org.apache.commons.lang3.time.DurationFormatUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -54,60 +53,44 @@ public class SimpleIndexer { private final Analyzer analyzer; private final LuceneDocumentGenerator generator; - public SimpleIndexer(String indexPath) throws IOException { - this.indexPath = Paths.get(indexPath); - if (!Files.exists(this.indexPath)) { - Files.createDirectories(this.indexPath); - } - - analyzer = DefaultEnglishAnalyzer.newDefaultInstance(); - generator = new DefaultLuceneDocumentGenerator(); - final Directory dir = FSDirectory.open(this.indexPath); - final IndexWriterConfig config = new IndexWriterConfig(analyzer); - - config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); - config.setRAMBufferSizeMB(2048); - config.setUseCompoundFile(false); - config.setMergeScheduler(new ConcurrentMergeScheduler()); + private static Args parseArgs(String[] argv) throws CmdLineException { + Args args = new Args(); + CmdLineParser parser = new CmdLineParser(args); + parser.parseArgument(argv); - writer = new IndexWriter(dir, config); + return args; } public SimpleIndexer(String[] argv) throws Exception { this(parseArgs(argv)); } - private static Args parseArgs(String[] argv) throws CmdLineException { - Args args = new Args(); - CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(100)); + public SimpleIndexer(String indexPath) throws Exception { + this(new String[] { + "-input", "", + "-index", indexPath, + "-collection", "JsonCollection"}); + } - try { - parser.parseArgument(argv); - } catch (CmdLineException e) { - System.err.println(e.getMessage()); - parser.printUsage(System.err); - System.err.println("Example: " + SimpleIndexer.class.getSimpleName() + - parser.printExample(OptionHandlerFilter.REQUIRED)); - throw e; - } - return args; + public SimpleIndexer(String indexPath, boolean append) throws Exception { + // First line of constructor must be "this", which leads to a slightly awkward implementation. + this(append ? new String[] {"-input", "", "-index", indexPath, "-collection", "JsonCollection", "-append"} : + new String[] {"-input", "", "-index", indexPath, "-collection", "JsonCollection"}); } - @SuppressWarnings("unchecked") public SimpleIndexer(Args args) throws Exception { this.indexPath = Paths.get(args.index); if (!Files.exists(this.indexPath)) { Files.createDirectories(this.indexPath); } Class generatorClass = Class.forName("io.anserini.index.generator." + args.generatorClass); - generator = (LuceneDocumentGenerator) - generatorClass.getDeclaredConstructor(Args.class).newInstance(args); + generator = (LuceneDocumentGenerator) generatorClass.getDeclaredConstructor(Args.class).newInstance(args); analyzer = getAnalyzer(args); final Directory dir = FSDirectory.open(this.indexPath); final IndexWriterConfig config = new IndexWriterConfig(analyzer); - config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); + config.setOpenMode(args.append ? IndexWriterConfig.OpenMode.CREATE_OR_APPEND : IndexWriterConfig.OpenMode.CREATE); config.setRAMBufferSizeMB(2048); config.setUseCompoundFile(false); config.setMergeScheduler(new ConcurrentMergeScheduler()); @@ -115,23 +98,28 @@ public SimpleIndexer(Args args) throws Exception { writer = new IndexWriter(dir, config); } - private Analyzer getAnalyzer(Args args) throws Exception { - if (args.analyzeWithHuggingFaceTokenizer != null){ - LOG.info("Bert Tokenizer"); - return new HuggingFaceTokenizerAnalyzer(args.analyzeWithHuggingFaceTokenizer); - } else if (AnalyzerMap.analyzerMap.containsKey(args.language)){ - LOG.info("Language: " + args.language); - return AnalyzerMap.getLanguageSpecificAnalyzer(args.language); - } else if (args.pretokenized || args.language.equals("sw")) { - LOG.info("Pretokenized"); - return new WhitespaceAnalyzer(); - } else { - // Default to English - LOG.info("Language: en"); - LOG.info("Stemmer: " + args.stemmer); - LOG.info("Keep stopwords? " + args.keepStopwords); - LOG.info("Stopwords file: " + args.stopwords); - return DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepStopwords, args.stopwords); + private Analyzer getAnalyzer(Args args) { + try { + if (args.analyzeWithHuggingFaceTokenizer != null) { + LOG.info("Using HuggingFaceTokenizerAnalyzer"); + return new HuggingFaceTokenizerAnalyzer(args.analyzeWithHuggingFaceTokenizer); + } else if (AnalyzerMap.analyzerMap.containsKey(args.language)) { + LOG.info("Using language-specific analyzer"); + LOG.info("Language: " + args.language); + return AnalyzerMap.getLanguageSpecificAnalyzer(args.language); + } else if (args.pretokenized || args.language.equals("sw")) { + LOG.info("Using WhitespaceAnalyzer"); + return new WhitespaceAnalyzer(); + } else { + // Default to English + LOG.info("Using DefaultEnglishAnalyzer"); + LOG.info("Stemmer: " + args.stemmer); + LOG.info("Keep stopwords? " + args.keepStopwords); + LOG.info("Stopwords file: " + args.stopwords); + return DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepStopwords, args.stopwords); + } + } catch (Exception e) { + return null; } } @@ -141,10 +129,10 @@ public boolean addDocument(String raw) { JsonCollection.Document doc = JsonCollection.Document.fromString(raw); writer.addDocument(generator.createDocument(doc)); } catch (GeneratorException e) { - e.printStackTrace(); + LOG.error(e); return false; } catch (IOException e) { - e.printStackTrace(); + LOG.error(e); return false; } diff --git a/src/test/java/io/anserini/index/SimpleIndexerAppendTest.java b/src/test/java/io/anserini/index/SimpleIndexerAppendTest.java new file mode 100644 index 0000000000..0823b656f9 --- /dev/null +++ b/src/test/java/io/anserini/index/SimpleIndexerAppendTest.java @@ -0,0 +1,254 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.index; + +import io.anserini.collection.FileSegment; +import io.anserini.collection.JsonCollection; +import io.anserini.search.SimpleSearcher; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.junit.Test; + +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; + +public class SimpleIndexerAppendTest extends LuceneTestCase { + + private class JsonCollectionWrapper { + JsonCollection collection; + + public JsonCollectionWrapper(String collectionPath) { + collection = new JsonCollection(Paths.get(collectionPath)); + } + + public int indexWith(SimpleIndexer indexer) { + int cnt = 0; + for (FileSegment segment : collection) { + for (JsonCollection.Document doc : segment) { + indexer.addDocument(doc.raw()); + cnt++; + } + segment.close(); + } + + return cnt; + } + } + + @Test + public void testBasic1() throws Exception { + Path tempDir = createTempDir(); + SimpleIndexer indexer; + int cnt; + + SimpleSearcher searcher; + SimpleSearcher.Result[] hits; + + indexer = new SimpleIndexer(tempDir.toString()); + cnt = new JsonCollectionWrapper("src/test/resources/sample_docs/json/collection3").indexWith(indexer); + indexer.close(); + + assertEquals(2, cnt); + + searcher = new SimpleSearcher(tempDir.toString()); + assertEquals(2, searcher.get_total_num_docs()); + hits = searcher.search("1", 10); + assertEquals(1, hits.length); + assertEquals("doc1", hits[0].docid); + assertEquals(0.3648, hits[0].score, 1e-4); + searcher.close(); + + // We're going to overwrite the index. + indexer = new SimpleIndexer(tempDir.toString()); + cnt = new JsonCollectionWrapper("src/test/resources/sample_docs/json/collection4").indexWith(indexer); + indexer.close(); + + assertEquals(2, cnt); + + searcher = new SimpleSearcher(tempDir.toString()); + assertEquals(2, searcher.get_total_num_docs()); + hits = searcher.search("contains", 10); + assertEquals(1, hits.length); + assertEquals("doc3", hits[0].docid); + assertEquals(0.3648, hits[0].score, 1e-4); + searcher.close(); + + // We're going to append to the index. + indexer = new SimpleIndexer(tempDir.toString(), true); + cnt = new JsonCollectionWrapper("src/test/resources/sample_docs/json/collection3").indexWith(indexer); + indexer.close(); + + assertEquals(2, cnt); + + searcher = new SimpleSearcher(tempDir.toString()); + assertEquals(4, searcher.get_total_num_docs()); + hits = searcher.search("contains", 10); + assertEquals(1, hits.length); + assertEquals("doc3", hits[0].docid); + assertEquals(0.5960, hits[0].score, 1e-4); + + hits = searcher.search("1", 10); + assertEquals(1, hits.length); + assertEquals("doc1", hits[0].docid); + assertEquals(0.6764, hits[0].score, 1e-4); + searcher.close(); + } + + @Test + public void testBasic2() throws Exception { + Path tempDir = createTempDir(); + SimpleIndexer indexer; + int cnt; + + SimpleSearcher searcher; + SimpleSearcher.Result[] hits; + + indexer = new SimpleIndexer(tempDir.toString()); + cnt = new JsonCollectionWrapper("src/test/resources/sample_docs/json/collection3").indexWith(indexer); + indexer.close(); + + assertEquals(2, cnt); + + searcher = new SimpleSearcher(tempDir.toString()); + assertEquals(2, searcher.get_total_num_docs()); + hits = searcher.search("1", 10); + assertEquals(1, hits.length); + assertEquals("doc1", hits[0].docid); + assertEquals(0.3648, hits[0].score, 1e-4); + searcher.close(); + + // We're going to overwrite the index, but with different constructor. + indexer = new SimpleIndexer(tempDir.toString(), false); + cnt = new JsonCollectionWrapper("src/test/resources/sample_docs/json/collection4").indexWith(indexer); + indexer.close(); + + assertEquals(2, cnt); + + searcher = new SimpleSearcher(tempDir.toString()); + assertEquals(2, searcher.get_total_num_docs()); + hits = searcher.search("contains", 10); + assertEquals(1, hits.length); + assertEquals("doc3", hits[0].docid); + assertEquals(0.3648, hits[0].score, 1e-4); + searcher.close(); + } + + @Test + public void testBasic3() throws Exception { + Path tempDir = createTempDir(); + SimpleIndexer indexer; + int cnt; + + SimpleSearcher searcher; + SimpleSearcher.Result[] hits; + + // Make sure appending to a non-existent is okay. + indexer = new SimpleIndexer(tempDir.toString(), true); + cnt = new JsonCollectionWrapper("src/test/resources/sample_docs/json/collection3").indexWith(indexer); + indexer.close(); + + assertEquals(2, cnt); + + searcher = new SimpleSearcher(tempDir.toString()); + assertEquals(2, searcher.get_total_num_docs()); + hits = searcher.search("1", 10); + assertEquals(1, hits.length); + assertEquals("doc1", hits[0].docid); + assertEquals(0.3648, hits[0].score, 1e-4); + searcher.close(); + } + + @Test + public void testInitWithArgs() throws Exception { + Path tempDir = createTempDir(); + SimpleIndexer indexer; + int cnt; + + SimpleSearcher searcher; + SimpleSearcher.Result[] hits; + + indexer = new SimpleIndexer(new String[] { + "-input", "", + "-index", tempDir.toString(), + "-collection", "JsonCollection", + "-language", "sw", + "-storePositions", "-storeDocvectors", "-storeRaw", + }); + cnt = new JsonCollectionWrapper("src/test/resources/sample_docs/json/collection3").indexWith(indexer); + indexer.close(); + + assertEquals(2, cnt); + + searcher = new SimpleSearcher(tempDir.toString()); + searcher.set_language("sw"); + assertEquals(2, searcher.get_total_num_docs()); + hits = searcher.search("1.", 10); + assertEquals(1, hits.length); + assertEquals("doc1", hits[0].docid); + assertEquals(0.3648, hits[0].score, 1e-4); + searcher.close(); + + // We're going to overwrite the index. + indexer = new SimpleIndexer(new String[] { + "-input", "", + "-index", tempDir.toString(), + "-collection", "JsonCollection", + "-language", "sw", + "-storePositions", "-storeDocvectors", "-storeRaw", + }); + cnt = new JsonCollectionWrapper("src/test/resources/sample_docs/json/collection4").indexWith(indexer); + indexer.close(); + + assertEquals(2, cnt); + + searcher = new SimpleSearcher(tempDir.toString()); + searcher.set_language("sw"); + assertEquals(2, searcher.get_total_num_docs()); + hits = searcher.search("contains", 10); + assertEquals(1, hits.length); + assertEquals("doc3", hits[0].docid); + assertEquals(0.3648, hits[0].score, 1e-4); + searcher.close(); + + // We're going to append to the index. + indexer = new SimpleIndexer(new String[] { + "-input", "", + "-index", tempDir.toString(), + "-collection", "JsonCollection", + "-language", "sw", + "-storePositions", "-storeDocvectors", "-storeRaw", "-append" + }); + cnt = new JsonCollectionWrapper("src/test/resources/sample_docs/json/collection3").indexWith(indexer); + indexer.close(); + + assertEquals(2, cnt); + + searcher = new SimpleSearcher(tempDir.toString()); + searcher.set_language("sw"); + assertEquals(4, searcher.get_total_num_docs()); + hits = searcher.search("contains", 10); + assertEquals(1, hits.length); + assertEquals("doc3", hits[0].docid); + assertEquals(0.6473, hits[0].score, 1e-4); + + hits = searcher.search("1.", 10); + assertEquals(1, hits.length); + assertEquals("doc1", hits[0].docid); + assertEquals(0.6206, hits[0].score, 1e-4); + searcher.close(); + } +} diff --git a/src/test/java/io/anserini/index/SimpleIndexerTest.java b/src/test/java/io/anserini/index/SimpleIndexerTest.java index 586958f7f9..edfb137920 100644 --- a/src/test/java/io/anserini/index/SimpleIndexerTest.java +++ b/src/test/java/io/anserini/index/SimpleIndexerTest.java @@ -29,14 +29,14 @@ public class SimpleIndexerTest extends LuceneTestCase { @Test - public void testBasic() throws IOException { + public void testBasic() throws Exception { Path tempDir = createTempDir(); Path collectionPath = Paths.get("src/test/resources/sample_docs/json/collection3"); JsonCollection collection = new JsonCollection(collectionPath); + SimpleIndexer indexer = new SimpleIndexer(tempDir.toString()); int cnt = 0; - SimpleIndexer indexer = new SimpleIndexer(tempDir.toString()); for (FileSegment segment : collection ) { for (JsonCollection.Document doc : segment) { indexer.addDocument(doc.raw()); @@ -61,22 +61,28 @@ public void testBasic() throws IOException { public void testInitWithArgs() throws Exception { Path tempDir = createTempDir(); - SimpleIndexer.main(new String[] { - "-input", - "src/test/resources/sample_docs/json/collection3", - "-index", - tempDir.toString(), - "-collection", - "JsonCollection", - "-threads", - "1", - "-storePositions", - "-storeDocvectors", - "-storeRaw", - "-language", - "sw", + Path collectionPath = Paths.get("src/test/resources/sample_docs/json/collection3"); + JsonCollection collection = new JsonCollection(collectionPath); + SimpleIndexer indexer = new SimpleIndexer(new String[] { + "-input", "", + "-index", tempDir.toString(), + "-collection", "JsonCollection", + "-language", "sw", + "-storePositions", "-storeDocvectors", "-storeRaw", }); + int cnt = 0; + for (FileSegment segment : collection ) { + for (JsonCollection.Document doc : segment) { + indexer.addDocument(doc.raw()); + cnt++; + } + segment.close(); + } + + indexer.close(); + assertEquals(2, cnt); + SimpleSearcher searcher = new SimpleSearcher(tempDir.toString()); // Set language to sw so that same Analyzer is used for indexing & searching searcher.set_language("sw"); diff --git a/src/test/resources/sample_docs/json/collection4/segment1.json b/src/test/resources/sample_docs/json/collection4/segment1.json new file mode 100644 index 0000000000..207521aed1 --- /dev/null +++ b/src/test/resources/sample_docs/json/collection4/segment1.json @@ -0,0 +1,12 @@ +{ + "id": "doc3", + "contents": "third document contains C", + "field1": "three field1", + "field2": "3 field2" +} +{ + "id": "doc4", + "contents": "fourth doc goes D", + "field1": "four four field_one", + "field2": "fourth field2 contents" +}