Skip to content

Commit

Permalink
Add ability to append to existing indexes (castorini#2062)
Browse files Browse the repository at this point in the history
Requested feature here: castorini/pyserini#1443
  • Loading branch information
lintool authored and Thong Nguyen committed Mar 3, 2023
1 parent ab568f7 commit 2775842
Show file tree
Hide file tree
Showing 6 changed files with 370 additions and 86 deletions.
30 changes: 26 additions & 4 deletions src/main/java/io/anserini/analysis/AnalyzerMap.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,18 @@

package io.anserini.analysis;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;

import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.Map;

public class AnalyzerMap {
public static final Map<String, String> analyzerMap = new HashMap<String, String>() {
private static final Logger LOG = LogManager.getLogger(AnalyzerMap.class);

public static final Map<String, String> analyzerMap = new HashMap<>() {
{
put("ar", "org.apache.lucene.analysis.ar.ArabicAnalyzer");
put("bn", "org.apache.lucene.analysis.bn.BengaliAnalyzer");
Expand Down Expand Up @@ -51,8 +57,24 @@ public class AnalyzerMap {
}
};

public static Analyzer getLanguageSpecificAnalyzer(String language) throws Exception {
String analyzerClazz = analyzerMap.get(language);
return (Analyzer) Class.forName(analyzerClazz).getDeclaredConstructor().newInstance();
/**
 * Instantiates the analyzer registered for a language code.
 *
 * <p>The analyzer class name is looked up in {@link #analyzerMap} and instantiated reflectively
 * through its no-argument constructor. Failures are logged and reported as {@code null}; no
 * checked exception is propagated to the caller.
 *
 * @param language language code used as the key into {@code analyzerMap} (e.g. "ar", "bn")
 * @return a new analyzer instance, or {@code null} if no analyzer is registered for
 *     {@code language} or the registered class cannot be instantiated
 */
public static Analyzer getLanguageSpecificAnalyzer(String language) {
  String analyzerClazz = analyzerMap.get(language);
  if (analyzerClazz == null) {
    // Class.forName(null) would throw an uncaught NullPointerException, which contradicts the
    // "return null on any issue" contract of this method.
    LOG.error("No analyzer registered for language: " + language);
    return null;
  }

  try {
    return (Analyzer) Class.forName(analyzerClazz).getDeclaredConstructor().newInstance();
  } catch (ReflectiveOperationException e) {
    // Common supertype of ClassNotFoundException, NoSuchMethodException, InstantiationException,
    // IllegalAccessException and InvocationTargetException.
    LOG.error("Unable to instantiate analyzer class: " + analyzerClazz, e);
  }

  // If we have any issues, eat the exception and return null.
  return null;
}
}
20 changes: 11 additions & 9 deletions src/main/java/io/anserini/index/IndexCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,17 +105,12 @@ public static class Args {
usage = "Location of input collection.")
public String input;

@Option(name = "-threads", metaVar = "[num]", required = true,
usage = "Number of indexing threads.")
public int threads;

@Option(name = "-collection", metaVar = "[class]", required = true,
usage = "Collection class in package 'io.anserini.collection'.")
public String collectionClass;

@Option(name = "-generator", metaVar = "[class]",
usage = "Document generator class in package 'io.anserini.index.generator'.")
public String generatorClass = "DefaultLuceneDocumentGenerator";
@Option(name = "-index", metaVar = "[path]", usage = "Index path.", required = true)
public String index;

// optional general arguments

Expand All @@ -129,8 +124,15 @@ public static class Args {

// optional arguments

@Option(name = "-index", metaVar = "[path]", usage = "Index path.")
public String index;
@Option(name = "-threads", metaVar = "[num]", usage = "Number of indexing threads.")
public int threads = 8;

@Option(name = "-append", usage = "Append documents.")
public boolean append = false;

@Option(name = "-generator", metaVar = "[class]",
usage = "Document generator class in package 'io.anserini.index.generator'.")
public String generatorClass = "DefaultLuceneDocumentGenerator";

@Option(name = "-fields", handler = StringArrayOptionHandler.class,
usage = "List of fields to index (space separated), in addition to the default 'contents' field.")
Expand Down
102 changes: 45 additions & 57 deletions src/main/java/io/anserini/index/SimpleIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,19 @@

package io.anserini.index;

import io.anserini.analysis.DefaultEnglishAnalyzer;
import io.anserini.analysis.AnalyzerMap;
import io.anserini.analysis.DefaultEnglishAnalyzer;
import io.anserini.analysis.HuggingFaceTokenizerAnalyzer;
import io.anserini.index.IndexCollection.Args;
import io.anserini.index.generator.DefaultLuceneDocumentGenerator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import io.anserini.collection.FileSegment;
import io.anserini.collection.JsonCollection;
import io.anserini.index.IndexCollection.Args;
import io.anserini.index.generator.GeneratorException;
import io.anserini.index.generator.LuceneDocumentGenerator;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
Expand All @@ -54,84 +53,73 @@ public class SimpleIndexer {
private final Analyzer analyzer;
private final LuceneDocumentGenerator generator;

public SimpleIndexer(String indexPath) throws IOException {
this.indexPath = Paths.get(indexPath);
if (!Files.exists(this.indexPath)) {
Files.createDirectories(this.indexPath);
}

analyzer = DefaultEnglishAnalyzer.newDefaultInstance();
generator = new DefaultLuceneDocumentGenerator();
final Directory dir = FSDirectory.open(this.indexPath);
final IndexWriterConfig config = new IndexWriterConfig(analyzer);

config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
config.setRAMBufferSizeMB(2048);
config.setUseCompoundFile(false);
config.setMergeScheduler(new ConcurrentMergeScheduler());
private static Args parseArgs(String[] argv) throws CmdLineException {
Args args = new Args();
CmdLineParser parser = new CmdLineParser(args);
parser.parseArgument(argv);

writer = new IndexWriter(dir, config);
return args;
}

/**
 * Creates an indexer from command-line style arguments.
 *
 * <p>The arguments are parsed into an {@code Args} object by {@code parseArgs} and construction
 * is then delegated to the {@code Args}-based constructor.
 *
 * @param argv option names and values, e.g. {@code "-index", path, "-collection", "JsonCollection"}
 * @throws Exception if the arguments cannot be parsed or the delegated constructor fails
 */
public SimpleIndexer(String[] argv) throws Exception {
this(parseArgs(argv));
}

private static Args parseArgs(String[] argv) throws CmdLineException {
Args args = new Args();
CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(100));
/**
 * Creates an indexer for the index at the given path without the append option.
 *
 * <p>Equivalent to {@code new SimpleIndexer(indexPath, false)}: the indexer is configured with an
 * empty input path, the given index path, and the {@code JsonCollection} collection class.
 *
 * @param indexPath location of the index
 * @throws Exception if the delegated constructor fails
 */
public SimpleIndexer(String indexPath) throws Exception {
  this(indexPath, false);
}

try {
parser.parseArgument(argv);
} catch (CmdLineException e) {
System.err.println(e.getMessage());
parser.printUsage(System.err);
System.err.println("Example: " + SimpleIndexer.class.getSimpleName() +
parser.printExample(OptionHandlerFilter.REQUIRED));
throw e;
}
return args;
/**
 * Creates an indexer for the index at the given path, optionally passing the append option.
 *
 * @param indexPath location of the index
 * @param append if {@code true}, the {@code -append} flag is added to the generated arguments
 * @throws Exception if the delegated constructor fails
 */
public SimpleIndexer(String indexPath, boolean append) throws Exception {
  // The delegating "this(...)" call must be the first statement of a constructor, so the
  // argument array is assembled in a static helper instead of in local statements.
  this(defaultArgv(indexPath, append));
}

/** Builds the argument array used by the path-based constructors. */
private static String[] defaultArgv(String indexPath, boolean append) {
  if (append) {
    return new String[] {
        "-input", "", "-index", indexPath, "-collection", "JsonCollection", "-append"};
  }
  return new String[] {"-input", "", "-index", indexPath, "-collection", "JsonCollection"};
}

@SuppressWarnings("unchecked")
public SimpleIndexer(Args args) throws Exception {
this.indexPath = Paths.get(args.index);
if (!Files.exists(this.indexPath)) {
Files.createDirectories(this.indexPath);
}
Class generatorClass = Class.forName("io.anserini.index.generator." + args.generatorClass);
generator = (LuceneDocumentGenerator)
generatorClass.getDeclaredConstructor(Args.class).newInstance(args);
generator = (LuceneDocumentGenerator) generatorClass.getDeclaredConstructor(Args.class).newInstance(args);
analyzer = getAnalyzer(args);

final Directory dir = FSDirectory.open(this.indexPath);
final IndexWriterConfig config = new IndexWriterConfig(analyzer);

config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
config.setOpenMode(args.append ? IndexWriterConfig.OpenMode.CREATE_OR_APPEND : IndexWriterConfig.OpenMode.CREATE);
config.setRAMBufferSizeMB(2048);
config.setUseCompoundFile(false);
config.setMergeScheduler(new ConcurrentMergeScheduler());

writer = new IndexWriter(dir, config);
}

private Analyzer getAnalyzer(Args args) throws Exception {
if (args.analyzeWithHuggingFaceTokenizer != null){
LOG.info("Bert Tokenizer");
return new HuggingFaceTokenizerAnalyzer(args.analyzeWithHuggingFaceTokenizer);
} else if (AnalyzerMap.analyzerMap.containsKey(args.language)){
LOG.info("Language: " + args.language);
return AnalyzerMap.getLanguageSpecificAnalyzer(args.language);
} else if (args.pretokenized || args.language.equals("sw")) {
LOG.info("Pretokenized");
return new WhitespaceAnalyzer();
} else {
// Default to English
LOG.info("Language: en");
LOG.info("Stemmer: " + args.stemmer);
LOG.info("Keep stopwords? " + args.keepStopwords);
LOG.info("Stopwords file: " + args.stopwords);
return DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepStopwords, args.stopwords);
/**
 * Selects the analyzer to index with, based on the supplied arguments.
 *
 * <p>Choices are tried in this order: a HuggingFace tokenizer analyzer when
 * {@code args.analyzeWithHuggingFaceTokenizer} is set; a language-specific analyzer when
 * {@code args.language} is a key of {@code AnalyzerMap.analyzerMap}; a whitespace analyzer when
 * {@code args.pretokenized} is set or the language is "sw"; otherwise the default English
 * analyzer built from the stemmer and stopword settings.
 *
 * @param args indexing arguments
 * @return the selected analyzer, or {@code null} if creating it failed (the failure is logged)
 */
private Analyzer getAnalyzer(Args args) {
  try {
    if (args.analyzeWithHuggingFaceTokenizer != null) {
      LOG.info("Using HuggingFaceTokenizerAnalyzer");
      return new HuggingFaceTokenizerAnalyzer(args.analyzeWithHuggingFaceTokenizer);
    } else if (AnalyzerMap.analyzerMap.containsKey(args.language)) {
      LOG.info("Using language-specific analyzer");
      LOG.info("Language: " + args.language);
      return AnalyzerMap.getLanguageSpecificAnalyzer(args.language);
    } else if (args.pretokenized || "sw".equals(args.language)) {
      // Constant-first comparison: a null language must fall through to the English default
      // instead of raising a NullPointerException that ends up as a null analyzer.
      LOG.info("Using WhitespaceAnalyzer");
      return new WhitespaceAnalyzer();
    } else {
      // Default to English
      LOG.info("Using DefaultEnglishAnalyzer");
      LOG.info("Stemmer: " + args.stemmer);
      LOG.info("Keep stopwords? " + args.keepStopwords);
      LOG.info("Stopwords file: " + args.stopwords);
      return DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepStopwords, args.stopwords);
    }
  } catch (Exception e) {
    // Keep the best-effort contract (null on failure), but record why analyzer creation failed
    // rather than discarding the exception silently.
    LOG.error("Unable to create analyzer", e);
    return null;
  }
}

Expand All @@ -141,10 +129,10 @@ public boolean addDocument(String raw) {
JsonCollection.Document doc = JsonCollection.Document.fromString(raw);
writer.addDocument(generator.createDocument(doc));
} catch (GeneratorException e) {
e.printStackTrace();
LOG.error(e);
return false;
} catch (IOException e) {
e.printStackTrace();
LOG.error(e);
return false;
}

Expand Down
Loading

0 comments on commit 2775842

Please sign in to comment.