Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to append to existing index #2062

Merged
merged 4 commits into from
Feb 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions src/main/java/io/anserini/analysis/AnalyzerMap.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,18 @@

package io.anserini.analysis;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;

import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.Map;

public class AnalyzerMap {
public static final Map<String, String> analyzerMap = new HashMap<String, String>() {
private static final Logger LOG = LogManager.getLogger(AnalyzerMap.class);

public static final Map<String, String> analyzerMap = new HashMap<>() {
{
put("ar", "org.apache.lucene.analysis.ar.ArabicAnalyzer");
put("bn", "org.apache.lucene.analysis.bn.BengaliAnalyzer");
Expand Down Expand Up @@ -51,8 +57,24 @@ public class AnalyzerMap {
}
};

public static Analyzer getLanguageSpecificAnalyzer(String language) throws Exception {
String analyzerClazz = analyzerMap.get(language);
return (Analyzer) Class.forName(analyzerClazz).getDeclaredConstructor().newInstance();
/**
 * Instantiates the analyzer registered for a language code in {@link #analyzerMap}.
 *
 * <p>The analyzer class is looked up by name, loaded reflectively, and created through its no-argument
 * constructor. Failures are logged and swallowed rather than propagated to the caller.
 *
 * @param language language code used as the key into {@code analyzerMap}, e.g. {@code "ar"}
 * @return a new analyzer instance, or {@code null} if the language has no entry in the map or the
 *     analyzer class cannot be loaded or instantiated
 */
public static Analyzer getLanguageSpecificAnalyzer(String language) {
  String analyzerClazz = analyzerMap.get(language);
  if (analyzerClazz == null) {
    // Without this guard, Class.forName(null) fails with an unchecked NullPointerException that the
    // catch clause below does not cover, so the method would throw instead of returning null.
    LOG.error("No analyzer registered for language: {}", language);
    return null;
  }

  try {
    return (Analyzer) Class.forName(analyzerClazz).getDeclaredConstructor().newInstance();
  } catch (ReflectiveOperationException e) {
    // ReflectiveOperationException is the common supertype of ClassNotFoundException,
    // NoSuchMethodException, InstantiationException, IllegalAccessException and
    // InvocationTargetException, all of which were handled identically.
    LOG.error("Unable to instantiate analyzer " + analyzerClazz, e);
  }

  // If we have any issues, eat the exception and return null.
  return null;
}
}
20 changes: 11 additions & 9 deletions src/main/java/io/anserini/index/IndexCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,17 +105,12 @@ public static class Args {
usage = "Location of input collection.")
public String input;

@Option(name = "-threads", metaVar = "[num]", required = true,
usage = "Number of indexing threads.")
public int threads;

@Option(name = "-collection", metaVar = "[class]", required = true,
usage = "Collection class in package 'io.anserini.collection'.")
public String collectionClass;

@Option(name = "-generator", metaVar = "[class]",
usage = "Document generator class in package 'io.anserini.index.generator'.")
public String generatorClass = "DefaultLuceneDocumentGenerator";
@Option(name = "-index", metaVar = "[path]", usage = "Index path.", required = true)
public String index;

// optional general arguments

Expand All @@ -129,8 +124,15 @@ public static class Args {

// optional arguments

@Option(name = "-index", metaVar = "[path]", usage = "Index path.")
public String index;
@Option(name = "-threads", metaVar = "[num]", usage = "Number of indexing threads.")
public int threads = 8;

@Option(name = "-append", usage = "Append documents.")
public boolean append = false;

@Option(name = "-generator", metaVar = "[class]",
usage = "Document generator class in package 'io.anserini.index.generator'.")
public String generatorClass = "DefaultLuceneDocumentGenerator";

@Option(name = "-fields", handler = StringArrayOptionHandler.class,
usage = "List of fields to index (space separated), in addition to the default 'contents' field.")
Expand Down
102 changes: 45 additions & 57 deletions src/main/java/io/anserini/index/SimpleIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,19 @@

package io.anserini.index;

import io.anserini.analysis.DefaultEnglishAnalyzer;
import io.anserini.analysis.AnalyzerMap;
import io.anserini.analysis.DefaultEnglishAnalyzer;
import io.anserini.analysis.HuggingFaceTokenizerAnalyzer;
import io.anserini.index.IndexCollection.Args;
import io.anserini.index.generator.DefaultLuceneDocumentGenerator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import io.anserini.collection.FileSegment;
import io.anserini.collection.JsonCollection;
import io.anserini.index.IndexCollection.Args;
import io.anserini.index.generator.GeneratorException;
import io.anserini.index.generator.LuceneDocumentGenerator;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
Expand All @@ -54,84 +53,73 @@ public class SimpleIndexer {
private final Analyzer analyzer;
private final LuceneDocumentGenerator generator;

public SimpleIndexer(String indexPath) throws IOException {
this.indexPath = Paths.get(indexPath);
if (!Files.exists(this.indexPath)) {
Files.createDirectories(this.indexPath);
}

analyzer = DefaultEnglishAnalyzer.newDefaultInstance();
generator = new DefaultLuceneDocumentGenerator();
final Directory dir = FSDirectory.open(this.indexPath);
final IndexWriterConfig config = new IndexWriterConfig(analyzer);

config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
config.setRAMBufferSizeMB(2048);
config.setUseCompoundFile(false);
config.setMergeScheduler(new ConcurrentMergeScheduler());
private static Args parseArgs(String[] argv) throws CmdLineException {
Args args = new Args();
CmdLineParser parser = new CmdLineParser(args);
parser.parseArgument(argv);

writer = new IndexWriter(dir, config);
return args;
}

/**
 * Creates an indexer from command-line style arguments.
 *
 * <p>The arguments are converted into an {@link Args} instance by {@code parseArgs} and then handed to the
 * {@code Args}-based constructor, which performs the actual setup.
 *
 * @param argv option names and values in the form accepted by {@code parseArgs}
 * @throws Exception if the arguments cannot be parsed or the delegated constructor fails
 */
public SimpleIndexer(String[] argv) throws Exception {
this(parseArgs(argv));
}

private static Args parseArgs(String[] argv) throws CmdLineException {
Args args = new Args();
CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(100));
/**
 * Creates an indexer that writes to the given index path without appending.
 *
 * <p>Equivalent to invoking the two-argument constructor with {@code append} set to {@code false}: the
 * resulting argument vector carries an empty {@code -input}, the supplied {@code -index} path, and
 * {@code JsonCollection} as the {@code -collection}.
 *
 * @param indexPath location of the index
 * @throws Exception if argument parsing or indexer setup fails
 */
public SimpleIndexer(String indexPath) throws Exception {
  this(indexPath, false);
}

try {
parser.parseArgument(argv);
} catch (CmdLineException e) {
System.err.println(e.getMessage());
parser.printUsage(System.err);
System.err.println("Example: " + SimpleIndexer.class.getSimpleName() +
parser.printExample(OptionHandlerFilter.REQUIRED));
throw e;
}
return args;
/**
 * Creates an indexer that writes to the given index path, optionally in append mode.
 *
 * @param indexPath location of the index
 * @param append whether the {@code -append} flag is added to the generated arguments
 * @throws Exception if argument parsing or indexer setup fails
 */
public SimpleIndexer(String indexPath, boolean append) throws Exception {
  // No statement may precede this(...), so the argument vector is assembled in a static helper.
  this(argvFor(indexPath, append));
}

/**
 * Builds the argument vector used by the path-based constructors: an empty {@code -input}, the given
 * {@code -index} path, {@code JsonCollection} as the collection, and {@code -append} when requested.
 */
private static String[] argvFor(String indexPath, boolean append) {
  if (append) {
    return new String[] {"-input", "", "-index", indexPath, "-collection", "JsonCollection", "-append"};
  }
  return new String[] {"-input", "", "-index", indexPath, "-collection", "JsonCollection"};
}

@SuppressWarnings("unchecked")
public SimpleIndexer(Args args) throws Exception {
this.indexPath = Paths.get(args.index);
if (!Files.exists(this.indexPath)) {
Files.createDirectories(this.indexPath);
}
Class generatorClass = Class.forName("io.anserini.index.generator." + args.generatorClass);
generator = (LuceneDocumentGenerator)
generatorClass.getDeclaredConstructor(Args.class).newInstance(args);
generator = (LuceneDocumentGenerator) generatorClass.getDeclaredConstructor(Args.class).newInstance(args);
analyzer = getAnalyzer(args);

final Directory dir = FSDirectory.open(this.indexPath);
final IndexWriterConfig config = new IndexWriterConfig(analyzer);

config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
config.setOpenMode(args.append ? IndexWriterConfig.OpenMode.CREATE_OR_APPEND : IndexWriterConfig.OpenMode.CREATE);
config.setRAMBufferSizeMB(2048);
config.setUseCompoundFile(false);
config.setMergeScheduler(new ConcurrentMergeScheduler());

writer = new IndexWriter(dir, config);
}

private Analyzer getAnalyzer(Args args) throws Exception {
if (args.analyzeWithHuggingFaceTokenizer != null){
LOG.info("Bert Tokenizer");
return new HuggingFaceTokenizerAnalyzer(args.analyzeWithHuggingFaceTokenizer);
} else if (AnalyzerMap.analyzerMap.containsKey(args.language)){
LOG.info("Language: " + args.language);
return AnalyzerMap.getLanguageSpecificAnalyzer(args.language);
} else if (args.pretokenized || args.language.equals("sw")) {
LOG.info("Pretokenized");
return new WhitespaceAnalyzer();
} else {
// Default to English
LOG.info("Language: en");
LOG.info("Stemmer: " + args.stemmer);
LOG.info("Keep stopwords? " + args.keepStopwords);
LOG.info("Stopwords file: " + args.stopwords);
return DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepStopwords, args.stopwords);
/**
 * Chooses the analyzer to index with, based on the supplied arguments.
 *
 * <p>Checked in order: a HuggingFace tokenizer if one is configured; a language-specific analyzer if
 * {@code args.language} is a key of {@code AnalyzerMap.analyzerMap}; a {@link WhitespaceAnalyzer} if the
 * input is pretokenized or the language is {@code "sw"}; otherwise the default English analyzer built
 * from the stemmer and stopword settings.
 *
 * @param args indexing arguments
 * @return the selected analyzer, or {@code null} if constructing it throws (the failure is logged)
 */
private Analyzer getAnalyzer(Args args) {
  try {
    if (args.analyzeWithHuggingFaceTokenizer != null) {
      LOG.info("Using HuggingFaceTokenizerAnalyzer");
      return new HuggingFaceTokenizerAnalyzer(args.analyzeWithHuggingFaceTokenizer);
    }

    if (AnalyzerMap.analyzerMap.containsKey(args.language)) {
      LOG.info("Using language-specific analyzer");
      LOG.info("Language: " + args.language);
      return AnalyzerMap.getLanguageSpecificAnalyzer(args.language);
    }

    // Constant-first comparison so an unset language falls through to the English default
    // instead of raising a NullPointerException.
    if (args.pretokenized || "sw".equals(args.language)) {
      LOG.info("Using WhitespaceAnalyzer");
      return new WhitespaceAnalyzer();
    }

    // Default to English
    LOG.info("Using DefaultEnglishAnalyzer");
    LOG.info("Stemmer: " + args.stemmer);
    LOG.info("Keep stopwords? " + args.keepStopwords);
    LOG.info("Stopwords file: " + args.stopwords);
    return DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepStopwords, args.stopwords);
  } catch (Exception e) {
    // Still returns null as before, but records why so the failure is diagnosable.
    LOG.error("Unable to construct analyzer", e);
    return null;
  }
}

Expand All @@ -141,10 +129,10 @@ public boolean addDocument(String raw) {
JsonCollection.Document doc = JsonCollection.Document.fromString(raw);
writer.addDocument(generator.createDocument(doc));
} catch (GeneratorException e) {
e.printStackTrace();
LOG.error(e);
return false;
} catch (IOException e) {
e.printStackTrace();
LOG.error(e);
return false;
}

Expand Down
Loading