/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.nio.file.Files; import java.nio.file.Paths; import java.util.Arrays; import java.util.Locale; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; import org.apache.lucene.document.IntPoint; import org.apache.lucene.document.StringField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.TextField; import org.apache.lucene.document.StoredField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TermInSetQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; // java -cp /home/rmuir/workspace/lucene/lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar StringSetBenchmark.java /home/rmuir/Downloads/allCountries.txt /home/rmuir/Downloads/testIndex -1 /** Benchmark set queries on lines of Geonames. */ public class StringSetBenchmark { public static void main(String args[]) throws Exception { if (args.length != 3) { System.err.println("Usage: StringSetBenchmark /path/to/geonames.txt /path/to/index/dir doc_limit(or -1 means index all lines)"); System.exit(2); } String geonamesDataPath = args[0]; String indexPath = args[1]; int docLimit = Integer.parseInt(args[2]); IOUtils.rm(Paths.get(indexPath)); try (FSDirectory dir = FSDirectory.open(Paths.get(indexPath))) { System.err.println("Now run indexing"); IndexWriterConfig config = new IndexWriterConfig(); try (IndexWriter iw = new IndexWriter(dir, config); LineNumberReader reader = new LineNumberReader(new InputStreamReader(Files.newInputStream(Paths.get(geonamesDataPath))))) { long t0 = System.nanoTime(); indexDocs(iw, reader, docLimit); System.out.println(String.format(Locale.ROOT, "Indexing time: %d msec", (System.nanoTime() - t0) / 1_000_000)); } System.err.println("Index files: " + Arrays.toString(dir.listAll())); try (DirectoryReader reader = DirectoryReader.open(dir)) { System.out.print("BIG_BIG: "); doBench(reader, BIG_BIG); System.out.print("MEDIUM_BIG: "); doBench(reader, MEDIUM_BIG); System.out.print("SMALL_BIG: "); doBench(reader, SMALL_BIG); System.out.println(); System.out.print("BIG_MEDIUM: "); doBench(reader, BIG_MEDIUM); System.out.print("MEDIUM_MEDIUM: "); doBench(reader, MEDIUM_MEDIUM); System.out.print("SMALL_MEDIUM: "); doBench(reader, SMALL_MEDIUM); } } System.out.println("dummy=" + DUMMY); } static void doBench(IndexReader reader, String[] queries) throws Exception { int iters = 300; // warmup for (int i = 0; i < iters; ++i) { getDocs(reader, queries, true); getDocs(reader, queries, false); } // Take the min across multiple runs to decrease noise long minDurationNS = Long.MAX_VALUE; for (int i = 0; i < iters; ++i) { long t0 = System.nanoTime(); getDocs(reader, queries, true); minDurationNS = Math.min(minDurationNS, System.nanoTime() - t0); } System.out.print(String.format(Locale.ROOT, "count=%.5f msec", minDurationNS / 1_000_000.)); minDurationNS = Long.MAX_VALUE; for (int i = 0; i < iters; ++i) { long t0 = System.nanoTime(); getDocs(reader, queries, false); minDurationNS = Math.min(minDurationNS, System.nanoTime() - t0); } System.out.println(String.format(Locale.ROOT, " search=%.5f msec", minDurationNS / 1_000_000.)); } static void indexDocs(IndexWriter iw, LineNumberReader reader, int docLimit) throws Exception { Document doc = new Document(); TextField name = new TextField("name", "", Field.Store.NO); doc.add(name); StringField admin2 = new StringField("admin2", "", Field.Store.NO); doc.add(admin2); SortedDocValuesField admin2dv = new SortedDocValuesField("admin2", new BytesRef()); doc.add(admin2dv); String line = null; while ((line = reader.readLine()) != null) { if (reader.getLineNumber() % 10000 == 0) { System.err.println("doc: " + reader.getLineNumber()); } if (docLimit != -1 && reader.getLineNumber() == docLimit) { break; } String values[] = line.split("\t"); name.setStringValue(values[1]); // we use Integer.parseInt to "fold" some values such as 020 -> 20 // at least it keeps it consistent with the NumSetBenchmark as far as how it is indexed. int val = 0; try { val = Integer.parseInt(values[11]); } catch (Exception e) {} String stringVal = Integer.toString(val); admin2.setStringValue(stringVal); admin2dv.setBytesValue(new BytesRef(stringVal)); iw.addDocument(doc); } } static int DUMMY; // for histogram: // cut -f 11 allCountries.txt | sort | uniq -c | sort -nr | more // "smaller numbers are more common" static final String[] BIG_BIG = new String[] { "la|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 28930 hits "de|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 23465 hits "saint|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 14870 hits "canyon|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 10946 hits }; static final String[] MEDIUM_BIG = new String[] { "hotel|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 4144 hits "del|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 3818 hits "les|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 2518 hits "plaza|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 1827 hits "parc|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 1434 hits "by|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 995 hits }; static final String[] SMALL_BIG = new String[] { "channel|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 495 hits "centre|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 403 hits "st|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 353 hits "imperial|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 123 hits "silent|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 23 hits "sant|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 4 hits "andorra|1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", // 3 hits }; static final String[] BIG_MEDIUM = new String[] { "la|21,22,23", // 2226 hits "de|21,22,23", // 1555 hits "saint|21,22,23", // 1571 hits "canyon|21,22,23", // 1552 hits }; static final String[] MEDIUM_MEDIUM = new String[] { "hotel|21,22,23", // 319 hits "del|21,22,23", // 227 hits "les|21,22,23", // 140 hits "plaza|21,22,23", // 7 hits "parc|21,22,23", // 64 hits "by|21,22,23", // 337 hits }; static final String[] SMALL_MEDIUM = new String[] { "channel|21,22,23", // 24 hits "centre|21,22,23", // 55 hits "st|21,22,23", // 4 hits "imperial|21,22,23", // 6 hits "silent|21,22,23", // 1 hits "sant|21,22,23", // 0 hits "andorra|21,22,23", // 62 hits }; static void getDocs(IndexReader reader, String[] queries, boolean doCount) throws IOException { IndexSearcher searcher = new IndexSearcher(reader); searcher.setQueryCache(null); // benchmarking for (String textQuery : queries) { String parts[] = textQuery.split("\\|"); String intParts[] = parts[1].split(","); BytesRef queryTerms[] = new BytesRef[intParts.length]; for (int i = 0; i < intParts.length; i++) { queryTerms[i] = new BytesRef(intParts[i]); } BooleanQuery.Builder builder = new BooleanQuery.Builder(); builder.add(new BooleanClause(new TermQuery(new Term("name", parts[0])), BooleanClause.Occur.MUST)); builder.add(new BooleanClause(new TermInSetQuery("admin2", queryTerms), BooleanClause.Occur.MUST)); if (doCount) { int hits = searcher.count(builder.build()); DUMMY += hits; } else { int hits = (int) searcher.search(builder.build(), 10).totalHits.value; DUMMY += hits; } } } }