From eaed8e67d2a6ac601a1cdf5cfcfc647e41740936 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 12 Jan 2010 20:06:17 +0000 Subject: [PATCH] LUCENE-2181: add a benchmark for collation git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@898491 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/benchmark/CHANGES.txt | 10 ++ contrib/benchmark/build.xml | 59 +++++++- contrib/benchmark/conf/collation.alg | 97 +++++++++++++ .../benchmark/scripts/collation.bm2jira.pl | 63 +++++++++ .../compare.collation.benchmark.tables.pl | 91 ++++++++++++ .../lucene/benchmark/byTask/PerfRunData.java | 16 +++ .../tasks/NewCollationAnalyzerTask.java | 117 ++++++++++++++++ .../benchmark/byTask/tasks/NewLocaleTask.java | 89 ++++++++++++ .../byTask/tasks/ReadTokensTask.java | 2 + .../benchmark/byTask/TestPerfTasksLogic.java | 131 +++++++++++++++++- 10 files changed, 671 insertions(+), 4 deletions(-) create mode 100644 contrib/benchmark/conf/collation.alg create mode 100644 contrib/benchmark/scripts/collation.bm2jira.pl create mode 100644 contrib/benchmark/scripts/compare.collation.benchmark.tables.pl create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewCollationAnalyzerTask.java create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewLocaleTask.java diff --git a/contrib/benchmark/CHANGES.txt b/contrib/benchmark/CHANGES.txt index 7ab24b170520..16a244c0176e 100644 --- a/contrib/benchmark/CHANGES.txt +++ b/contrib/benchmark/CHANGES.txt @@ -4,6 +4,16 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety $Id:$ +1/11/2010 + LUCENE-2181: Add a benchmark for collation. This adds NewLocaleTask, + which sets a Locale in the run data for collation to use, and can be + used in the future for benchmarking localized range queries and sorts. + Also add NewCollationAnalyzerTask, which works with both JDK and ICU + Collator implementations. Fix ReadTokensTask to not tokenize fields + unless they should be tokenized according to DocMaker config. The + easiest way to run the benchmark is to run 'ant collation' + (Steven Rowe via Robert Muir) + 12/22/2009 LUCENE-2178: Allow multiple locations to add to the class path with -Dbenchmark.ext.classpath=... when running "ant run-task" (Steven diff --git a/contrib/benchmark/build.xml b/contrib/benchmark/build.xml index 6c283afa4f03..12939955a055 100644 --- a/contrib/benchmark/build.xml +++ b/contrib/benchmark/build.xml @@ -24,7 +24,10 @@ - + + @@ -94,6 +97,27 @@ + + + + + + + + + + + + + + + + + @@ -141,6 +165,34 @@ + + + + + + + + + + + + Running contrib/benchmark with alg file: ${collation.alg.file} + + + + + Benchmark output is in file: ${collation.output.file} + Converting to JIRA table format... + + + + + Benchmark output in JIRA table format is in file: ${collation.jira.output.file} + + @@ -151,6 +203,11 @@ + + + + + diff --git a/contrib/benchmark/conf/collation.alg b/contrib/benchmark/conf/collation.alg new file mode 100644 index 000000000000..64fe6f32bca3 --- /dev/null +++ b/contrib/benchmark/conf/collation.alg @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource +content.source.encoding=UTF-8 +doc.tokenized=false +doc.body.tokenized=true +docs.file=work/top100k-out/top.fr.wikipedia.words.txt +content.source.forever=false +log.step=100000 + +{ "Rounds" + -NewAnalyzer(KeywordAnalyzer) + -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt) + ResetInputs + { "FrenchKeyword" { ReadTokens > : * ResetInputs } : 10 + + -NewAnalyzer(KeywordAnalyzer) + -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt) + ResetInputs + { "GermanKeyword" { ReadTokens > : * ResetInputs } : 10 + + -NewAnalyzer(KeywordAnalyzer) + -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt) + ResetInputs + { "UkrainianKeyword" { ReadTokens > : * ResetInputs } : 10 + + -NewAnalyzer(KeywordAnalyzer) + -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt) + ResetInputs + { "EnglishKeyword" { ReadTokens > : * ResetInputs } : 10 + + -NewLocale(fr) + -NewCollationAnalyzer + -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt) + ResetInputs + { "FrenchJDK" { ReadTokens > : * ResetInputs } : 10 + + -NewLocale(de) + -NewCollationAnalyzer + -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt) + ResetInputs + { "GermanJDK" { ReadTokens > : * ResetInputs } : 10 + + -NewLocale(uk) + -NewCollationAnalyzer + -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt) + ResetInputs + { "UkrainianJDK" { ReadTokens > : * ResetInputs } : 10 + + -NewLocale(en) + -NewCollationAnalyzer + -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt) + ResetInputs + { "EnglishJDK" { ReadTokens > : * ResetInputs } : 10 + + -NewLocale(fr) + -NewCollationAnalyzer(impl:icu) + -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt) + ResetInputs + { "FrenchICU" { ReadTokens > : * ResetInputs } : 10 + + -NewLocale(de) + -NewCollationAnalyzer(impl:icu) + -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt) + ResetInputs + { "GermanICU" { ReadTokens > : * ResetInputs } : 10 + + -NewLocale(uk) + -NewCollationAnalyzer(impl:icu) + -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt) + ResetInputs + { "UkrainianICU" { ReadTokens > : * ResetInputs } : 10 + + -NewLocale(en) + -NewCollationAnalyzer(impl:icu) + -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt) + ResetInputs + { "EnglishICU" { ReadTokens > : * ResetInputs } : 10 + + NewRound + +} : 5 + +RepSumByNameRound diff --git a/contrib/benchmark/scripts/collation.bm2jira.pl b/contrib/benchmark/scripts/collation.bm2jira.pl new file mode 100644 index 000000000000..b423f75ee8a8 --- /dev/null +++ b/contrib/benchmark/scripts/collation.bm2jira.pl @@ -0,0 +1,63 @@ +#!/usr/bin/perl +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ---------- +# bm2jira.pl +# +# Converts Lucene contrib-benchmark output produced using the +# benchmark.collation.alg file into a JIRA-formatted table. +# + +use strict; +use warnings; + +my %min_elapsed = (); + +while (<>) { + if (/(\S+)(Keyword|JDK|ICU)_\d+\s*([^\s{].*)/) { + my $lang = $1; + my $analyzer = $2; + my $stats = $3; + my ($elapsed) = $stats =~ /(?:[\d,.]+[-\s]*){4}([.\d]+)/; + $min_elapsed{$analyzer}{$lang} = $elapsed + unless (defined($min_elapsed{$analyzer}{$lang}) + && $elapsed >= $min_elapsed{$analyzer}{$lang}); + } +} + +# Print out platform info +print "JAVA:\n", `java -version 2>&1`, "\nOS:\n"; +if ($^O =~ /win/i) { + print "$^O\n"; + eval { + require Win32; + print Win32::GetOSName(), "\n", Win32::GetOSVersion(), "\n"; + }; + die "Error loading Win32: $@" if ($@); +} else { + print `uname -a 2>&1`; +} + +print "\n||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||\n"; + +for my $lang (sort keys %{$min_elapsed{ICU}}) { + my $ICU = $min_elapsed{ICU}{$lang}; + my $JDK = $min_elapsed{JDK}{$lang}; + my $keyword = $min_elapsed{Keyword}{$lang}; + my $improved = int(100 * ($JDK - $ICU) / ($ICU - $keyword) + 0.5); + printf "|$lang|${JDK}s|${ICU}s|${keyword}s|\%d%%|\n", $improved; +} diff --git a/contrib/benchmark/scripts/compare.collation.benchmark.tables.pl b/contrib/benchmark/scripts/compare.collation.benchmark.tables.pl new file mode 100644 index 000000000000..bd941761b268 --- /dev/null +++ b/contrib/benchmark/scripts/compare.collation.benchmark.tables.pl @@ -0,0 +1,91 @@ +#!/usr/bin/perl +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ------------------------------------------ +# compare.collation.benchmark.jira.tables.pl +# +# Takes as cmdline parameters two JIRA-formatted benchmark results, as produced +# by bm2jira.pl (located in the same directory as this script), and outputs a +# third JIRA-formatted comparison table, showing the differences between two +# benchmarking runs' java.text and ICU4J columns, after accounting for the +# KeywordAnalyzer column; the "ICU4J Improvement" column is ignored. 
+# +# The difference is calculated as a percentage: +# +# 100 * (patched-rate - unpatched-rate / unpatched-rate) +# +# where the (un)patched-rate is: +# +# 1 / ( elapsed-(un)patched-time - elapsed-KeywordAnalyzer-time) +# + +use strict; +use warnings; + +my $usage = "Usage: $0 \n"; + +die $usage unless ($#ARGV == 1 && -f $ARGV[0] && -f $ARGV[1]); + +my %stats = (); + +open UNPATCHED, "<$ARGV[0]" || die "ERROR opening '$ARGV[0]': $!"; +while () { + # ||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement|| + # |English|4.51s|2.47s|1.47s|204%| + next unless (/^\|([^|]+)\|([^|s]+)s\|([^|s]+)s\|([^|s]+)s/); + my ($lang, $jdk_elapsed, $icu_elapsed, $keyword_analyzer_elapsed) + = ($1, $2, $3, $4); + $stats{unpatched}{$lang}{jdk} = $jdk_elapsed; + $stats{unpatched}{$lang}{icu} = $icu_elapsed; + $stats{unpatched}{$lang}{keyword_analyzer} = $keyword_analyzer_elapsed; +} +close UNPATCHED; + +open PATCHED, "<$ARGV[1]" || die "ERROR opening '$ARGV[1]': $!"; +while () { + # ||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement|| + # |English|4.51s|2.47s|1.47s|204%| + next unless (/^\|([^|]+)\|([^|s]+)s\|([^|s]+)s\|([^|s]+)s/); + my ($lang, $jdk_elapsed, $icu_elapsed, $keyword_analyzer_elapsed) + = ($1, $2, $3, $4); + $stats{patched}{$lang}{jdk} = $jdk_elapsed; + $stats{patched}{$lang}{icu} = $icu_elapsed; + $stats{patched}{$lang}{keyword_analyzer} = $keyword_analyzer_elapsed; +} +close PATCHED; + +print "||Language||java.text improvement||ICU4J improvement||\n"; +for my $lang (sort keys %{$stats{unpatched}}) { + my $keyword_analyzer1 = $stats{unpatched}{$lang}{keyword_analyzer}; + my $jdk1 = $stats{unpatched}{$lang}{jdk}; + my $jdk_diff1 = $jdk1 - $keyword_analyzer1; + my $icu1 = $stats{unpatched}{$lang}{icu}; + my $icu_diff1 = $icu1 - $keyword_analyzer1; + + my $keyword_analyzer2 = $stats{patched}{$lang}{keyword_analyzer}; + my $jdk2 = $stats{patched}{$lang}{jdk}; + my $jdk_diff2 = $jdk2 - $keyword_analyzer2; + my $icu2 = $stats{patched}{$lang}{icu}; + my $icu_diff2 = $icu2 - $keyword_analyzer2; + + my $jdk_impr + = int((1./$jdk_diff2 - 1./$jdk_diff1) / (1./$jdk_diff1) * 1000 + 5) / 10; + my $icu_impr + = int((1./$icu_diff2 - 1./$icu_diff1) / (1./$icu_diff1) * 1000 + 5) / 10; + + printf "|$lang|%2.1f%%|%2.1f%%|\n", $jdk_impr, $icu_impr; +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java index fa301cc25d88..64f5731f666c 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.IOException; import java.util.HashMap; +import java.util.Locale; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; @@ -61,6 +62,7 @@ public class PerfRunData { private Directory directory; private Analyzer analyzer; private DocMaker docMaker; + private Locale locale; // we use separate (identical) instances for each "read" task type, so each can iterate the quries separately. private HashMap,QueryMaker> readTaskQueryMaker; @@ -244,6 +246,20 @@ public DocMaker getDocMaker() { return docMaker; } + /** + * @return the locale + */ + public Locale getLocale() { + return locale; + } + + /** + * @param locale the locale to set + */ + public void setLocale(Locale locale) { + this.locale = locale; + } + /** * @return Returns the config. 
*/ diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewCollationAnalyzerTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewCollationAnalyzerTask.java new file mode 100644 index 000000000000..2dd29ece7227 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewCollationAnalyzerTask.java @@ -0,0 +1,117 @@ +package org.apache.lucene.benchmark.byTask.tasks; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; +import java.util.Locale; +import java.util.StringTokenizer; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.benchmark.byTask.PerfRunData; + +/** + * Task to support benchmarking collation. + *

+ * <ul>
+ *  <li> NewCollationAnalyzer with the default jdk impl
+ *  <li> NewCollationAnalyzer(impl:icu) specify an impl (jdk,icu)
+ * </ul>
+ */ +public class NewCollationAnalyzerTask extends PerfTask { + public enum Implementation { + JDK("org.apache.lucene.collation.CollationKeyAnalyzer", + "java.text.Collator"), + ICU("org.apache.lucene.collation.ICUCollationKeyAnalyzer", + "com.ibm.icu.text.Collator"); + + String className; + String collatorClassName; + + Implementation(String className, String collatorClassName) { + this.className = className; + this.collatorClassName = collatorClassName; + } + } + + private Implementation impl = Implementation.JDK; + + public NewCollationAnalyzerTask(PerfRunData runData) { + super(runData); + } + + static Analyzer createAnalyzer(Locale locale, Implementation impl) + throws Exception { + final Class collatorClazz = Class.forName(impl.collatorClassName); + Method collatorMethod = collatorClazz.getMethod("getInstance", + new Class[] {Locale.class}); + Object collator = collatorMethod.invoke(null, locale); + + final Class clazz = Class.forName(impl.className) + .asSubclass(Analyzer.class); + Constructor ctor = clazz.getConstructor(collatorClazz); + return ctor.newInstance(collator); + } + + @Override + public int doLogic() throws Exception { + try { + Locale locale = getRunData().getLocale(); + if (locale == null) throw new RuntimeException( + "Locale must be set with the NewLocale task!"); + Analyzer analyzer = createAnalyzer(locale, impl); + getRunData().setAnalyzer(analyzer); + System.out.println("Changed Analyzer to: " + + analyzer.getClass().getName() + "(" + locale + ")"); + } catch (Exception e) { + throw new RuntimeException("Error creating Analyzer: impl=" + impl, e); + } + return 1; + } + + @Override + public void setParams(String params) { + super.setParams(params); + + StringTokenizer st = new StringTokenizer(params, ","); + while (st.hasMoreTokens()) { + String param = st.nextToken(); + StringTokenizer expr = new StringTokenizer(param, ":"); + String key = expr.nextToken(); + String value = expr.nextToken(); + // for now we only support the "impl" parameter. + // TODO: add strength, decomposition, etc + if (key.equals("impl")) { + if (value.equalsIgnoreCase("icu")) + impl = Implementation.ICU; + else if (value.equalsIgnoreCase("jdk")) + impl = Implementation.JDK; + else + throw new RuntimeException("Unknown parameter " + param); + } else { + throw new RuntimeException("Unknown parameter " + param); + } + } + } + + @Override + public boolean supportsParams() { + return true; + } +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewLocaleTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewLocaleTask.java new file mode 100644 index 000000000000..196af26c7225 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewLocaleTask.java @@ -0,0 +1,89 @@ +package org.apache.lucene.benchmark.byTask.tasks; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Locale; +import java.util.StringTokenizer; + +import org.apache.lucene.benchmark.byTask.PerfRunData; + +/** + * Set a {@link java.util.Locale} for use in benchmarking. + *

+ * Locales can be specified in the following ways:
+ * <ul>
+ *  <li>de: Language "de"
+ *  <li>en,US: Language "en", country "US"
+ *  <li>no,NO,NY: Language "no", country "NO", variant "NY"
+ *  <li>ROOT: The root (language-agnostic) Locale
+ *  <li>&lt;empty string&gt;: Erase the Locale (null)
+ * </ul>
+ */ +public class NewLocaleTask extends PerfTask { + private String language; + private String country; + private String variant; + + /** + * Create a new {@link java.util.Locale} and set it it in the getRunData() for + * use by all future tasks. + */ + public NewLocaleTask(PerfRunData runData) { + super(runData); + } + + static Locale createLocale(String language, String country, String variant) { + if (language == null || language.length() == 0) + return null; + + String lang = language; + if (lang.equalsIgnoreCase("ROOT")) + lang = ""; // empty language is the root locale in the JDK + + return new Locale(lang, country, variant); + } + + @Override + public int doLogic() throws Exception { + Locale locale = createLocale(language, country, variant); + getRunData().setLocale(locale); + System.out.println("Changed Locale to: " + + (locale == null ? "null" : + (locale.getDisplayName().length() == 0) ? "root locale" : locale)); + return 1; + } + + @Override + public void setParams(String params) { + super.setParams(params); + language = country = variant = ""; + StringTokenizer st = new StringTokenizer(params, ","); + if (st.hasMoreTokens()) + language = st.nextToken(); + if (st.hasMoreTokens()) + country = st.nextToken(); + if (st.hasMoreTokens()) + variant = st.nextToken(); + } + + @Override + public boolean supportsParams() { + return true; + } +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java index f47d304b6586..5aed1eabae1e 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java @@ -67,6 +67,8 @@ public int doLogic() throws Exception { Analyzer analyzer = getRunData().getAnalyzer(); int tokenCount = 0; for(final Fieldable field : fields) { + if (!field.isTokenized()) continue; + final TokenStream stream; final TokenStream streamValue = field.tokenStreamValue(); diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java index 9bff7a120f62..3611b294c654 100755 --- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java +++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java @@ -21,15 +21,23 @@ import java.io.File; import java.io.FileReader; import java.io.BufferedReader; +import java.text.Collator; import java.util.List; import java.util.Iterator; +import java.util.Locale; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker; import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask; import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask; import org.apache.lucene.benchmark.byTask.stats.TaskStats; +import org.apache.lucene.collation.CollationKeyAnalyzer; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.SerialMergeScheduler; @@ -464,9 +472,13 @@ public void testReadTokens() 
throws Exception { TermDocs termDocs = reader.termDocs(); int totalTokenCount2 = 0; while(terms.next()) { - termDocs.seek(terms.term()); - while(termDocs.next()) - totalTokenCount2 += termDocs.freq(); + Term term = terms.term(); + /* not-tokenized, but indexed field */ + if (term != null && term.field() != DocMaker.ID_FIELD) { + termDocs.seek(terms.term()); + while (termDocs.next()) + totalTokenCount2 += termDocs.freq(); + } } reader.close(); @@ -850,6 +862,119 @@ private static String[] disableCountingLines (boolean disable) { }; } + /** + * Test that we can change the Locale in the runData, + * that it is parsed as we expect. + */ + public void testLocale() throws Exception { + // empty Locale: clear it (null) + Benchmark benchmark = execBenchmark(getLocaleConfig("")); + assertNull(benchmark.getRunData().getLocale()); + + // ROOT locale + benchmark = execBenchmark(getLocaleConfig("ROOT")); + assertEquals(new Locale(""), benchmark.getRunData().getLocale()); + + // specify just a language + benchmark = execBenchmark(getLocaleConfig("de")); + assertEquals(new Locale("de"), benchmark.getRunData().getLocale()); + + // specify language + country + benchmark = execBenchmark(getLocaleConfig("en,US")); + assertEquals(new Locale("en", "US"), benchmark.getRunData().getLocale()); + + // specify language + country + variant + benchmark = execBenchmark(getLocaleConfig("no,NO,NY")); + assertEquals(new Locale("no", "NO", "NY"), benchmark.getRunData().getLocale()); + } + + private static String[] getLocaleConfig(String localeParam) { + String algLines[] = { + "# ----- properties ", + "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource", + "docs.file=" + getReuters20LinesFile(), + "content.source.log.step=3", + "content.source.forever=false", + "directory=RAMDirectory", + "# ----- alg ", + "{ \"Rounds\"", + " ResetSystemErase", + " NewLocale(" + localeParam + ")", + " CreateIndex", + " { \"AddDocs\" AddDoc > : * ", + " NewRound", + "} : 1", + }; + return algLines; + } + + /** + * Test that we can create CollationAnalyzers. 
+ */ + public void testCollator() throws Exception { + // ROOT locale + Benchmark benchmark = execBenchmark(getCollatorConfig("ROOT", "impl:jdk")); + CollationKeyAnalyzer expected = new CollationKeyAnalyzer(Collator + .getInstance(new Locale(""))); + assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar"); + + // specify just a language + benchmark = execBenchmark(getCollatorConfig("de", "impl:jdk")); + expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("de"))); + assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar"); + + // specify language + country + benchmark = execBenchmark(getCollatorConfig("en,US", "impl:jdk")); + expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("en", + "US"))); + assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar"); + + // specify language + country + variant + benchmark = execBenchmark(getCollatorConfig("no,NO,NY", "impl:jdk")); + expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("no", + "NO", "NY"))); + assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar"); + } + + private void assertEqualCollation(Analyzer a1, Analyzer a2, String text) + throws Exception { + TokenStream ts1 = a1.tokenStream("bogus", new StringReader(text)); + TokenStream ts2 = a2.tokenStream("bogus", new StringReader(text)); + ts1.reset(); + ts2.reset(); + TermAttribute termAtt1 = ts1.addAttribute(TermAttribute.class); + TermAttribute termAtt2 = ts2.addAttribute(TermAttribute.class); + assertTrue(ts1.incrementToken()); + assertTrue(ts2.incrementToken()); + assertEquals(termAtt1.term(), termAtt2.term()); + assertFalse(ts1.incrementToken()); + assertFalse(ts2.incrementToken()); + ts1.close(); + ts2.close(); + } + + private static String[] getCollatorConfig(String localeParam, + String collationParam) { + String algLines[] = { + "# ----- properties ", + "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource", + "docs.file=" + getReuters20LinesFile(), + "content.source.log.step=3", + "content.source.forever=false", + "directory=RAMDirectory", + "# ----- alg ", + "{ \"Rounds\"", + " ResetSystemErase", + " NewLocale(" + localeParam + ")", + " NewCollationAnalyzer(" + collationParam + ")", + " CreateIndex", + " { \"AddDocs\" AddDoc > : * ", + " NewRound", + "} : 1", + }; + return algLines; + } + private static String getReuters20LinesFile() { return System.getProperty("lucene.common.dir").replace('\\','/') + "/contrib/benchmark/src/test/org/apache/lucene/benchmark/reuters.first20.lines.txt";