From eaed8e67d2a6ac601a1cdf5cfcfc647e41740936 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Tue, 12 Jan 2010 20:06:17 +0000
Subject: [PATCH] LUCENE-2181: add a benchmark for collation
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@898491 13f79535-47bb-0310-9956-ffa450edef68
---
contrib/benchmark/CHANGES.txt | 10 ++
contrib/benchmark/build.xml | 59 +++++++-
contrib/benchmark/conf/collation.alg | 97 +++++++++++++
.../benchmark/scripts/collation.bm2jira.pl | 63 +++++++++
.../compare.collation.benchmark.tables.pl | 91 ++++++++++++
.../lucene/benchmark/byTask/PerfRunData.java | 16 +++
.../tasks/NewCollationAnalyzerTask.java | 117 ++++++++++++++++
.../benchmark/byTask/tasks/NewLocaleTask.java | 89 ++++++++++++
.../byTask/tasks/ReadTokensTask.java | 2 +
.../benchmark/byTask/TestPerfTasksLogic.java | 131 +++++++++++++++++-
10 files changed, 671 insertions(+), 4 deletions(-)
create mode 100644 contrib/benchmark/conf/collation.alg
create mode 100644 contrib/benchmark/scripts/collation.bm2jira.pl
create mode 100644 contrib/benchmark/scripts/compare.collation.benchmark.tables.pl
create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewCollationAnalyzerTask.java
create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewLocaleTask.java
diff --git a/contrib/benchmark/CHANGES.txt b/contrib/benchmark/CHANGES.txt
index 7ab24b170520..16a244c0176e 100644
--- a/contrib/benchmark/CHANGES.txt
+++ b/contrib/benchmark/CHANGES.txt
@@ -4,6 +4,16 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$
+1/11/2010
+ LUCENE-2181: Add a benchmark for collation. This adds NewLocaleTask,
+ which sets a Locale in the run data for collation to use, and can be
+ used in the future for benchmarking localized range queries and sorts.
+ Also add NewCollationAnalyzerTask, which works with both JDK and ICU
+ Collator implementations. Fix ReadTokensTask to not tokenize fields
+ unless they should be tokenized according to DocMaker config. The
+ easiest way to run the benchmark is to run 'ant collation'.
+ (Steven Rowe via Robert Muir)
+
12/22/2009
LUCENE-2178: Allow multiple locations to add to the class path with
-Dbenchmark.ext.classpath=... when running "ant run-task" (Steven
diff --git a/contrib/benchmark/build.xml b/contrib/benchmark/build.xml
index 6c283afa4f03..12939955a055 100644
--- a/contrib/benchmark/build.xml
+++ b/contrib/benchmark/build.xml
@@ -24,7 +24,10 @@
@@ -94,6 +97,27 @@
@@ -141,6 +165,34 @@
+        Running contrib/benchmark with alg file: ${collation.alg.file}
+        Benchmark output is in file: ${collation.output.file}
+        Converting to JIRA table format...
+        Benchmark output in JIRA table format is in file: ${collation.jira.output.file}
@@ -151,6 +203,11 @@
diff --git a/contrib/benchmark/conf/collation.alg b/contrib/benchmark/conf/collation.alg
new file mode 100644
index 000000000000..64fe6f32bca3
--- /dev/null
+++ b/contrib/benchmark/conf/collation.alg
@@ -0,0 +1,97 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource
+content.source.encoding=UTF-8
+doc.tokenized=false
+doc.body.tokenized=true
+docs.file=work/top100k-out/top.fr.wikipedia.words.txt
+content.source.forever=false
+log.step=100000
+
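+# Each named sequence below reads every token of a per-language top-100k-words
+# file, 10 times per round: first with KeywordAnalyzer as a baseline, then with
+# the JDK CollationKeyAnalyzer, then with the ICU implementation. The whole set
+# is repeated for 5 rounds.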
+{ "Rounds"
+ -NewAnalyzer(KeywordAnalyzer)
+ -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
+ ResetInputs
+ { "FrenchKeyword" { ReadTokens > : * ResetInputs } : 10
+
+ -NewAnalyzer(KeywordAnalyzer)
+ -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
+ ResetInputs
+ { "GermanKeyword" { ReadTokens > : * ResetInputs } : 10
+
+ -NewAnalyzer(KeywordAnalyzer)
+ -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
+ ResetInputs
+ { "UkrainianKeyword" { ReadTokens > : * ResetInputs } : 10
+
+ -NewAnalyzer(KeywordAnalyzer)
+ -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
+ ResetInputs
+ { "EnglishKeyword" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(fr)
+ -NewCollationAnalyzer
+ -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
+ ResetInputs
+ { "FrenchJDK" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(de)
+ -NewCollationAnalyzer
+ -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
+ ResetInputs
+ { "GermanJDK" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(uk)
+ -NewCollationAnalyzer
+ -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
+ ResetInputs
+ { "UkrainianJDK" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(en)
+ -NewCollationAnalyzer
+ -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
+ ResetInputs
+ { "EnglishJDK" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(fr)
+ -NewCollationAnalyzer(impl:icu)
+ -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
+ ResetInputs
+ { "FrenchICU" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(de)
+ -NewCollationAnalyzer(impl:icu)
+ -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
+ ResetInputs
+ { "GermanICU" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(uk)
+ -NewCollationAnalyzer(impl:icu)
+ -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
+ ResetInputs
+ { "UkrainianICU" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(en)
+ -NewCollationAnalyzer(impl:icu)
+ -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
+ ResetInputs
+ { "EnglishICU" { ReadTokens > : * ResetInputs } : 10
+
+ NewRound
+
+} : 5
+
+RepSumByNameRound
diff --git a/contrib/benchmark/scripts/collation.bm2jira.pl b/contrib/benchmark/scripts/collation.bm2jira.pl
new file mode 100644
index 000000000000..b423f75ee8a8
--- /dev/null
+++ b/contrib/benchmark/scripts/collation.bm2jira.pl
@@ -0,0 +1,63 @@
+#!/usr/bin/perl
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# --------------------
+# collation.bm2jira.pl
+#
+# Converts Lucene contrib-benchmark output produced using the
+# conf/collation.alg file into a JIRA-formatted table.
+#
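+# Typical usage (file names are examples only):
+#
+#   perl collation.bm2jira.pl collation.benchmark.output.txt > collation.jira.txt
+#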
+
+use strict;
+use warnings;
+
+my %min_elapsed = ();
+
+while (<>) {
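+  # Report lines are expected to begin with a per-round task name such as
+  # "FrenchICU_3" (example only), followed by numeric columns; the 5th
+  # numeric field after the name is taken as the elapsed time in seconds.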
+ if (/(\S+)(Keyword|JDK|ICU)_\d+\s*([^\s{].*)/) {
+ my $lang = $1;
+ my $analyzer = $2;
+ my $stats = $3;
+ my ($elapsed) = $stats =~ /(?:[\d,.]+[-\s]*){4}([.\d]+)/;
+ $min_elapsed{$analyzer}{$lang} = $elapsed
+ unless (defined($min_elapsed{$analyzer}{$lang})
+ && $elapsed >= $min_elapsed{$analyzer}{$lang});
+ }
+}
+
+# Print out platform info
+print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
+if ($^O =~ /mswin/i) {
+ print "$^O\n";
+ eval {
+ require Win32;
+ print Win32::GetOSName(), "\n", Win32::GetOSVersion(), "\n";
+ };
+ die "Error loading Win32: $@" if ($@);
+} else {
+ print `uname -a 2>&1`;
+}
+
+print "\n||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||\n";
+
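+# "ICU4J Improvement" is computed over the collation-only cost (elapsed time
+# minus the KeywordAnalyzer baseline): 100 * (JDK - ICU) / (ICU - keyword),
+# rounded to the nearest whole percent.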
+for my $lang (sort keys %{$min_elapsed{ICU}}) {
+ my $ICU = $min_elapsed{ICU}{$lang};
+ my $JDK = $min_elapsed{JDK}{$lang};
+ my $keyword = $min_elapsed{Keyword}{$lang};
+ my $improved = int(100 * ($JDK - $ICU) / ($ICU - $keyword) + 0.5);
+ printf "|$lang|${JDK}s|${ICU}s|${keyword}s|\%d%%|\n", $improved;
+}
diff --git a/contrib/benchmark/scripts/compare.collation.benchmark.tables.pl b/contrib/benchmark/scripts/compare.collation.benchmark.tables.pl
new file mode 100644
index 000000000000..bd941761b268
--- /dev/null
+++ b/contrib/benchmark/scripts/compare.collation.benchmark.tables.pl
@@ -0,0 +1,91 @@
+#!/usr/bin/perl
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# -------------------------------------
+# compare.collation.benchmark.tables.pl
+#
+# Takes as cmdline parameters two JIRA-formatted benchmark results, as produced
+# by bm2jira.pl (located in the same directory as this script), and outputs a
+# third JIRA-formatted comparison table, showing the differences between two
+# benchmarking runs' java.text and ICU4J columns, after accounting for the
+# KeywordAnalyzer column; the "ICU4J Improvement" column is ignored.
+#
+# The difference is calculated as a percentage:
+#
+#   100 * (patched-rate - unpatched-rate) / unpatched-rate
+#
+# where the (un)patched-rate is:
+#
+# 1 / ( elapsed-(un)patched-time - elapsed-KeywordAnalyzer-time)
+#
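+# Worked example (the 4.01s patched time is hypothetical): with unpatched
+# java.text at 4.51s, patched at 4.01s, and KeywordAnalyzer at 1.47s in both
+# runs, the rates are 1/(4.51-1.47) = 0.329 and 1/(4.01-1.47) = 0.394, so the
+# improvement is 100 * (0.394 - 0.329) / 0.329 = ~19.7%.
+#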
+
+use strict;
+use warnings;
+
+my $usage = "Usage: $0 <unpatched-results-file> <patched-results-file>\n";
+
+die $usage unless ($#ARGV == 1 && -f $ARGV[0] && -f $ARGV[1]);
+
+my %stats = ();
+
+open UNPATCHED, "<$ARGV[0]" or die "ERROR opening '$ARGV[0]': $!";
+while (<UNPATCHED>) {
+ # ||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||
+ # |English|4.51s|2.47s|1.47s|204%|
+ next unless (/^\|([^|]+)\|([^|s]+)s\|([^|s]+)s\|([^|s]+)s/);
+ my ($lang, $jdk_elapsed, $icu_elapsed, $keyword_analyzer_elapsed)
+ = ($1, $2, $3, $4);
+ $stats{unpatched}{$lang}{jdk} = $jdk_elapsed;
+ $stats{unpatched}{$lang}{icu} = $icu_elapsed;
+ $stats{unpatched}{$lang}{keyword_analyzer} = $keyword_analyzer_elapsed;
+}
+close UNPATCHED;
+
+open PATCHED, "<$ARGV[1]" or die "ERROR opening '$ARGV[1]': $!";
+while (<PATCHED>) {
+ # ||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||
+ # |English|4.51s|2.47s|1.47s|204%|
+ next unless (/^\|([^|]+)\|([^|s]+)s\|([^|s]+)s\|([^|s]+)s/);
+ my ($lang, $jdk_elapsed, $icu_elapsed, $keyword_analyzer_elapsed)
+ = ($1, $2, $3, $4);
+ $stats{patched}{$lang}{jdk} = $jdk_elapsed;
+ $stats{patched}{$lang}{icu} = $icu_elapsed;
+ $stats{patched}{$lang}{keyword_analyzer} = $keyword_analyzer_elapsed;
+}
+close PATCHED;
+
+print "||Language||java.text improvement||ICU4J improvement||\n";
+for my $lang (sort keys %{$stats{unpatched}}) {
+ my $keyword_analyzer1 = $stats{unpatched}{$lang}{keyword_analyzer};
+ my $jdk1 = $stats{unpatched}{$lang}{jdk};
+ my $jdk_diff1 = $jdk1 - $keyword_analyzer1;
+ my $icu1 = $stats{unpatched}{$lang}{icu};
+ my $icu_diff1 = $icu1 - $keyword_analyzer1;
+
+ my $keyword_analyzer2 = $stats{patched}{$lang}{keyword_analyzer};
+ my $jdk2 = $stats{patched}{$lang}{jdk};
+ my $jdk_diff2 = $jdk2 - $keyword_analyzer2;
+ my $icu2 = $stats{patched}{$lang}{icu};
+ my $icu_diff2 = $icu2 - $keyword_analyzer2;
+
+  my $jdk_impr
+    = int((1./$jdk_diff2 - 1./$jdk_diff1) / (1./$jdk_diff1) * 1000 + 0.5) / 10;
+  my $icu_impr
+    = int((1./$icu_diff2 - 1./$icu_diff1) / (1./$icu_diff1) * 1000 + 0.5) / 10;
+
+ printf "|$lang|%2.1f%%|%2.1f%%|\n", $jdk_impr, $icu_impr;
+}
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
index fa301cc25d88..64f5731f666c 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
@@ -20,6 +20,7 @@
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
+import java.util.Locale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
@@ -61,6 +62,7 @@ public class PerfRunData {
private Directory directory;
private Analyzer analyzer;
private DocMaker docMaker;
+ private Locale locale;
// we use separate (identical) instances for each "read" task type, so each can iterate the queries separately.
private HashMap<Class<? extends ReadTask>,QueryMaker> readTaskQueryMaker;
@@ -244,6 +246,20 @@ public DocMaker getDocMaker() {
return docMaker;
}
+ /**
+ * @return the locale
+ */
+ public Locale getLocale() {
+ return locale;
+ }
+
+ /**
+ * @param locale the locale to set
+ */
+ public void setLocale(Locale locale) {
+ this.locale = locale;
+ }
+
/**
* @return Returns the config.
*/
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewCollationAnalyzerTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewCollationAnalyzerTask.java
new file mode 100644
index 000000000000..2dd29ece7227
--- /dev/null
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewCollationAnalyzerTask.java
@@ -0,0 +1,117 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Method;
+import java.util.Locale;
+import java.util.StringTokenizer;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+
+/**
+ * Task to support benchmarking collation.
+ * <br>
+ * <ul>
+ *  <li><code>NewCollationAnalyzer</code> with the default jdk impl
+ *  <li><code>NewCollationAnalyzer(impl:icu)</code> specify an impl (jdk,icu)
+ * </ul>
+ */
+public class NewCollationAnalyzerTask extends PerfTask {
+ public enum Implementation {
+ JDK("org.apache.lucene.collation.CollationKeyAnalyzer",
+ "java.text.Collator"),
+ ICU("org.apache.lucene.collation.ICUCollationKeyAnalyzer",
+ "com.ibm.icu.text.Collator");
+
+ String className;
+ String collatorClassName;
+
+ Implementation(String className, String collatorClassName) {
+ this.className = className;
+ this.collatorClassName = collatorClassName;
+ }
+ }
+
+ private Implementation impl = Implementation.JDK;
+
+ public NewCollationAnalyzerTask(PerfRunData runData) {
+ super(runData);
+ }
+
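+  /*
+   * Both the collator and the analyzer are created reflectively:
+   * Collator.getInstance(Locale) is invoked on the impl's collator class,
+   * and the result is passed to the analyzer's single-Collator constructor,
+   * which avoids a compile-time dependency on the ICU4J classes.
+   */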
+ static Analyzer createAnalyzer(Locale locale, Implementation impl)
+ throws Exception {
+    final Class<?> collatorClazz = Class.forName(impl.collatorClassName);
+ Method collatorMethod = collatorClazz.getMethod("getInstance",
+ new Class[] {Locale.class});
+ Object collator = collatorMethod.invoke(null, locale);
+
+    final Class<? extends Analyzer> clazz = Class.forName(impl.className)
+      .asSubclass(Analyzer.class);
+    Constructor<? extends Analyzer> ctor = clazz.getConstructor(collatorClazz);
+ return ctor.newInstance(collator);
+ }
+
+ @Override
+ public int doLogic() throws Exception {
+ try {
+ Locale locale = getRunData().getLocale();
+ if (locale == null) throw new RuntimeException(
+ "Locale must be set with the NewLocale task!");
+ Analyzer analyzer = createAnalyzer(locale, impl);
+ getRunData().setAnalyzer(analyzer);
+ System.out.println("Changed Analyzer to: "
+ + analyzer.getClass().getName() + "(" + locale + ")");
+ } catch (Exception e) {
+ throw new RuntimeException("Error creating Analyzer: impl=" + impl, e);
+ }
+ return 1;
+ }
+
+ @Override
+ public void setParams(String params) {
+ super.setParams(params);
+
+ StringTokenizer st = new StringTokenizer(params, ",");
+ while (st.hasMoreTokens()) {
+ String param = st.nextToken();
+ StringTokenizer expr = new StringTokenizer(param, ":");
+ String key = expr.nextToken();
+ String value = expr.nextToken();
+ // for now we only support the "impl" parameter.
+ // TODO: add strength, decomposition, etc
+ if (key.equals("impl")) {
+ if (value.equalsIgnoreCase("icu"))
+ impl = Implementation.ICU;
+ else if (value.equalsIgnoreCase("jdk"))
+ impl = Implementation.JDK;
+ else
+ throw new RuntimeException("Unknown parameter " + param);
+ } else {
+ throw new RuntimeException("Unknown parameter " + param);
+ }
+ }
+ }
+
+ @Override
+ public boolean supportsParams() {
+ return true;
+ }
+}
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewLocaleTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewLocaleTask.java
new file mode 100644
index 000000000000..196af26c7225
--- /dev/null
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewLocaleTask.java
@@ -0,0 +1,89 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Locale;
+import java.util.StringTokenizer;
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+
+/**
+ * Set a {@link java.util.Locale} for use in benchmarking.
+ * <br>
+ * Locales can be specified in the following ways:
+ * <ul>
+ *  <li><code>de</code>: Language "de"
+ *  <li><code>en,US</code>: Language "en", country "US"
+ *  <li><code>no,NO,NY</code>: Language "no", country "NO", variant "NY"
+ *  <li><code>ROOT</code>: The root (language-agnostic) Locale
+ *  <li>&lt;empty string&gt;: Erase the Locale (null)
+ * </ul>
+ */
+public class NewLocaleTask extends PerfTask {
+ private String language;
+ private String country;
+ private String variant;
+
+ /**
+   * Create a new {@link java.util.Locale} and set it in the getRunData() for
+ * use by all future tasks.
+ */
+ public NewLocaleTask(PerfRunData runData) {
+ super(runData);
+ }
+
+ static Locale createLocale(String language, String country, String variant) {
+ if (language == null || language.length() == 0)
+ return null;
+
+ String lang = language;
+ if (lang.equalsIgnoreCase("ROOT"))
+ lang = ""; // empty language is the root locale in the JDK
+
+ return new Locale(lang, country, variant);
+ }
+
+ @Override
+ public int doLogic() throws Exception {
+ Locale locale = createLocale(language, country, variant);
+ getRunData().setLocale(locale);
+ System.out.println("Changed Locale to: " +
+ (locale == null ? "null" :
+ (locale.getDisplayName().length() == 0) ? "root locale" : locale));
+ return 1;
+ }
+
+ @Override
+ public void setParams(String params) {
+ super.setParams(params);
+ language = country = variant = "";
+ StringTokenizer st = new StringTokenizer(params, ",");
+ if (st.hasMoreTokens())
+ language = st.nextToken();
+ if (st.hasMoreTokens())
+ country = st.nextToken();
+ if (st.hasMoreTokens())
+ variant = st.nextToken();
+ }
+
+ @Override
+ public boolean supportsParams() {
+ return true;
+ }
+}
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
index f47d304b6586..5aed1eabae1e 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
@@ -67,6 +67,8 @@ public int doLogic() throws Exception {
Analyzer analyzer = getRunData().getAnalyzer();
int tokenCount = 0;
for(final Fieldable field : fields) {
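+      // only analyze fields that the DocMaker config marks as tokenized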
+ if (!field.isTokenized()) continue;
+
final TokenStream stream;
final TokenStream streamValue = field.tokenStreamValue();
diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
index 9bff7a120f62..3611b294c654 100755
--- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
+++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
@@ -21,15 +21,23 @@
import java.io.File;
import java.io.FileReader;
import java.io.BufferedReader;
+import java.text.Collator;
import java.util.List;
import java.util.Iterator;
+import java.util.Locale;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
+import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.SerialMergeScheduler;
@@ -464,9 +472,13 @@ public void testReadTokens() throws Exception {
TermDocs termDocs = reader.termDocs();
int totalTokenCount2 = 0;
while(terms.next()) {
- termDocs.seek(terms.term());
- while(termDocs.next())
- totalTokenCount2 += termDocs.freq();
+ Term term = terms.term();
+      /* skip the id field: it is indexed but not tokenized */
+      if (term != null && term.field() != DocMaker.ID_FIELD) {
+ termDocs.seek(terms.term());
+ while (termDocs.next())
+ totalTokenCount2 += termDocs.freq();
+ }
}
reader.close();
@@ -850,6 +862,119 @@ private static String[] disableCountingLines (boolean disable) {
};
}
+ /**
+ * Test that we can change the Locale in the runData,
+ * that it is parsed as we expect.
+ */
+ public void testLocale() throws Exception {
+ // empty Locale: clear it (null)
+ Benchmark benchmark = execBenchmark(getLocaleConfig(""));
+ assertNull(benchmark.getRunData().getLocale());
+
+ // ROOT locale
+ benchmark = execBenchmark(getLocaleConfig("ROOT"));
+ assertEquals(new Locale(""), benchmark.getRunData().getLocale());
+
+ // specify just a language
+ benchmark = execBenchmark(getLocaleConfig("de"));
+ assertEquals(new Locale("de"), benchmark.getRunData().getLocale());
+
+ // specify language + country
+ benchmark = execBenchmark(getLocaleConfig("en,US"));
+ assertEquals(new Locale("en", "US"), benchmark.getRunData().getLocale());
+
+ // specify language + country + variant
+ benchmark = execBenchmark(getLocaleConfig("no,NO,NY"));
+ assertEquals(new Locale("no", "NO", "NY"), benchmark.getRunData().getLocale());
+ }
+
+ private static String[] getLocaleConfig(String localeParam) {
+ String algLines[] = {
+ "# ----- properties ",
+ "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
+ "docs.file=" + getReuters20LinesFile(),
+ "content.source.log.step=3",
+ "content.source.forever=false",
+ "directory=RAMDirectory",
+ "# ----- alg ",
+ "{ \"Rounds\"",
+ " ResetSystemErase",
+ " NewLocale(" + localeParam + ")",
+ " CreateIndex",
+ " { \"AddDocs\" AddDoc > : * ",
+ " NewRound",
+ "} : 1",
+ };
+ return algLines;
+ }
+
+ /**
+ * Test that we can create CollationAnalyzers.
+ */
+ public void testCollator() throws Exception {
+ // ROOT locale
+ Benchmark benchmark = execBenchmark(getCollatorConfig("ROOT", "impl:jdk"));
+ CollationKeyAnalyzer expected = new CollationKeyAnalyzer(Collator
+ .getInstance(new Locale("")));
+ assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
+
+ // specify just a language
+ benchmark = execBenchmark(getCollatorConfig("de", "impl:jdk"));
+ expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("de")));
+ assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
+
+ // specify language + country
+ benchmark = execBenchmark(getCollatorConfig("en,US", "impl:jdk"));
+ expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("en",
+ "US")));
+ assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
+
+ // specify language + country + variant
+ benchmark = execBenchmark(getCollatorConfig("no,NO,NY", "impl:jdk"));
+ expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("no",
+ "NO", "NY")));
+ assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
+ }
+
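+  /**
+   * Asserts that both analyzers produce a single, identical (collation key)
+   * term for the given text.
+   */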
+ private void assertEqualCollation(Analyzer a1, Analyzer a2, String text)
+ throws Exception {
+ TokenStream ts1 = a1.tokenStream("bogus", new StringReader(text));
+ TokenStream ts2 = a2.tokenStream("bogus", new StringReader(text));
+ ts1.reset();
+ ts2.reset();
+ TermAttribute termAtt1 = ts1.addAttribute(TermAttribute.class);
+ TermAttribute termAtt2 = ts2.addAttribute(TermAttribute.class);
+ assertTrue(ts1.incrementToken());
+ assertTrue(ts2.incrementToken());
+ assertEquals(termAtt1.term(), termAtt2.term());
+ assertFalse(ts1.incrementToken());
+ assertFalse(ts2.incrementToken());
+ ts1.close();
+ ts2.close();
+ }
+
+ private static String[] getCollatorConfig(String localeParam,
+ String collationParam) {
+ String algLines[] = {
+ "# ----- properties ",
+ "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
+ "docs.file=" + getReuters20LinesFile(),
+ "content.source.log.step=3",
+ "content.source.forever=false",
+ "directory=RAMDirectory",
+ "# ----- alg ",
+ "{ \"Rounds\"",
+ " ResetSystemErase",
+ " NewLocale(" + localeParam + ")",
+ " NewCollationAnalyzer(" + collationParam + ")",
+ " CreateIndex",
+ " { \"AddDocs\" AddDoc > : * ",
+ " NewRound",
+ "} : 1",
+ };
+ return algLines;
+ }
+
private static String getReuters20LinesFile() {
return System.getProperty("lucene.common.dir").replace('\\','/') +
"/contrib/benchmark/src/test/org/apache/lucene/benchmark/reuters.first20.lines.txt";