Skip to content

Commit

Permalink
LUCENE-2181: add a benchmark for collation
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@898491 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
rmuir committed Jan 12, 2010
1 parent aba4c8b commit eaed8e6
Show file tree
Hide file tree
Showing 10 changed files with 671 additions and 4 deletions.
10 changes: 10 additions & 0 deletions contrib/benchmark/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,16 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety

$Id:$

1/11/2010
LUCENE-2181: Add a benchmark for collation. This adds NewLocaleTask,
which sets a Locale in the run data for collation to use, and can be
used in the future for benchmarking localized range queries and sorts.
Also add NewCollationAnalyzerTask, which works with both JDK and ICU
Collator implementations. Fix ReadTokensTask to not tokenize fields
unless they should be tokenized according to DocMaker config. The
easiest way to run the benchmark is to run 'ant collation'
(Steven Rowe via Robert Muir)

12/22/2009
LUCENE-2178: Allow multiple locations to add to the class path with
-Dbenchmark.ext.classpath=... when running "ant run-task" (Steven
Expand Down
59 changes: 58 additions & 1 deletion contrib/benchmark/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@
<available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
<available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
<available file="${working.dir}/enwiki.txt" property="enwiki.extracted"/>

<available file="temp/${top.100k.words.archive.filename}"
property="top.100k.words.archive.present"/>
<available file="${working.dir}/top100k-out"
property="top.100k.word.files.expanded"/>
</target>

<target name="enwiki-files" depends="check-files">
Expand Down Expand Up @@ -94,6 +97,27 @@
<untar src="temp/mini_newsgroups.tar" dest="${working.dir}"/>
</target>

<!-- Name and download location of the word-list archive used by the
collation benchmark. The file name is also referenced by the
<available> check that sets top.100k.words.archive.present. -->
<property name="top.100k.words.archive.filename"
value="top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"/>
<property name="top.100k.words.archive.base.url"
value="http://people.apache.org/~rmuir/wikipedia"/>
<!-- Downloads the archive into temp/. Skipped when
top.100k.words.archive.present is set, i.e. when the archive is
already in temp/. -->
<target name="get-top-100k-words-archive" unless="top.100k.words.archive.present">
<mkdir dir="temp"/>
<get src="${top.100k.words.archive.base.url}/${top.100k.words.archive.filename}"
dest="temp/${top.100k.words.archive.filename}"/>
</target>
<!-- Unpacks the bzip2-compressed tar archive from temp/ into
${working.dir}/top100k-out. Skipped when top.100k.word.files.expanded
is set, i.e. when that output location already exists.
NOTE(review): the directory is created before untar runs, so a failed
extraction presumably leaves the skip condition true on the next run;
confirm and delete top100k-out by hand if the files look incomplete. -->
<target name="expand-top-100k-word-files" unless="top.100k.word.files.expanded">
<mkdir dir="${working.dir}/top100k-out"/>
<untar src="temp/${top.100k.words.archive.filename}"
overwrite="true" compression="bzip2" dest="${working.dir}/top100k-out"/>
</target>

<!-- Entry point for preparing the benchmark input: after check-files has
run, ensures the working directory exists, then invokes the download
and expand targets in that order. Each of those targets carries its
own "unless" condition and is skipped when its output is already
present. -->
<target name="top-100k-wiki-word-files" depends="check-files">
<mkdir dir="${working.dir}"/>
<antcall target="get-top-100k-words-archive"/>
<antcall target="expand-top-100k-word-files"/>
</target>

<target name="get-files" depends="check-files">
<mkdir dir="temp"/>
<antcall target="get-reuters"/>
Expand Down Expand Up @@ -141,6 +165,34 @@
</java>
</target>

<!-- Inputs and outputs of the "collation" target: the algorithm file to
run, the raw benchmark output, and the JIRA-table version of that
output produced by scripts/collation.bm2jira.pl. -->
<property name="collation.alg.file" location="conf/collation.alg"/>
<property name="collation.output.file"
value="${working.dir}/collation.benchmark.output.txt"/>
<property name="collation.jira.output.file"
value="${working.dir}/collation.bm2jira.output.txt"/>

<!-- Classpath for the collation benchmark: the regular run.classpath plus
the compiled contrib/icu classes and the icu4j jar(s) from
contrib/icu/lib. -->
<path id="collation.runtime.classpath">
<path refid="run.classpath"/>
<pathelement path="${common.dir}/build/contrib/icu/classes/java"/>
<fileset dir="${common.dir}/contrib/icu/lib" includes="icu4j*.jar"/>
</path>

<!-- Runs the collation benchmark described by ${collation.alg.file} in a
     forked JVM, writes the raw report to ${collation.output.file}, then
     converts it to a JIRA-formatted table in ${collation.jira.output.file}
     using scripts/collation.bm2jira.pl (requires perl on the PATH). -->
<target name="collation" depends="compile,compile-icu,top-100k-wiki-word-files">
    <echo>Running contrib/benchmark with alg file: ${collation.alg.file}</echo>
    <!-- failonerror="true": without it a crashed or failed benchmark JVM is
         ignored, and the conversion step below would run on a truncated or
         empty report while the build still reports success. -->
    <java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark"
          maxmemory="${task.mem}" output="${collation.output.file}"
          failonerror="true">
        <classpath refid="collation.runtime.classpath"/>
        <arg file="${collation.alg.file}"/>
    </java>
    <echo>Benchmark output is in file: ${collation.output.file}</echo>
    <echo>Converting to JIRA table format...</echo>
    <exec executable="perl" output="${collation.jira.output.file}" failonerror="true">
        <arg value="scripts/collation.bm2jira.pl"/>
        <arg value="${collation.output.file}"/>
    </exec>
    <echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
</target>

<target name="compile-demo">
<subant target="compile-demo">
<fileset dir="${common.dir}" includes="build.xml"/>
Expand All @@ -151,6 +203,11 @@
<fileset dir="${common.dir}/contrib/highlighter" includes="build.xml"/>
</subant>
</target>
<!-- Builds contrib/icu by invoking the "compile" target of its build.xml;
the "collation" target depends on this so the ICU collation classes
are available on collation.runtime.classpath. -->
<target name="compile-icu">
<subant target="compile">
<fileset dir="${common.dir}/contrib/icu" includes="build.xml"/>
</subant>
</target>
<target name="compile-memory">
<subant target="compile">
<fileset dir="${common.dir}/contrib/memory" includes="build.xml"/>
Expand Down
97 changes: 97 additions & 0 deletions contrib/benchmark/conf/collation.alg
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Properties section.
# Documents come from LineDocSource, read as UTF-8. Only the body field is
# marked as tokenized. docs.file is the initial input; every section of the
# algorithm below overrides it with SetProp before reading.
content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource
content.source.encoding=UTF-8
doc.tokenized=false
doc.body.tokenized=true
docs.file=work/top100k-out/top.fr.wikipedia.words.txt
content.source.forever=false
log.step=100000

# Algorithm section.
# Twelve named measurements, one per (analyzer, language) pair. Every one
# follows the same pattern: choose the analyzer, point docs.file at that
# language's word list, reset the inputs, then run ReadTokens over the list.
# NOTE(review): the trailing ": 10" and ": 5" are presumably repetition
# counts (10 passes per measurement, 5 rounds overall) - confirm against
# the benchmark byTask documentation.
{ "Rounds"
# Baseline: KeywordAnalyzer, reported under the *Keyword names.
-NewAnalyzer(KeywordAnalyzer)
-SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
ResetInputs
{ "FrenchKeyword" { ReadTokens > : * ResetInputs } : 10

-NewAnalyzer(KeywordAnalyzer)
-SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
ResetInputs
{ "GermanKeyword" { ReadTokens > : * ResetInputs } : 10

-NewAnalyzer(KeywordAnalyzer)
-SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
ResetInputs
{ "UkrainianKeyword" { ReadTokens > : * ResetInputs } : 10

-NewAnalyzer(KeywordAnalyzer)
-SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
ResetInputs
{ "EnglishKeyword" { ReadTokens > : * ResetInputs } : 10

# NewCollationAnalyzer with no impl parameter, after setting the locale;
# reported under the *JDK names.
-NewLocale(fr)
-NewCollationAnalyzer
-SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
ResetInputs
{ "FrenchJDK" { ReadTokens > : * ResetInputs } : 10

-NewLocale(de)
-NewCollationAnalyzer
-SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
ResetInputs
{ "GermanJDK" { ReadTokens > : * ResetInputs } : 10

-NewLocale(uk)
-NewCollationAnalyzer
-SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
ResetInputs
{ "UkrainianJDK" { ReadTokens > : * ResetInputs } : 10

-NewLocale(en)
-NewCollationAnalyzer
-SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
ResetInputs
{ "EnglishJDK" { ReadTokens > : * ResetInputs } : 10

# NewCollationAnalyzer(impl:icu), after setting the locale; reported under
# the *ICU names.
-NewLocale(fr)
-NewCollationAnalyzer(impl:icu)
-SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
ResetInputs
{ "FrenchICU" { ReadTokens > : * ResetInputs } : 10

-NewLocale(de)
-NewCollationAnalyzer(impl:icu)
-SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
ResetInputs
{ "GermanICU" { ReadTokens > : * ResetInputs } : 10

-NewLocale(uk)
-NewCollationAnalyzer(impl:icu)
-SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
ResetInputs
{ "UkrainianICU" { ReadTokens > : * ResetInputs } : 10

-NewLocale(en)
-NewCollationAnalyzer(impl:icu)
-SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
ResetInputs
{ "EnglishICU" { ReadTokens > : * ResetInputs } : 10

NewRound

} : 5

# Final report; scripts/collation.bm2jira.pl parses its output lines.
RepSumByNameRound
63 changes: 63 additions & 0 deletions contrib/benchmark/scripts/collation.bm2jira.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/perl
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ----------
# collation.bm2jira.pl
#
# Converts Lucene contrib-benchmark output produced using the
# conf/collation.alg file into a JIRA-formatted table.
#

use strict;
use warnings;
use POSIX qw(floor);

# Fastest elapsed time seen so far, keyed as $min_elapsed{<analyzer>}{<language>}
# where <analyzer> is one of Keyword, JDK or ICU.
my %min_elapsed = ();

while (<>) {
  # Only report lines whose task name looks like
  # <Language><Keyword|JDK|ICU>_<digits> are of interest; $3 is the rest of
  # the line (the statistics columns).
  next unless (/(\S+)(Keyword|JDK|ICU)_\d+\s*([^\s{].*)/);
  my ($lang, $analyzer, $stats) = ($1, $2, $3);

  # Skip four numeric columns and capture the fifth, which this script uses
  # as the elapsed time in seconds.
  my ($elapsed) = $stats =~ /(?:[\d,.]+[-\s]*){4}([.\d]+)/;

  # A line with a matching task name but unparseable statistics must not be
  # recorded: an undefined value would otherwise overwrite nothing useful and
  # trigger "uninitialized" warnings in the comparison below.
  next unless defined($elapsed);

  my $best = $min_elapsed{$analyzer}{$lang};
  $min_elapsed{$analyzer}{$lang} = $elapsed
    unless (defined($best) && $elapsed >= $best);
}

# Print out platform info
print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
# Match the Windows platforms by exact name. A loose /win/i test also matches
# "darwin" (macOS), which then dies below because Win32 cannot be loaded there.
if ($^O =~ /^(?:MSWin32|cygwin)$/) {
  print "$^O\n";
  eval {
    require Win32;
    print Win32::GetOSName(), "\n", Win32::GetOSVersion(), "\n";
  };
  die "Error loading Win32: $@" if ($@);
} else {
  print `uname -a 2>&1`;
}

print "\n||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||\n";

for my $lang (sort keys %{$min_elapsed{ICU}}) {
  my $ICU = $min_elapsed{ICU}{$lang};
  my $JDK = $min_elapsed{JDK}{$lang};
  my $keyword = $min_elapsed{Keyword}{$lang};

  # All three timings are needed for a table row.
  unless (defined($JDK) && defined($keyword)) {
    warn "Skipping '$lang': missing JDK and/or KeywordAnalyzer timings\n";
    next;
  }

  # Improvement is measured on the time spent above the KeywordAnalyzer
  # baseline; it is undefined when ICU shows no measurable time above it.
  my $icu_net = $ICU - $keyword;
  my $improved = ($icu_net == 0)
    ? 'N/A'
    # floor(x + 0.5) rounds to nearest for negative values too, unlike int().
    : floor(100 * ($JDK - $ICU) / $icu_net + 0.5) . '%';

  # Pass values as arguments so a '%' in the data cannot corrupt the format.
  printf "|%s|%ss|%ss|%ss|%s|\n", $lang, $JDK, $ICU, $keyword, $improved;
}
91 changes: 91 additions & 0 deletions contrib/benchmark/scripts/compare.collation.benchmark.tables.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/perl
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ------------------------------------------
# compare.collation.benchmark.tables.pl
#
# Takes as cmdline parameters two JIRA-formatted benchmark results, as produced
# by collation.bm2jira.pl (located in the same directory as this script), and outputs a
# third JIRA-formatted comparison table, showing the differences between two
# benchmarking runs' java.text and ICU4J columns, after accounting for the
# KeywordAnalyzer column; the "ICU4J Improvement" column is ignored.
#
# The difference is calculated as a percentage:
#
# 100 * (patched-rate - unpatched-rate) / unpatched-rate
#
# where the (un)patched-rate is:
#
# 1 / ( elapsed-(un)patched-time - elapsed-KeywordAnalyzer-time)
#

use strict;
use warnings;

my $usage = "Usage: $0 <unpatched-file> <patched-file>\n";

die $usage unless (@ARGV == 2 && -f $ARGV[0] && -f $ARGV[1]);

# Reads one JIRA-formatted results table and returns a hash reference of
#   { <language> => { jdk => ..., icu => ..., keyword_analyzer => ... } }
# holding the elapsed seconds from the first three timing columns.
# Dies if the file cannot be opened.
sub read_table {
  my ($path) = @_;
  my %table;

  # Three-argument open with low-precedence "or". The previous form,
  #   open FH, "<$file" || die ...
  # could never die: "||" bound to the always-true filename string instead
  # of to the return value of open.
  open(my $fh, '<', $path) or die "ERROR opening '$path': $!";
  while (<$fh>) {
    # ||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||
    # |English|4.51s|2.47s|1.47s|204%|
    next unless (/^\|([^|]+)\|([^|s]+)s\|([^|s]+)s\|([^|s]+)s/);
    $table{$1} = { jdk => $2, icu => $3, keyword_analyzer => $4 };
  }
  close $fh;

  return \%table;
}

# Returns the percentage change in processing rate between two net elapsed
# times (elapsed time minus the KeywordAnalyzer baseline), formatted for the
# table, or 'N/A' when either net time is zero and the rate is undefined.
#
#   100 * (1/patched - 1/unpatched) / (1/unpatched)
#     == 100 * (unpatched / patched - 1)
sub rate_improvement {
  my ($unpatched_net, $patched_net) = @_;
  return 'N/A' if ($unpatched_net == 0 || $patched_net == 0);
  # Let sprintf do the rounding to one decimal place. The previous
  # int(x * 1000 + 5) / 10 added 0.5 percentage points to every result.
  return sprintf("%2.1f%%", 100 * ($unpatched_net / $patched_net - 1));
}

my %stats = (
  unpatched => read_table($ARGV[0]),
  patched   => read_table($ARGV[1]),
);

print "||Language||java.text improvement||ICU4J improvement||\n";
for my $lang (sort keys %{$stats{unpatched}}) {
  my $before = $stats{unpatched}{$lang};
  my $after  = $stats{patched}{$lang};

  # A comparison needs the language in both tables.
  unless (defined($after)) {
    warn "Skipping '$lang': not present in patched results\n";
    next;
  }

  my $jdk_impr = rate_improvement(
    $before->{jdk} - $before->{keyword_analyzer},
    $after->{jdk}  - $after->{keyword_analyzer});
  my $icu_impr = rate_improvement(
    $before->{icu} - $before->{keyword_analyzer},
    $after->{icu}  - $after->{keyword_analyzer});

  printf "|%s|%s|%s|\n", $lang, $jdk_impr, $icu_impr;
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
Expand Down Expand Up @@ -61,6 +62,7 @@ public class PerfRunData {
private Directory directory;
private Analyzer analyzer;
private DocMaker docMaker;
private Locale locale;

// we use separate (identical) instances for each "read" task type, so each can iterate the queries separately.
private HashMap<Class<? extends ReadTask>,QueryMaker> readTaskQueryMaker;
Expand Down Expand Up @@ -244,6 +246,20 @@ public DocMaker getDocMaker() {
return docMaker;
}

/**
 * Returns the locale currently stored in this run data, as last passed to
 * {@link #setLocale(Locale)}.
 * NOTE(review): no initializer for the field is visible in this part of the
 * file, so this presumably returns null until a locale has been set; confirm
 * against the constructor.
 * @return the locale
 */
public Locale getLocale() {
return locale;
}

/**
 * Stores the locale in this run data, replacing any previously stored value.
 * The value is kept as given; no validation is performed here.
 * @param locale the locale to set
 */
public void setLocale(Locale locale) {
this.locale = locale;
}

/**
* @return Returns the config.
*/
Expand Down
Loading

0 comments on commit eaed8e6

Please sign in to comment.