LUCENE-2181: add a benchmark for collation

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@898491 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-01-12 20:06:17 +00:00
parent aba4c8bed4
commit eaed8e67d2
10 changed files with 671 additions and 4 deletions

View File

@ -4,6 +4,16 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$
1/11/2010
LUCENE-2181: Add a benchmark for collation. This adds NewLocaleTask,
which sets a Locale in the run data for collation to use, and can be
used in the future for benchmarking localized range queries and sorts.
Also add NewCollationAnalyzerTask, which works with both JDK and ICU
Collator implementations. Fix ReadTokensTask to not tokenize fields
unless they should be tokenized according to DocMaker config. The
easiest way to run the benchmark is to run 'ant collation'
(Steven Rowe via Robert Muir)
12/22/2009
LUCENE-2178: Allow multiple locations to add to the class path with
-Dbenchmark.ext.classpath=... when running "ant run-task" (Steven

View File

@ -24,7 +24,10 @@
<available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
<available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
<available file="${working.dir}/enwiki.txt" property="enwiki.extracted"/>
<available file="temp/${top.100k.words.archive.filename}"
property="top.100k.words.archive.present"/>
<available file="${working.dir}/top100k-out"
property="top.100k.word.files.expanded"/>
</target>
<target name="enwiki-files" depends="check-files">
@ -94,6 +97,27 @@
<untar src="temp/mini_newsgroups.tar" dest="${working.dir}"/>
</target>
<property name="top.100k.words.archive.filename"
value="top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"/>
<property name="top.100k.words.archive.base.url"
value="http://people.apache.org/~rmuir/wikipedia"/>
<target name="get-top-100k-words-archive" unless="top.100k.words.archive.present">
<mkdir dir="temp"/>
<get src="${top.100k.words.archive.base.url}/${top.100k.words.archive.filename}"
dest="temp/${top.100k.words.archive.filename}"/>
</target>
<target name="expand-top-100k-word-files" unless="top.100k.word.files.expanded">
<mkdir dir="${working.dir}/top100k-out"/>
<untar src="temp/${top.100k.words.archive.filename}"
overwrite="true" compression="bzip2" dest="${working.dir}/top100k-out"/>
</target>
<target name="top-100k-wiki-word-files" depends="check-files">
<mkdir dir="${working.dir}"/>
<antcall target="get-top-100k-words-archive"/>
<antcall target="expand-top-100k-word-files"/>
</target>
<target name="get-files" depends="check-files">
<mkdir dir="temp"/>
<antcall target="get-reuters"/>
@ -141,6 +165,34 @@
</java>
</target>
<property name="collation.alg.file" location="conf/collation.alg"/>
<property name="collation.output.file"
value="${working.dir}/collation.benchmark.output.txt"/>
<property name="collation.jira.output.file"
value="${working.dir}/collation.bm2jira.output.txt"/>
<path id="collation.runtime.classpath">
<path refid="run.classpath"/>
<pathelement path="${common.dir}/build/contrib/icu/classes/java"/>
<fileset dir="${common.dir}/contrib/icu/lib" includes="icu4j*.jar"/>
</path>
<target name="collation" depends="compile,compile-icu,top-100k-wiki-word-files">
<echo>Running contrib/benchmark with alg file: ${collation.alg.file}</echo>
<java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark"
maxmemory="${task.mem}" output="${collation.output.file}">
<classpath refid="collation.runtime.classpath"/>
<arg file="${collation.alg.file}"/>
</java>
<echo>Benchmark output is in file: ${collation.output.file}</echo>
<echo>Converting to JIRA table format...</echo>
<exec executable="perl" output="${collation.jira.output.file}" failonerror="true">
<arg value="scripts/collation.bm2jira.pl"/>
<arg value="${collation.output.file}"/>
</exec>
<echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
</target>
<target name="compile-demo">
<subant target="compile-demo">
<fileset dir="${common.dir}" includes="build.xml"/>
@ -151,6 +203,11 @@
<fileset dir="${common.dir}/contrib/highlighter" includes="build.xml"/>
</subant>
</target>
<target name="compile-icu">
<subant target="compile">
<fileset dir="${common.dir}/contrib/icu" includes="build.xml"/>
</subant>
</target>
<target name="compile-memory">
<subant target="compile">
<fileset dir="${common.dir}/contrib/memory" includes="build.xml"/>

View File

@ -0,0 +1,97 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource
content.source.encoding=UTF-8
doc.tokenized=false
doc.body.tokenized=true
docs.file=work/top100k-out/top.fr.wikipedia.words.txt
content.source.forever=false
log.step=100000
{ "Rounds"
-NewAnalyzer(KeywordAnalyzer)
-SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
ResetInputs
{ "FrenchKeyword" { ReadTokens > : * ResetInputs } : 10
-NewAnalyzer(KeywordAnalyzer)
-SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
ResetInputs
{ "GermanKeyword" { ReadTokens > : * ResetInputs } : 10
-NewAnalyzer(KeywordAnalyzer)
-SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
ResetInputs
{ "UkrainianKeyword" { ReadTokens > : * ResetInputs } : 10
-NewAnalyzer(KeywordAnalyzer)
-SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
ResetInputs
{ "EnglishKeyword" { ReadTokens > : * ResetInputs } : 10
-NewLocale(fr)
-NewCollationAnalyzer
-SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
ResetInputs
{ "FrenchJDK" { ReadTokens > : * ResetInputs } : 10
-NewLocale(de)
-NewCollationAnalyzer
-SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
ResetInputs
{ "GermanJDK" { ReadTokens > : * ResetInputs } : 10
-NewLocale(uk)
-NewCollationAnalyzer
-SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
ResetInputs
{ "UkrainianJDK" { ReadTokens > : * ResetInputs } : 10
-NewLocale(en)
-NewCollationAnalyzer
-SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
ResetInputs
{ "EnglishJDK" { ReadTokens > : * ResetInputs } : 10
-NewLocale(fr)
-NewCollationAnalyzer(impl:icu)
-SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
ResetInputs
{ "FrenchICU" { ReadTokens > : * ResetInputs } : 10
-NewLocale(de)
-NewCollationAnalyzer(impl:icu)
-SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
ResetInputs
{ "GermanICU" { ReadTokens > : * ResetInputs } : 10
-NewLocale(uk)
-NewCollationAnalyzer(impl:icu)
-SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
ResetInputs
{ "UkrainianICU" { ReadTokens > : * ResetInputs } : 10
-NewLocale(en)
-NewCollationAnalyzer(impl:icu)
-SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
ResetInputs
{ "EnglishICU" { ReadTokens > : * ResetInputs } : 10
NewRound
} : 5
RepSumByNameRound

View File

@ -0,0 +1,63 @@
#!/usr/bin/perl
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ----------
# bm2jira.pl
#
# Converts Lucene contrib-benchmark output produced using the
# benchmark.collation.alg file into a JIRA-formatted table.
#
use strict;
use warnings;
my %min_elapsed = ();
while (<>) {
if (/(\S+)(Keyword|JDK|ICU)_\d+\s*([^\s{].*)/) {
my $lang = $1;
my $analyzer = $2;
my $stats = $3;
my ($elapsed) = $stats =~ /(?:[\d,.]+[-\s]*){4}([.\d]+)/;
$min_elapsed{$analyzer}{$lang} = $elapsed
unless (defined($min_elapsed{$analyzer}{$lang})
&& $elapsed >= $min_elapsed{$analyzer}{$lang});
}
}
# Print out platform info
print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
if ($^O =~ /win/i) {
print "$^O\n";
eval {
require Win32;
print Win32::GetOSName(), "\n", Win32::GetOSVersion(), "\n";
};
die "Error loading Win32: $@" if ($@);
} else {
print `uname -a 2>&1`;
}
print "\n||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||\n";
for my $lang (sort keys %{$min_elapsed{ICU}}) {
my $ICU = $min_elapsed{ICU}{$lang};
my $JDK = $min_elapsed{JDK}{$lang};
my $keyword = $min_elapsed{Keyword}{$lang};
my $improved = int(100 * ($JDK - $ICU) / ($ICU - $keyword) + 0.5);
printf "|$lang|${JDK}s|${ICU}s|${keyword}s|\%d%%|\n", $improved;
}

View File

@ -0,0 +1,91 @@
#!/usr/bin/perl
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ------------------------------------------
# compare.collation.benchmark.jira.tables.pl
#
# Takes as cmdline parameters two JIRA-formatted benchmark results, as produced
# by bm2jira.pl (located in the same directory as this script), and outputs a
# third JIRA-formatted comparison table, showing the differences between two
# benchmarking runs' java.text and ICU4J columns, after accounting for the
# KeywordAnalyzer column; the "ICU4J Improvement" column is ignored.
#
# The difference is calculated as a percentage:
#
# 100 * (patched-rate - unpatched-rate / unpatched-rate)
#
# where the (un)patched-rate is:
#
# 1 / ( elapsed-(un)patched-time - elapsed-KeywordAnalyzer-time)
#
use strict;
use warnings;
my $usage = "Usage: $0 <unpatched-file> <patched-file>\n";
die $usage unless ($#ARGV == 1 && -f $ARGV[0] && -f $ARGV[1]);
my %stats = ();
open UNPATCHED, "<$ARGV[0]" || die "ERROR opening '$ARGV[0]': $!";
while (<UNPATCHED>) {
# ||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||
# |English|4.51s|2.47s|1.47s|204%|
next unless (/^\|([^|]+)\|([^|s]+)s\|([^|s]+)s\|([^|s]+)s/);
my ($lang, $jdk_elapsed, $icu_elapsed, $keyword_analyzer_elapsed)
= ($1, $2, $3, $4);
$stats{unpatched}{$lang}{jdk} = $jdk_elapsed;
$stats{unpatched}{$lang}{icu} = $icu_elapsed;
$stats{unpatched}{$lang}{keyword_analyzer} = $keyword_analyzer_elapsed;
}
close UNPATCHED;
open PATCHED, "<$ARGV[1]" || die "ERROR opening '$ARGV[1]': $!";
while (<PATCHED>) {
# ||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||
# |English|4.51s|2.47s|1.47s|204%|
next unless (/^\|([^|]+)\|([^|s]+)s\|([^|s]+)s\|([^|s]+)s/);
my ($lang, $jdk_elapsed, $icu_elapsed, $keyword_analyzer_elapsed)
= ($1, $2, $3, $4);
$stats{patched}{$lang}{jdk} = $jdk_elapsed;
$stats{patched}{$lang}{icu} = $icu_elapsed;
$stats{patched}{$lang}{keyword_analyzer} = $keyword_analyzer_elapsed;
}
close PATCHED;
print "||Language||java.text improvement||ICU4J improvement||\n";
for my $lang (sort keys %{$stats{unpatched}}) {
my $keyword_analyzer1 = $stats{unpatched}{$lang}{keyword_analyzer};
my $jdk1 = $stats{unpatched}{$lang}{jdk};
my $jdk_diff1 = $jdk1 - $keyword_analyzer1;
my $icu1 = $stats{unpatched}{$lang}{icu};
my $icu_diff1 = $icu1 - $keyword_analyzer1;
my $keyword_analyzer2 = $stats{patched}{$lang}{keyword_analyzer};
my $jdk2 = $stats{patched}{$lang}{jdk};
my $jdk_diff2 = $jdk2 - $keyword_analyzer2;
my $icu2 = $stats{patched}{$lang}{icu};
my $icu_diff2 = $icu2 - $keyword_analyzer2;
my $jdk_impr
= int((1./$jdk_diff2 - 1./$jdk_diff1) / (1./$jdk_diff1) * 1000 + 5) / 10;
my $icu_impr
= int((1./$icu_diff2 - 1./$icu_diff1) / (1./$icu_diff1) * 1000 + 5) / 10;
printf "|$lang|%2.1f%%|%2.1f%%|\n", $jdk_impr, $icu_impr;
}

View File

@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTask;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Locale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
@ -61,6 +62,7 @@ public class PerfRunData {
private Directory directory;
private Analyzer analyzer;
private DocMaker docMaker;
private Locale locale;
// we use separate (identical) instances for each "read" task type, so each can iterate the quries separately.
private HashMap<Class<? extends ReadTask>,QueryMaker> readTaskQueryMaker;
@ -244,6 +246,20 @@ public class PerfRunData {
return docMaker;
}
/**
* @return the locale
*/
public Locale getLocale() {
return locale;
}
/**
* @param locale the locale to set
*/
public void setLocale(Locale locale) {
this.locale = locale;
}
/**
* @return Returns the config.
*/

View File

@ -0,0 +1,117 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.util.Locale;
import java.util.StringTokenizer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.PerfRunData;
/**
* Task to support benchmarking collation.
* <p>
* <ul>
* <li> <code>NewCollationAnalyzer</code> with the default jdk impl
* <li> <code>NewCollationAnalyzer(impl:icu)</code> specify an impl (jdk,icu)
* </ul>
* </p>
*/
public class NewCollationAnalyzerTask extends PerfTask {
public enum Implementation {
JDK("org.apache.lucene.collation.CollationKeyAnalyzer",
"java.text.Collator"),
ICU("org.apache.lucene.collation.ICUCollationKeyAnalyzer",
"com.ibm.icu.text.Collator");
String className;
String collatorClassName;
Implementation(String className, String collatorClassName) {
this.className = className;
this.collatorClassName = collatorClassName;
}
}
private Implementation impl = Implementation.JDK;
public NewCollationAnalyzerTask(PerfRunData runData) {
super(runData);
}
static Analyzer createAnalyzer(Locale locale, Implementation impl)
throws Exception {
final Class<?> collatorClazz = Class.forName(impl.collatorClassName);
Method collatorMethod = collatorClazz.getMethod("getInstance",
new Class[] {Locale.class});
Object collator = collatorMethod.invoke(null, locale);
final Class<? extends Analyzer> clazz = Class.forName(impl.className)
.asSubclass(Analyzer.class);
Constructor<? extends Analyzer> ctor = clazz.getConstructor(collatorClazz);
return ctor.newInstance(collator);
}
@Override
public int doLogic() throws Exception {
try {
Locale locale = getRunData().getLocale();
if (locale == null) throw new RuntimeException(
"Locale must be set with the NewLocale task!");
Analyzer analyzer = createAnalyzer(locale, impl);
getRunData().setAnalyzer(analyzer);
System.out.println("Changed Analyzer to: "
+ analyzer.getClass().getName() + "(" + locale + ")");
} catch (Exception e) {
throw new RuntimeException("Error creating Analyzer: impl=" + impl, e);
}
return 1;
}
@Override
public void setParams(String params) {
super.setParams(params);
StringTokenizer st = new StringTokenizer(params, ",");
while (st.hasMoreTokens()) {
String param = st.nextToken();
StringTokenizer expr = new StringTokenizer(param, ":");
String key = expr.nextToken();
String value = expr.nextToken();
// for now we only support the "impl" parameter.
// TODO: add strength, decomposition, etc
if (key.equals("impl")) {
if (value.equalsIgnoreCase("icu"))
impl = Implementation.ICU;
else if (value.equalsIgnoreCase("jdk"))
impl = Implementation.JDK;
else
throw new RuntimeException("Unknown parameter " + param);
} else {
throw new RuntimeException("Unknown parameter " + param);
}
}
}
@Override
public boolean supportsParams() {
return true;
}
}

View File

@ -0,0 +1,89 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Locale;
import java.util.StringTokenizer;
import org.apache.lucene.benchmark.byTask.PerfRunData;
/**
* Set a {@link java.util.Locale} for use in benchmarking.
* <p>
* Locales can be specified in the following ways:
* <ul>
* <li><code>de</code>: Language "de"
* <li><code>en,US</code>: Language "en", country "US"
* <li><code>no,NO,NY</code>: Language "no", country "NO", variant "NY"
* <li><code>ROOT</code>: The root (language-agnostic) Locale
* <li>&lt;empty string&gt;: Erase the Locale (null)
* </ul>
* </p>
*/
public class NewLocaleTask extends PerfTask {
private String language;
private String country;
private String variant;
/**
* Create a new {@link java.util.Locale} and set it it in the getRunData() for
* use by all future tasks.
*/
public NewLocaleTask(PerfRunData runData) {
super(runData);
}
static Locale createLocale(String language, String country, String variant) {
if (language == null || language.length() == 0)
return null;
String lang = language;
if (lang.equalsIgnoreCase("ROOT"))
lang = ""; // empty language is the root locale in the JDK
return new Locale(lang, country, variant);
}
@Override
public int doLogic() throws Exception {
Locale locale = createLocale(language, country, variant);
getRunData().setLocale(locale);
System.out.println("Changed Locale to: " +
(locale == null ? "null" :
(locale.getDisplayName().length() == 0) ? "root locale" : locale));
return 1;
}
@Override
public void setParams(String params) {
super.setParams(params);
language = country = variant = "";
StringTokenizer st = new StringTokenizer(params, ",");
if (st.hasMoreTokens())
language = st.nextToken();
if (st.hasMoreTokens())
country = st.nextToken();
if (st.hasMoreTokens())
variant = st.nextToken();
}
@Override
public boolean supportsParams() {
return true;
}
}

View File

@ -67,6 +67,8 @@ public class ReadTokensTask extends PerfTask {
Analyzer analyzer = getRunData().getAnalyzer();
int tokenCount = 0;
for(final Fieldable field : fields) {
if (!field.isTokenized()) continue;
final TokenStream stream;
final TokenStream streamValue = field.tokenStreamValue();

View File

@ -21,15 +21,23 @@ import java.io.StringReader;
import java.io.File;
import java.io.FileReader;
import java.io.BufferedReader;
import java.text.Collator;
import java.util.List;
import java.util.Iterator;
import java.util.Locale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.SerialMergeScheduler;
@ -464,10 +472,14 @@ public class TestPerfTasksLogic extends LuceneTestCase {
TermDocs termDocs = reader.termDocs();
int totalTokenCount2 = 0;
while(terms.next()) {
Term term = terms.term();
/* not-tokenized, but indexed field */
if (term != null && term.field() != DocMaker.ID_FIELD) {
termDocs.seek(terms.term());
while(termDocs.next())
while (termDocs.next())
totalTokenCount2 += termDocs.freq();
}
}
reader.close();
// Make sure they are the same
@ -850,6 +862,119 @@ public class TestPerfTasksLogic extends LuceneTestCase {
};
}
/**
* Test that we can change the Locale in the runData,
* that it is parsed as we expect.
*/
public void testLocale() throws Exception {
// empty Locale: clear it (null)
Benchmark benchmark = execBenchmark(getLocaleConfig(""));
assertNull(benchmark.getRunData().getLocale());
// ROOT locale
benchmark = execBenchmark(getLocaleConfig("ROOT"));
assertEquals(new Locale(""), benchmark.getRunData().getLocale());
// specify just a language
benchmark = execBenchmark(getLocaleConfig("de"));
assertEquals(new Locale("de"), benchmark.getRunData().getLocale());
// specify language + country
benchmark = execBenchmark(getLocaleConfig("en,US"));
assertEquals(new Locale("en", "US"), benchmark.getRunData().getLocale());
// specify language + country + variant
benchmark = execBenchmark(getLocaleConfig("no,NO,NY"));
assertEquals(new Locale("no", "NO", "NY"), benchmark.getRunData().getLocale());
}
private static String[] getLocaleConfig(String localeParam) {
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"content.source.forever=false",
"directory=RAMDirectory",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" NewLocale(" + localeParam + ")",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" NewRound",
"} : 1",
};
return algLines;
}
/**
* Test that we can create CollationAnalyzers.
*/
public void testCollator() throws Exception {
// ROOT locale
Benchmark benchmark = execBenchmark(getCollatorConfig("ROOT", "impl:jdk"));
CollationKeyAnalyzer expected = new CollationKeyAnalyzer(Collator
.getInstance(new Locale("")));
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
// specify just a language
benchmark = execBenchmark(getCollatorConfig("de", "impl:jdk"));
expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("de")));
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
// specify language + country
benchmark = execBenchmark(getCollatorConfig("en,US", "impl:jdk"));
expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("en",
"US")));
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
// specify language + country + variant
benchmark = execBenchmark(getCollatorConfig("no,NO,NY", "impl:jdk"));
expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("no",
"NO", "NY")));
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
}
private void assertEqualCollation(Analyzer a1, Analyzer a2, String text)
throws Exception {
TokenStream ts1 = a1.tokenStream("bogus", new StringReader(text));
TokenStream ts2 = a2.tokenStream("bogus", new StringReader(text));
ts1.reset();
ts2.reset();
TermAttribute termAtt1 = ts1.addAttribute(TermAttribute.class);
TermAttribute termAtt2 = ts2.addAttribute(TermAttribute.class);
assertTrue(ts1.incrementToken());
assertTrue(ts2.incrementToken());
assertEquals(termAtt1.term(), termAtt2.term());
assertFalse(ts1.incrementToken());
assertFalse(ts2.incrementToken());
ts1.close();
ts2.close();
}
private static String[] getCollatorConfig(String localeParam,
String collationParam) {
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"content.source.forever=false",
"directory=RAMDirectory",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" NewLocale(" + localeParam + ")",
" NewCollationAnalyzer(" + collationParam + ")",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" NewRound",
"} : 1",
};
return algLines;
}
private static String getReuters20LinesFile() {
return System.getProperty("lucene.common.dir").replace('\\','/') +
"/contrib/benchmark/src/test/org/apache/lucene/benchmark/reuters.first20.lines.txt";