mirror of https://github.com/apache/lucene.git
LUCENE-2181: add a benchmark for collation
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@898491 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
aba4c8bed4
commit
eaed8e67d2
|
@ -4,6 +4,16 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
|
|||
|
||||
$Id:$
|
||||
|
||||
1/11/2010
|
||||
LUCENE-2181: Add a benchmark for collation. This adds NewLocaleTask,
|
||||
which sets a Locale in the run data for collation to use, and can be
|
||||
used in the future for benchmarking localized range queries and sorts.
|
||||
Also add NewCollationAnalyzerTask, which works with both JDK and ICU
|
||||
Collator implementations. Fix ReadTokensTask to not tokenize fields
|
||||
unless they should be tokenized according to DocMaker config. The
|
||||
easiest way to run the benchmark is to run 'ant collation'
|
||||
(Steven Rowe via Robert Muir)
|
||||
|
||||
12/22/2009
|
||||
LUCENE-2178: Allow multiple locations to add to the class path with
|
||||
-Dbenchmark.ext.classpath=... when running "ant run-task" (Steven
|
||||
|
|
|
@ -24,7 +24,10 @@
|
|||
<available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
|
||||
<available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
|
||||
<available file="${working.dir}/enwiki.txt" property="enwiki.extracted"/>
|
||||
|
||||
<available file="temp/${top.100k.words.archive.filename}"
|
||||
property="top.100k.words.archive.present"/>
|
||||
<available file="${working.dir}/top100k-out"
|
||||
property="top.100k.word.files.expanded"/>
|
||||
</target>
|
||||
|
||||
<target name="enwiki-files" depends="check-files">
|
||||
|
@ -94,6 +97,27 @@
|
|||
<untar src="temp/mini_newsgroups.tar" dest="${working.dir}"/>
|
||||
</target>
|
||||
|
||||
<property name="top.100k.words.archive.filename"
|
||||
value="top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"/>
|
||||
<property name="top.100k.words.archive.base.url"
|
||||
value="http://people.apache.org/~rmuir/wikipedia"/>
|
||||
<target name="get-top-100k-words-archive" unless="top.100k.words.archive.present">
|
||||
<mkdir dir="temp"/>
|
||||
<get src="${top.100k.words.archive.base.url}/${top.100k.words.archive.filename}"
|
||||
dest="temp/${top.100k.words.archive.filename}"/>
|
||||
</target>
|
||||
<target name="expand-top-100k-word-files" unless="top.100k.word.files.expanded">
|
||||
<mkdir dir="${working.dir}/top100k-out"/>
|
||||
<untar src="temp/${top.100k.words.archive.filename}"
|
||||
overwrite="true" compression="bzip2" dest="${working.dir}/top100k-out"/>
|
||||
</target>
|
||||
|
||||
<target name="top-100k-wiki-word-files" depends="check-files">
|
||||
<mkdir dir="${working.dir}"/>
|
||||
<antcall target="get-top-100k-words-archive"/>
|
||||
<antcall target="expand-top-100k-word-files"/>
|
||||
</target>
|
||||
|
||||
<target name="get-files" depends="check-files">
|
||||
<mkdir dir="temp"/>
|
||||
<antcall target="get-reuters"/>
|
||||
|
@ -141,6 +165,34 @@
|
|||
</java>
|
||||
</target>
|
||||
|
||||
<property name="collation.alg.file" location="conf/collation.alg"/>
|
||||
<property name="collation.output.file"
|
||||
value="${working.dir}/collation.benchmark.output.txt"/>
|
||||
<property name="collation.jira.output.file"
|
||||
value="${working.dir}/collation.bm2jira.output.txt"/>
|
||||
|
||||
<path id="collation.runtime.classpath">
|
||||
<path refid="run.classpath"/>
|
||||
<pathelement path="${common.dir}/build/contrib/icu/classes/java"/>
|
||||
<fileset dir="${common.dir}/contrib/icu/lib" includes="icu4j*.jar"/>
|
||||
</path>
|
||||
|
||||
<target name="collation" depends="compile,compile-icu,top-100k-wiki-word-files">
|
||||
<echo>Running contrib/benchmark with alg file: ${collation.alg.file}</echo>
|
||||
<java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark"
|
||||
maxmemory="${task.mem}" output="${collation.output.file}">
|
||||
<classpath refid="collation.runtime.classpath"/>
|
||||
<arg file="${collation.alg.file}"/>
|
||||
</java>
|
||||
<echo>Benchmark output is in file: ${collation.output.file}</echo>
|
||||
<echo>Converting to JIRA table format...</echo>
|
||||
<exec executable="perl" output="${collation.jira.output.file}" failonerror="true">
|
||||
<arg value="scripts/collation.bm2jira.pl"/>
|
||||
<arg value="${collation.output.file}"/>
|
||||
</exec>
|
||||
<echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
|
||||
</target>
|
||||
|
||||
<target name="compile-demo">
|
||||
<subant target="compile-demo">
|
||||
<fileset dir="${common.dir}" includes="build.xml"/>
|
||||
|
@ -151,6 +203,11 @@
|
|||
<fileset dir="${common.dir}/contrib/highlighter" includes="build.xml"/>
|
||||
</subant>
|
||||
</target>
|
||||
<target name="compile-icu">
|
||||
<subant target="compile">
|
||||
<fileset dir="${common.dir}/contrib/icu" includes="build.xml"/>
|
||||
</subant>
|
||||
</target>
|
||||
<target name="compile-memory">
|
||||
<subant target="compile">
|
||||
<fileset dir="${common.dir}/contrib/memory" includes="build.xml"/>
|
||||
|
|
|
@ -0,0 +1,97 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource
|
||||
content.source.encoding=UTF-8
|
||||
doc.tokenized=false
|
||||
doc.body.tokenized=true
|
||||
docs.file=work/top100k-out/top.fr.wikipedia.words.txt
|
||||
content.source.forever=false
|
||||
log.step=100000
|
||||
|
||||
{ "Rounds"
|
||||
-NewAnalyzer(KeywordAnalyzer)
|
||||
-SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
|
||||
ResetInputs
|
||||
{ "FrenchKeyword" { ReadTokens > : * ResetInputs } : 10
|
||||
|
||||
-NewAnalyzer(KeywordAnalyzer)
|
||||
-SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
|
||||
ResetInputs
|
||||
{ "GermanKeyword" { ReadTokens > : * ResetInputs } : 10
|
||||
|
||||
-NewAnalyzer(KeywordAnalyzer)
|
||||
-SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
|
||||
ResetInputs
|
||||
{ "UkrainianKeyword" { ReadTokens > : * ResetInputs } : 10
|
||||
|
||||
-NewAnalyzer(KeywordAnalyzer)
|
||||
-SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
|
||||
ResetInputs
|
||||
{ "EnglishKeyword" { ReadTokens > : * ResetInputs } : 10
|
||||
|
||||
-NewLocale(fr)
|
||||
-NewCollationAnalyzer
|
||||
-SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
|
||||
ResetInputs
|
||||
{ "FrenchJDK" { ReadTokens > : * ResetInputs } : 10
|
||||
|
||||
-NewLocale(de)
|
||||
-NewCollationAnalyzer
|
||||
-SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
|
||||
ResetInputs
|
||||
{ "GermanJDK" { ReadTokens > : * ResetInputs } : 10
|
||||
|
||||
-NewLocale(uk)
|
||||
-NewCollationAnalyzer
|
||||
-SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
|
||||
ResetInputs
|
||||
{ "UkrainianJDK" { ReadTokens > : * ResetInputs } : 10
|
||||
|
||||
-NewLocale(en)
|
||||
-NewCollationAnalyzer
|
||||
-SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
|
||||
ResetInputs
|
||||
{ "EnglishJDK" { ReadTokens > : * ResetInputs } : 10
|
||||
|
||||
-NewLocale(fr)
|
||||
-NewCollationAnalyzer(impl:icu)
|
||||
-SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
|
||||
ResetInputs
|
||||
{ "FrenchICU" { ReadTokens > : * ResetInputs } : 10
|
||||
|
||||
-NewLocale(de)
|
||||
-NewCollationAnalyzer(impl:icu)
|
||||
-SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
|
||||
ResetInputs
|
||||
{ "GermanICU" { ReadTokens > : * ResetInputs } : 10
|
||||
|
||||
-NewLocale(uk)
|
||||
-NewCollationAnalyzer(impl:icu)
|
||||
-SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
|
||||
ResetInputs
|
||||
{ "UkrainianICU" { ReadTokens > : * ResetInputs } : 10
|
||||
|
||||
-NewLocale(en)
|
||||
-NewCollationAnalyzer(impl:icu)
|
||||
-SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
|
||||
ResetInputs
|
||||
{ "EnglishICU" { ReadTokens > : * ResetInputs } : 10
|
||||
|
||||
NewRound
|
||||
|
||||
} : 5
|
||||
|
||||
RepSumByNameRound
|
|
@ -0,0 +1,63 @@
|
|||
#!/usr/bin/perl
|
||||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# ----------
|
||||
# bm2jira.pl
|
||||
#
|
||||
# Converts Lucene contrib-benchmark output produced using the
|
||||
# benchmark.collation.alg file into a JIRA-formatted table.
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
my %min_elapsed = ();
|
||||
|
||||
while (<>) {
|
||||
if (/(\S+)(Keyword|JDK|ICU)_\d+\s*([^\s{].*)/) {
|
||||
my $lang = $1;
|
||||
my $analyzer = $2;
|
||||
my $stats = $3;
|
||||
my ($elapsed) = $stats =~ /(?:[\d,.]+[-\s]*){4}([.\d]+)/;
|
||||
$min_elapsed{$analyzer}{$lang} = $elapsed
|
||||
unless (defined($min_elapsed{$analyzer}{$lang})
|
||||
&& $elapsed >= $min_elapsed{$analyzer}{$lang});
|
||||
}
|
||||
}
|
||||
|
||||
# Print out platform info
|
||||
print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
|
||||
if ($^O =~ /win/i) {
|
||||
print "$^O\n";
|
||||
eval {
|
||||
require Win32;
|
||||
print Win32::GetOSName(), "\n", Win32::GetOSVersion(), "\n";
|
||||
};
|
||||
die "Error loading Win32: $@" if ($@);
|
||||
} else {
|
||||
print `uname -a 2>&1`;
|
||||
}
|
||||
|
||||
print "\n||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||\n";
|
||||
|
||||
for my $lang (sort keys %{$min_elapsed{ICU}}) {
|
||||
my $ICU = $min_elapsed{ICU}{$lang};
|
||||
my $JDK = $min_elapsed{JDK}{$lang};
|
||||
my $keyword = $min_elapsed{Keyword}{$lang};
|
||||
my $improved = int(100 * ($JDK - $ICU) / ($ICU - $keyword) + 0.5);
|
||||
printf "|$lang|${JDK}s|${ICU}s|${keyword}s|\%d%%|\n", $improved;
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
#!/usr/bin/perl
|
||||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# ------------------------------------------
|
||||
# compare.collation.benchmark.jira.tables.pl
|
||||
#
|
||||
# Takes as cmdline parameters two JIRA-formatted benchmark results, as produced
|
||||
# by bm2jira.pl (located in the same directory as this script), and outputs a
|
||||
# third JIRA-formatted comparison table, showing the differences between two
|
||||
# benchmarking runs' java.text and ICU4J columns, after accounting for the
|
||||
# KeywordAnalyzer column; the "ICU4J Improvement" column is ignored.
|
||||
#
|
||||
# The difference is calculated as a percentage:
|
||||
#
|
||||
# 100 * (patched-rate - unpatched-rate / unpatched-rate)
|
||||
#
|
||||
# where the (un)patched-rate is:
|
||||
#
|
||||
# 1 / ( elapsed-(un)patched-time - elapsed-KeywordAnalyzer-time)
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
my $usage = "Usage: $0 <unpatched-file> <patched-file>\n";
|
||||
|
||||
die $usage unless ($#ARGV == 1 && -f $ARGV[0] && -f $ARGV[1]);
|
||||
|
||||
my %stats = ();
|
||||
|
||||
open UNPATCHED, "<$ARGV[0]" || die "ERROR opening '$ARGV[0]': $!";
|
||||
while (<UNPATCHED>) {
|
||||
# ||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||
|
||||
# |English|4.51s|2.47s|1.47s|204%|
|
||||
next unless (/^\|([^|]+)\|([^|s]+)s\|([^|s]+)s\|([^|s]+)s/);
|
||||
my ($lang, $jdk_elapsed, $icu_elapsed, $keyword_analyzer_elapsed)
|
||||
= ($1, $2, $3, $4);
|
||||
$stats{unpatched}{$lang}{jdk} = $jdk_elapsed;
|
||||
$stats{unpatched}{$lang}{icu} = $icu_elapsed;
|
||||
$stats{unpatched}{$lang}{keyword_analyzer} = $keyword_analyzer_elapsed;
|
||||
}
|
||||
close UNPATCHED;
|
||||
|
||||
open PATCHED, "<$ARGV[1]" || die "ERROR opening '$ARGV[1]': $!";
|
||||
while (<PATCHED>) {
|
||||
# ||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||
|
||||
# |English|4.51s|2.47s|1.47s|204%|
|
||||
next unless (/^\|([^|]+)\|([^|s]+)s\|([^|s]+)s\|([^|s]+)s/);
|
||||
my ($lang, $jdk_elapsed, $icu_elapsed, $keyword_analyzer_elapsed)
|
||||
= ($1, $2, $3, $4);
|
||||
$stats{patched}{$lang}{jdk} = $jdk_elapsed;
|
||||
$stats{patched}{$lang}{icu} = $icu_elapsed;
|
||||
$stats{patched}{$lang}{keyword_analyzer} = $keyword_analyzer_elapsed;
|
||||
}
|
||||
close PATCHED;
|
||||
|
||||
print "||Language||java.text improvement||ICU4J improvement||\n";
|
||||
for my $lang (sort keys %{$stats{unpatched}}) {
|
||||
my $keyword_analyzer1 = $stats{unpatched}{$lang}{keyword_analyzer};
|
||||
my $jdk1 = $stats{unpatched}{$lang}{jdk};
|
||||
my $jdk_diff1 = $jdk1 - $keyword_analyzer1;
|
||||
my $icu1 = $stats{unpatched}{$lang}{icu};
|
||||
my $icu_diff1 = $icu1 - $keyword_analyzer1;
|
||||
|
||||
my $keyword_analyzer2 = $stats{patched}{$lang}{keyword_analyzer};
|
||||
my $jdk2 = $stats{patched}{$lang}{jdk};
|
||||
my $jdk_diff2 = $jdk2 - $keyword_analyzer2;
|
||||
my $icu2 = $stats{patched}{$lang}{icu};
|
||||
my $icu_diff2 = $icu2 - $keyword_analyzer2;
|
||||
|
||||
my $jdk_impr
|
||||
= int((1./$jdk_diff2 - 1./$jdk_diff1) / (1./$jdk_diff1) * 1000 + 5) / 10;
|
||||
my $icu_impr
|
||||
= int((1./$icu_diff2 - 1./$icu_diff1) / (1./$icu_diff1) * 1000 + 5) / 10;
|
||||
|
||||
printf "|$lang|%2.1f%%|%2.1f%%|\n", $jdk_impr, $icu_impr;
|
||||
}
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTask;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||
|
@ -61,6 +62,7 @@ public class PerfRunData {
|
|||
private Directory directory;
|
||||
private Analyzer analyzer;
|
||||
private DocMaker docMaker;
|
||||
private Locale locale;
|
||||
|
||||
// we use separate (identical) instances for each "read" task type, so each can iterate the quries separately.
|
||||
private HashMap<Class<? extends ReadTask>,QueryMaker> readTaskQueryMaker;
|
||||
|
@ -244,6 +246,20 @@ public class PerfRunData {
|
|||
return docMaker;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the locale
|
||||
*/
|
||||
public Locale getLocale() {
|
||||
return locale;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param locale the locale to set
|
||||
*/
|
||||
public void setLocale(Locale locale) {
|
||||
this.locale = locale;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the config.
|
||||
*/
|
||||
|
|
|
@ -0,0 +1,117 @@
|
|||
package org.apache.lucene.benchmark.byTask.tasks;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.Locale;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
|
||||
/**
|
||||
* Task to support benchmarking collation.
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li> <code>NewCollationAnalyzer</code> with the default jdk impl
|
||||
* <li> <code>NewCollationAnalyzer(impl:icu)</code> specify an impl (jdk,icu)
|
||||
* </ul>
|
||||
* </p>
|
||||
*/
|
||||
public class NewCollationAnalyzerTask extends PerfTask {
|
||||
public enum Implementation {
|
||||
JDK("org.apache.lucene.collation.CollationKeyAnalyzer",
|
||||
"java.text.Collator"),
|
||||
ICU("org.apache.lucene.collation.ICUCollationKeyAnalyzer",
|
||||
"com.ibm.icu.text.Collator");
|
||||
|
||||
String className;
|
||||
String collatorClassName;
|
||||
|
||||
Implementation(String className, String collatorClassName) {
|
||||
this.className = className;
|
||||
this.collatorClassName = collatorClassName;
|
||||
}
|
||||
}
|
||||
|
||||
private Implementation impl = Implementation.JDK;
|
||||
|
||||
public NewCollationAnalyzerTask(PerfRunData runData) {
|
||||
super(runData);
|
||||
}
|
||||
|
||||
static Analyzer createAnalyzer(Locale locale, Implementation impl)
|
||||
throws Exception {
|
||||
final Class<?> collatorClazz = Class.forName(impl.collatorClassName);
|
||||
Method collatorMethod = collatorClazz.getMethod("getInstance",
|
||||
new Class[] {Locale.class});
|
||||
Object collator = collatorMethod.invoke(null, locale);
|
||||
|
||||
final Class<? extends Analyzer> clazz = Class.forName(impl.className)
|
||||
.asSubclass(Analyzer.class);
|
||||
Constructor<? extends Analyzer> ctor = clazz.getConstructor(collatorClazz);
|
||||
return ctor.newInstance(collator);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int doLogic() throws Exception {
|
||||
try {
|
||||
Locale locale = getRunData().getLocale();
|
||||
if (locale == null) throw new RuntimeException(
|
||||
"Locale must be set with the NewLocale task!");
|
||||
Analyzer analyzer = createAnalyzer(locale, impl);
|
||||
getRunData().setAnalyzer(analyzer);
|
||||
System.out.println("Changed Analyzer to: "
|
||||
+ analyzer.getClass().getName() + "(" + locale + ")");
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Error creating Analyzer: impl=" + impl, e);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setParams(String params) {
|
||||
super.setParams(params);
|
||||
|
||||
StringTokenizer st = new StringTokenizer(params, ",");
|
||||
while (st.hasMoreTokens()) {
|
||||
String param = st.nextToken();
|
||||
StringTokenizer expr = new StringTokenizer(param, ":");
|
||||
String key = expr.nextToken();
|
||||
String value = expr.nextToken();
|
||||
// for now we only support the "impl" parameter.
|
||||
// TODO: add strength, decomposition, etc
|
||||
if (key.equals("impl")) {
|
||||
if (value.equalsIgnoreCase("icu"))
|
||||
impl = Implementation.ICU;
|
||||
else if (value.equalsIgnoreCase("jdk"))
|
||||
impl = Implementation.JDK;
|
||||
else
|
||||
throw new RuntimeException("Unknown parameter " + param);
|
||||
} else {
|
||||
throw new RuntimeException("Unknown parameter " + param);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean supportsParams() {
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,89 @@
|
|||
package org.apache.lucene.benchmark.byTask.tasks;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Locale;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
|
||||
/**
|
||||
* Set a {@link java.util.Locale} for use in benchmarking.
|
||||
* <p>
|
||||
* Locales can be specified in the following ways:
|
||||
* <ul>
|
||||
* <li><code>de</code>: Language "de"
|
||||
* <li><code>en,US</code>: Language "en", country "US"
|
||||
* <li><code>no,NO,NY</code>: Language "no", country "NO", variant "NY"
|
||||
* <li><code>ROOT</code>: The root (language-agnostic) Locale
|
||||
* <li><empty string>: Erase the Locale (null)
|
||||
* </ul>
|
||||
* </p>
|
||||
*/
|
||||
public class NewLocaleTask extends PerfTask {
|
||||
private String language;
|
||||
private String country;
|
||||
private String variant;
|
||||
|
||||
/**
|
||||
* Create a new {@link java.util.Locale} and set it it in the getRunData() for
|
||||
* use by all future tasks.
|
||||
*/
|
||||
public NewLocaleTask(PerfRunData runData) {
|
||||
super(runData);
|
||||
}
|
||||
|
||||
static Locale createLocale(String language, String country, String variant) {
|
||||
if (language == null || language.length() == 0)
|
||||
return null;
|
||||
|
||||
String lang = language;
|
||||
if (lang.equalsIgnoreCase("ROOT"))
|
||||
lang = ""; // empty language is the root locale in the JDK
|
||||
|
||||
return new Locale(lang, country, variant);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int doLogic() throws Exception {
|
||||
Locale locale = createLocale(language, country, variant);
|
||||
getRunData().setLocale(locale);
|
||||
System.out.println("Changed Locale to: " +
|
||||
(locale == null ? "null" :
|
||||
(locale.getDisplayName().length() == 0) ? "root locale" : locale));
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setParams(String params) {
|
||||
super.setParams(params);
|
||||
language = country = variant = "";
|
||||
StringTokenizer st = new StringTokenizer(params, ",");
|
||||
if (st.hasMoreTokens())
|
||||
language = st.nextToken();
|
||||
if (st.hasMoreTokens())
|
||||
country = st.nextToken();
|
||||
if (st.hasMoreTokens())
|
||||
variant = st.nextToken();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean supportsParams() {
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -67,6 +67,8 @@ public class ReadTokensTask extends PerfTask {
|
|||
Analyzer analyzer = getRunData().getAnalyzer();
|
||||
int tokenCount = 0;
|
||||
for(final Fieldable field : fields) {
|
||||
if (!field.isTokenized()) continue;
|
||||
|
||||
final TokenStream stream;
|
||||
final TokenStream streamValue = field.tokenStreamValue();
|
||||
|
||||
|
|
|
@ -21,15 +21,23 @@ import java.io.StringReader;
|
|||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.BufferedReader;
|
||||
import java.text.Collator;
|
||||
import java.util.List;
|
||||
import java.util.Iterator;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
|
||||
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
|
||||
import org.apache.lucene.collation.CollationKeyAnalyzer;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermEnum;
|
||||
import org.apache.lucene.index.TermDocs;
|
||||
import org.apache.lucene.index.SerialMergeScheduler;
|
||||
|
@ -464,10 +472,14 @@ public class TestPerfTasksLogic extends LuceneTestCase {
|
|||
TermDocs termDocs = reader.termDocs();
|
||||
int totalTokenCount2 = 0;
|
||||
while(terms.next()) {
|
||||
Term term = terms.term();
|
||||
/* not-tokenized, but indexed field */
|
||||
if (term != null && term.field() != DocMaker.ID_FIELD) {
|
||||
termDocs.seek(terms.term());
|
||||
while(termDocs.next())
|
||||
while (termDocs.next())
|
||||
totalTokenCount2 += termDocs.freq();
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
|
||||
// Make sure they are the same
|
||||
|
@ -850,6 +862,119 @@ public class TestPerfTasksLogic extends LuceneTestCase {
|
|||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that we can change the Locale in the runData,
|
||||
* that it is parsed as we expect.
|
||||
*/
|
||||
public void testLocale() throws Exception {
|
||||
// empty Locale: clear it (null)
|
||||
Benchmark benchmark = execBenchmark(getLocaleConfig(""));
|
||||
assertNull(benchmark.getRunData().getLocale());
|
||||
|
||||
// ROOT locale
|
||||
benchmark = execBenchmark(getLocaleConfig("ROOT"));
|
||||
assertEquals(new Locale(""), benchmark.getRunData().getLocale());
|
||||
|
||||
// specify just a language
|
||||
benchmark = execBenchmark(getLocaleConfig("de"));
|
||||
assertEquals(new Locale("de"), benchmark.getRunData().getLocale());
|
||||
|
||||
// specify language + country
|
||||
benchmark = execBenchmark(getLocaleConfig("en,US"));
|
||||
assertEquals(new Locale("en", "US"), benchmark.getRunData().getLocale());
|
||||
|
||||
// specify language + country + variant
|
||||
benchmark = execBenchmark(getLocaleConfig("no,NO,NY"));
|
||||
assertEquals(new Locale("no", "NO", "NY"), benchmark.getRunData().getLocale());
|
||||
}
|
||||
|
||||
private static String[] getLocaleConfig(String localeParam) {
|
||||
String algLines[] = {
|
||||
"# ----- properties ",
|
||||
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
|
||||
"docs.file=" + getReuters20LinesFile(),
|
||||
"content.source.log.step=3",
|
||||
"content.source.forever=false",
|
||||
"directory=RAMDirectory",
|
||||
"# ----- alg ",
|
||||
"{ \"Rounds\"",
|
||||
" ResetSystemErase",
|
||||
" NewLocale(" + localeParam + ")",
|
||||
" CreateIndex",
|
||||
" { \"AddDocs\" AddDoc > : * ",
|
||||
" NewRound",
|
||||
"} : 1",
|
||||
};
|
||||
return algLines;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that we can create CollationAnalyzers.
|
||||
*/
|
||||
public void testCollator() throws Exception {
|
||||
// ROOT locale
|
||||
Benchmark benchmark = execBenchmark(getCollatorConfig("ROOT", "impl:jdk"));
|
||||
CollationKeyAnalyzer expected = new CollationKeyAnalyzer(Collator
|
||||
.getInstance(new Locale("")));
|
||||
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
|
||||
|
||||
// specify just a language
|
||||
benchmark = execBenchmark(getCollatorConfig("de", "impl:jdk"));
|
||||
expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("de")));
|
||||
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
|
||||
|
||||
// specify language + country
|
||||
benchmark = execBenchmark(getCollatorConfig("en,US", "impl:jdk"));
|
||||
expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("en",
|
||||
"US")));
|
||||
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
|
||||
|
||||
// specify language + country + variant
|
||||
benchmark = execBenchmark(getCollatorConfig("no,NO,NY", "impl:jdk"));
|
||||
expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("no",
|
||||
"NO", "NY")));
|
||||
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
|
||||
}
|
||||
|
||||
private void assertEqualCollation(Analyzer a1, Analyzer a2, String text)
|
||||
throws Exception {
|
||||
TokenStream ts1 = a1.tokenStream("bogus", new StringReader(text));
|
||||
TokenStream ts2 = a2.tokenStream("bogus", new StringReader(text));
|
||||
ts1.reset();
|
||||
ts2.reset();
|
||||
TermAttribute termAtt1 = ts1.addAttribute(TermAttribute.class);
|
||||
TermAttribute termAtt2 = ts2.addAttribute(TermAttribute.class);
|
||||
assertTrue(ts1.incrementToken());
|
||||
assertTrue(ts2.incrementToken());
|
||||
assertEquals(termAtt1.term(), termAtt2.term());
|
||||
assertFalse(ts1.incrementToken());
|
||||
assertFalse(ts2.incrementToken());
|
||||
ts1.close();
|
||||
ts2.close();
|
||||
}
|
||||
|
||||
private static String[] getCollatorConfig(String localeParam,
|
||||
String collationParam) {
|
||||
String algLines[] = {
|
||||
"# ----- properties ",
|
||||
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
|
||||
"docs.file=" + getReuters20LinesFile(),
|
||||
"content.source.log.step=3",
|
||||
"content.source.forever=false",
|
||||
"directory=RAMDirectory",
|
||||
"# ----- alg ",
|
||||
"{ \"Rounds\"",
|
||||
" ResetSystemErase",
|
||||
" NewLocale(" + localeParam + ")",
|
||||
" NewCollationAnalyzer(" + collationParam + ")",
|
||||
" CreateIndex",
|
||||
" { \"AddDocs\" AddDoc > : * ",
|
||||
" NewRound",
|
||||
"} : 1",
|
||||
};
|
||||
return algLines;
|
||||
}
|
||||
|
||||
private static String getReuters20LinesFile() {
|
||||
return System.getProperty("lucene.common.dir").replace('\\','/') +
|
||||
"/contrib/benchmark/src/test/org/apache/lucene/benchmark/reuters.first20.lines.txt";
|
||||
|
|
Loading…
Reference in New Issue