LUCENE-2223: add a ShingleFilter benchmark

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@904371 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-01-29 04:07:47 +00:00
parent 1e9cdab0f9
commit d7dada3c3a
7 changed files with 464 additions and 0 deletions

View File

@ -2,6 +2,11 @@ Lucene Benchmark Contrib Change Log
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways. The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
1/28/2010
LUCENE-2223: Add a benchmark for ShingleFilter. You can wrap any
analyzer with ShingleAnalyzerWrapper and specify shingle parameters
with the NewShingleAnalyzer task. (Steven Rowe via Robert Muir)
1/14/2010 1/14/2010
LUCENE-2210: TrecTopicsReader now properly reads descriptions and LUCENE-2210: TrecTopicsReader now properly reads descriptions and
narratives from trec topics files. (Robert Muir) narratives from trec topics files. (Robert Muir)

View File

@ -131,6 +131,7 @@
<pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/> <pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
<pathelement path="${common.dir}/build/contrib/memory/classes/java"/> <pathelement path="${common.dir}/build/contrib/memory/classes/java"/>
<pathelement path="${common.dir}/build/contrib/fast-vector-highlighter/classes/java"/> <pathelement path="${common.dir}/build/contrib/fast-vector-highlighter/classes/java"/>
<pathelement path="${common.dir}/build/contrib/analyzers/common/classes/java"/>
<fileset dir="lib"> <fileset dir="lib">
<include name="**/*.jar"/> <include name="**/*.jar"/>
</fileset> </fileset>
@ -192,6 +193,32 @@
<echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo> <echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
</target> </target>
<property name="shingle.alg.file" location="conf/shingle.alg"/>
<property name="shingle.output.file"
value="${working.dir}/shingle.benchmark.output.txt"/>
<property name="shingle.jira.output.file"
value="${working.dir}/shingle.bm2jira.output.txt"/>
<path id="shingle.runtime.classpath">
<path refid="run.classpath"/>
</path>
<target name="shingle" depends="compile,compile-analyzers-common,get-files">
<echo>Running contrib/benchmark with alg file: ${shingle.alg.file}</echo>
<java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark"
maxmemory="${task.mem}" output="${shingle.output.file}">
<classpath refid="run.classpath"/>
<arg file="${shingle.alg.file}"/>
</java>
<echo>Benchmark output is in file: ${shingle.output.file}</echo>
<echo>Converting to JIRA table format...</echo>
<exec executable="perl" output="${shingle.jira.output.file}" failonerror="true">
<arg value="scripts/shingle.bm2jira.pl"/>
<arg value="${shingle.output.file}"/>
</exec>
<echo>Benchmark output in JIRA table format is in file: ${shingle.jira.output.file}</echo>
</target>
<target name="compile-demo"> <target name="compile-demo">
<subant target="compile-demo"> <subant target="compile-demo">
<fileset dir="${common.dir}" includes="build.xml"/> <fileset dir="${common.dir}" includes="build.xml"/>
@ -207,6 +234,11 @@
<fileset dir="${common.dir}/contrib/icu" includes="build.xml"/> <fileset dir="${common.dir}/contrib/icu" includes="build.xml"/>
</subant> </subant>
</target> </target>
<target name="compile-analyzers-common">
<subant target="compile">
<fileset dir="${common.dir}/contrib/analyzers/common" includes="build.xml"/>
</subant>
</target>
<target name="compile-memory"> <target name="compile-memory">
<subant target="compile"> <subant target="compile">
<fileset dir="${common.dir}/contrib/memory" includes="build.xml"/> <fileset dir="${common.dir}/contrib/memory" includes="build.xml"/>

View File

@ -0,0 +1,48 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
doc.tokenized=false
doc.body.tokenized=true
docs.dir=reuters-out
log.step=1000
{ "Rounds"
-NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:true)
-ResetInputs
{ "BigramsAndUnigrams" { ReadTokens > : 10000 }
-NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:false)
-ResetInputs
{ "BigramsOnly" { ReadTokens > : 10000 }
-NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:true)
-ResetInputs
{ "FourgramsAndUnigrams" { ReadTokens > : 10000 }
-NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:false)
-ResetInputs
{ "FourgramsOnly" { ReadTokens > : 10000 }
-NewAnalyzer(standard.StandardAnalyzer)
-ResetInputs
{ "UnigramsOnly" { ReadTokens > : 10000 }
NewRound
} : 5
RepSumByNameRound

View File

@ -0,0 +1,116 @@
#!/usr/bin/perl
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ------------------------------------------
# compare.shingle.benchmark.jira.tables.pl
#
# Takes as cmdline parameters two JIRA-formatted benchmark results, as produced
# by shingle.bm2jira.pl (located in the same directory as this script), and
# outputs a third JIRA-formatted comparison table.
#
# The difference is calculated as a percentage:
#
# 100 * (unpatched-elapsed - patched-elapsed / patched-elapsed)
#
# where (un)patched-elapsed values have had the no-shingle-filter
# (StandardAnalyzer) elapsed time subtracted from them.
#
#
# Example shingle.bm2jira.pl output:
# ----------------------------------
# JAVA:
# java version "1.5.0_15"
# Java(TM) 2 Runtime Environment, Standard Edition (build 1.5.0_15-b04)
# Java HotSpot(TM) 64-Bit Server VM (build 1.5.0_15-b04, mixed mode)
#
# OS:
# cygwin
# WinVistaService Pack 2
# Service Pack 26060022202561
#
# ||Max Shingle Size||Unigrams?||Elapsed||
# |1 (Unigrams)|yes|2.19s|
# |2|no|4.74s|
# |2|yes|4.90s|
# |4|no|5.82s|
# |4|yes|5.97s|
use strict;
use warnings;
my $usage = "Usage: $0 <unpatched-file> <patched-file>\n";
die $usage unless ($#ARGV == 1 && -f $ARGV[0] && -f $ARGV[1]);
my %stats = ();
open UNPATCHED, "<$ARGV[0]" || die "ERROR opening '$ARGV[0]': $!";
my $table_encountered = 0;
my $standard_analyzer_elapsed = 0;
my %unpatched_stats = ();
my %patched_stats = ();
while (<UNPATCHED>) {
unless ($table_encountered) {
if (/\Q||Max Shingle Size||Unigrams?||Elapsed||\E/) {
$table_encountered = 1;
} else {
print;
}
} elsif (/\|([^|]+)\|([^|]+)\|([\d.]+)s\|/) {
my $max_shingle_size = $1;
my $output_unigrams = $2;
my $elapsed = $3;
if ($max_shingle_size =~ /Unigrams/) {
$standard_analyzer_elapsed = $elapsed;
} else {
$unpatched_stats{$max_shingle_size}{$output_unigrams} = $elapsed;
}
}
}
close UNPATCHED;
open PATCHED, "<$ARGV[1]" || die "ERROR opening '$ARGV[1]': $!";
while (<PATCHED>) {
if (/\|([^|]+)\|([^|]+)\|([\d.]+)s\|/) {
my $max_shingle_size = $1;
my $output_unigrams = $2;
my $elapsed = $3;
if ($max_shingle_size =~ /Unigrams/) {
$standard_analyzer_elapsed = $elapsed
if ($elapsed < $standard_analyzer_elapsed);
} else {
$patched_stats{$max_shingle_size}{$output_unigrams} = $elapsed;
}
}
}
close PATCHED;
print "||Max Shingle Size||Unigrams?||Unpatched||Patched||StandardAnalyzer||Improvement||\n";
for my $max_shingle_size (sort { $a <=> $b } keys %unpatched_stats) {
for my $output_unigrams (sort keys %{$unpatched_stats{$max_shingle_size}}) {
my $improvement
= ( $unpatched_stats{$max_shingle_size}{$output_unigrams}
- $patched_stats{$max_shingle_size}{$output_unigrams})
/ ( $patched_stats{$max_shingle_size}{$output_unigrams}
- $standard_analyzer_elapsed);
$improvement = int($improvement * 1000 + .5) / 10; # Round and truncate
printf "|$max_shingle_size|$output_unigrams"
."|$unpatched_stats{$max_shingle_size}{$output_unigrams}s"
."|$patched_stats{$max_shingle_size}{$output_unigrams}s"
."|${standard_analyzer_elapsed}s|%2.1f%%|\n", $improvement;
}
}

View File

@ -0,0 +1,73 @@
#!/usr/bin/perl
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ----------
# shingle.bm2jira.pl
#
# Converts Lucene contrib-benchmark output produced using the
# conf/shingle.alg file into a JIRA-formatted table.
#
use strict;
use warnings;
my %min_elapsed = ();
#Operation round runCnt recsPerRun rec/s elapsedSec avgUsedMem avgTotalMem
#BigramsAndUnigrams 0 1 255691 21,147.22 12.09 15,501,840 35,061,760
#BigramsOnly - - 0 - - 1 - - 127383 - 16,871.92 7.55 - 31,725,312 41,746,432
#FourgramsAndUnigrams
#FourgramsOnly
#UnigramsOnly
while (<>) {
if (/^((?:Uni|Bi|Four)grams\S+)[-\s]*([^\s{].*)/) {
my $operation = $1;
my $stats = $2;
my $max_shingle_size
= ($operation =~ /^Bigrams/ ? 2 : $operation =~ /^Unigrams/ ? 1 : 4);
my $output_unigrams
= ($operation =~ /(?:AndUnigrams|UnigramsOnly)$/ ? 'yes' : 'no');
my ($elapsed) = $stats =~ /(?:[\d,.]+[-\s]*){4}([.\d]+)/;
$min_elapsed{$max_shingle_size}{$output_unigrams} = $elapsed
unless (defined($min_elapsed{$max_shingle_size}{$output_unigrams})
&& $elapsed >= $min_elapsed{$max_shingle_size}{$output_unigrams});
}
}
# Print out platform info
print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
if ($^O =~ /win/i) {
print "$^O\n";
eval {
require Win32;
print Win32::GetOSName(), "\n", Win32::GetOSVersion(), "\n";
};
die "Error loading Win32: $@" if ($@);
} else {
print `uname -a 2>&1`;
}
print "\n||Max Shingle Size||Unigrams?||Elapsed||\n";
for my $max_shingle_size (sort { $a <=> $b } keys %min_elapsed) {
for my $output_unigrams (sort keys %{$min_elapsed{$max_shingle_size}}) {
my $size = (1 == $max_shingle_size ? '1 (Unigrams)' : $max_shingle_size);
printf "|$size|$output_unigrams|\%2.2fs|\n",
$min_elapsed{$max_shingle_size}{$output_unigrams};
}
}

View File

@ -0,0 +1,117 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.StringTokenizer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.util.Version;
/**
* Task to support benchmarking ShingleFilter / ShingleAnalyzerWrapper
* <p>
* <ul>
* <li> <code>NewShingleAnalyzer</code> (constructs with all defaults)
* <li> <code>NewShingleAnalyzer(analyzer:o.a.l.analysis.StandardAnalyzer,maxShingleSize:2,outputUnigrams:true)</code>
* </ul>
* </p>
*/
public class NewShingleAnalyzerTask extends PerfTask {
private String analyzerClassName = "standard.StandardAnalyzer";
private static final String shingleAnalyzerClassName
= "org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper";
private int maxShingleSize = 2;
private boolean outputUnigrams = true;
public NewShingleAnalyzerTask(PerfRunData runData) {
super(runData);
}
private void setAnalyzer() throws Exception {
Class<? extends Analyzer> clazz = null;
Analyzer wrappedAnalyzer;
try {
if (analyzerClassName == null || analyzerClassName.equals("")) {
analyzerClassName
= "org.apache.lucene.analysis.standard.StandardAnalyzer";
}
if (analyzerClassName.indexOf(".") == -1
|| analyzerClassName.startsWith("standard.")) {
//there is no package name, assume o.a.l.analysis
analyzerClassName = "org.apache.lucene.analysis." + analyzerClassName;
}
clazz = Class.forName(analyzerClassName).asSubclass(Analyzer.class);
// first try to use a ctor with version parameter (needed for many new
// Analyzers that have no default one anymore)
Constructor<? extends Analyzer> ctor = clazz.getConstructor(Version.class);
wrappedAnalyzer = ctor.newInstance(Version.LUCENE_CURRENT);
} catch (NoSuchMethodException e) {
// otherwise use default ctor
wrappedAnalyzer = clazz.newInstance();
}
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(wrappedAnalyzer, maxShingleSize);
analyzer.setOutputUnigrams(outputUnigrams);
getRunData().setAnalyzer(analyzer);
}
@Override
public int doLogic() throws Exception {
try {
setAnalyzer();
System.out.println
("Changed Analyzer to: ShingleAnalyzerWrapper, wrapping ShingleFilter over"
+ analyzerClassName);
} catch (Exception e) {
throw new RuntimeException("Error creating Analyzer", e);
}
return 1;
}
@Override
public void setParams(String params) {
super.setParams(params);
StringTokenizer st = new StringTokenizer(params, ",");
while (st.hasMoreTokens()) {
String param = st.nextToken();
StringTokenizer expr = new StringTokenizer(param, ":");
String key = expr.nextToken();
String value = expr.nextToken();
if (key.equalsIgnoreCase("analyzer")) {
analyzerClassName = value;
} else if (key.equalsIgnoreCase("outputUnigrams")) {
outputUnigrams = Boolean.parseBoolean(value);
} else if (key.equalsIgnoreCase("maxShingleSize")) {
maxShingleSize = (int)Double.parseDouble(value);
} else {
throw new RuntimeException("Unknown parameter " + param);
}
}
}
@Override
public boolean supportsParams() {
return true;
}
}

View File

@ -975,6 +975,79 @@ public class TestPerfTasksLogic extends LuceneTestCase {
return algLines; return algLines;
} }
/**
* Test that we can create ShingleAnalyzerWrappers.
*/
public void testShingleAnalyzer() throws Exception {
String text = "one,two,three, four five six";
// Default analyzer, maxShingleSize, and outputUnigrams
Benchmark benchmark = execBenchmark(getShingleConfig(""));
TokenStream stream = benchmark.getRunData().getAnalyzer().tokenStream
("bogus", new StringReader(text));
assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
new String[] {"one", "one two", "two", "two three",
"three", "three four", "four", "four five",
"five", "five six", "six"});
// Default analyzer, maxShingleSize = 3, and outputUnigrams = false
benchmark = execBenchmark
(getShingleConfig("maxShingleSize:3,outputUnigrams:false"));
assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
new String[] { "one two", "one two three", "two three",
"two three four", "three four",
"three four five", "four five",
"four five six", "five six" });
// WhitespaceAnalyzer, default maxShingleSize and outputUnigrams
benchmark = execBenchmark
(getShingleConfig("analyzer:WhitespaceAnalyzer"));
assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
new String[] { "one,two,three,", "one,two,three, four",
"four", "four five", "five", "five six",
"six" });
// WhitespaceAnalyzer, maxShingleSize=3 and outputUnigrams=false
benchmark = execBenchmark
(getShingleConfig
("outputUnigrams:false,maxShingleSize:3,analyzer:WhitespaceAnalyzer"));
assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
new String[] { "one,two,three, four",
"one,two,three, four five",
"four five", "four five six",
"five six" });
}
private void assertEqualShingle
(Analyzer analyzer, String text, String[] expected) throws Exception {
TokenStream stream = analyzer.tokenStream("bogus", new StringReader(text));
stream.reset();
TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
int termNum = 0;
while (stream.incrementToken()) {
assertTrue("Extra output term(s), starting with '"
+ new String(termAtt.termBuffer(), 0, termAtt.termLength()) + "'",
termNum < expected.length);
assertEquals("Mismatch in output term # " + termNum + " - ",
expected[termNum],
new String(termAtt.termBuffer(), 0, termAtt.termLength()));
++termNum;
}
assertEquals("Too few output terms", expected.length, termNum);
stream.close();
}
private static String[] getShingleConfig(String params) {
String algLines[] = {
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.forever=false",
"directory=RAMDirectory",
"NewShingleAnalyzer(" + params + ")",
"CreateIndex",
"{ \"AddDocs\" AddDoc > : * "
};
return algLines;
}
private static String getReuters20LinesFile() { private static String getReuters20LinesFile() {
return System.getProperty("lucene.common.dir").replace('\\','/') + return System.getProperty("lucene.common.dir").replace('\\','/') +
"/contrib/benchmark/src/test/org/apache/lucene/benchmark/reuters.first20.lines.txt"; "/contrib/benchmark/src/test/org/apache/lucene/benchmark/reuters.first20.lines.txt";