mirror of https://github.com/apache/lucene.git
LUCENE-2223: add a ShingleFilter benchmark
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@904371 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1e9cdab0f9
commit
d7dada3c3a
|
@ -2,6 +2,11 @@ Lucene Benchmark Contrib Change Log
|
||||||
|
|
||||||
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
|
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
|
||||||
|
|
||||||
|
1/28/2010
|
||||||
|
LUCENE-2223: Add a benchmark for ShingleFilter. You can wrap any
|
||||||
|
analyzer with ShingleAnalyzerWrapper and specify shingle parameters
|
||||||
|
with the NewShingleAnalyzer task. (Steven Rowe via Robert Muir)
|
||||||
|
|
||||||
1/14/2010
|
1/14/2010
|
||||||
LUCENE-2210: TrecTopicsReader now properly reads descriptions and
|
LUCENE-2210: TrecTopicsReader now properly reads descriptions and
|
||||||
narratives from trec topics files. (Robert Muir)
|
narratives from trec topics files. (Robert Muir)
|
||||||
|
|
|
@ -131,6 +131,7 @@
|
||||||
<pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
|
<pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
|
||||||
<pathelement path="${common.dir}/build/contrib/memory/classes/java"/>
|
<pathelement path="${common.dir}/build/contrib/memory/classes/java"/>
|
||||||
<pathelement path="${common.dir}/build/contrib/fast-vector-highlighter/classes/java"/>
|
<pathelement path="${common.dir}/build/contrib/fast-vector-highlighter/classes/java"/>
|
||||||
|
<pathelement path="${common.dir}/build/contrib/analyzers/common/classes/java"/>
|
||||||
<fileset dir="lib">
|
<fileset dir="lib">
|
||||||
<include name="**/*.jar"/>
|
<include name="**/*.jar"/>
|
||||||
</fileset>
|
</fileset>
|
||||||
|
@ -192,6 +193,32 @@
|
||||||
<echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
|
<echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
<property name="shingle.alg.file" location="conf/shingle.alg"/>
|
||||||
|
<property name="shingle.output.file"
|
||||||
|
value="${working.dir}/shingle.benchmark.output.txt"/>
|
||||||
|
<property name="shingle.jira.output.file"
|
||||||
|
value="${working.dir}/shingle.bm2jira.output.txt"/>
|
||||||
|
|
||||||
|
<path id="shingle.runtime.classpath">
|
||||||
|
<path refid="run.classpath"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<target name="shingle" depends="compile,compile-analyzers-common,get-files">
|
||||||
|
<echo>Running contrib/benchmark with alg file: ${shingle.alg.file}</echo>
|
||||||
|
<java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark"
|
||||||
|
maxmemory="${task.mem}" output="${shingle.output.file}">
|
||||||
|
<classpath refid="run.classpath"/>
|
||||||
|
<arg file="${shingle.alg.file}"/>
|
||||||
|
</java>
|
||||||
|
<echo>Benchmark output is in file: ${shingle.output.file}</echo>
|
||||||
|
<echo>Converting to JIRA table format...</echo>
|
||||||
|
<exec executable="perl" output="${shingle.jira.output.file}" failonerror="true">
|
||||||
|
<arg value="scripts/shingle.bm2jira.pl"/>
|
||||||
|
<arg value="${shingle.output.file}"/>
|
||||||
|
</exec>
|
||||||
|
<echo>Benchmark output in JIRA table format is in file: ${shingle.jira.output.file}</echo>
|
||||||
|
</target>
|
||||||
|
|
||||||
<target name="compile-demo">
|
<target name="compile-demo">
|
||||||
<subant target="compile-demo">
|
<subant target="compile-demo">
|
||||||
<fileset dir="${common.dir}" includes="build.xml"/>
|
<fileset dir="${common.dir}" includes="build.xml"/>
|
||||||
|
@ -207,6 +234,11 @@
|
||||||
<fileset dir="${common.dir}/contrib/icu" includes="build.xml"/>
|
<fileset dir="${common.dir}/contrib/icu" includes="build.xml"/>
|
||||||
</subant>
|
</subant>
|
||||||
</target>
|
</target>
|
||||||
|
<target name="compile-analyzers-common">
|
||||||
|
<subant target="compile">
|
||||||
|
<fileset dir="${common.dir}/contrib/analyzers/common" includes="build.xml"/>
|
||||||
|
</subant>
|
||||||
|
</target>
|
||||||
<target name="compile-memory">
|
<target name="compile-memory">
|
||||||
<subant target="compile">
|
<subant target="compile">
|
||||||
<fileset dir="${common.dir}/contrib/memory" includes="build.xml"/>
|
<fileset dir="${common.dir}/contrib/memory" includes="build.xml"/>
|
||||||
|
|
|
@ -0,0 +1,48 @@
|
||||||
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
# contributor license agreements. See the NOTICE file distributed with
|
||||||
|
# this work for additional information regarding copyright ownership.
|
||||||
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
# (the "License"); you may not use this file except in compliance with
|
||||||
|
# the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
|
||||||
|
doc.tokenized=false
|
||||||
|
doc.body.tokenized=true
|
||||||
|
docs.dir=reuters-out
|
||||||
|
log.step=1000
|
||||||
|
|
||||||
|
{ "Rounds"
|
||||||
|
|
||||||
|
-NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:true)
|
||||||
|
-ResetInputs
|
||||||
|
{ "BigramsAndUnigrams" { ReadTokens > : 10000 }
|
||||||
|
|
||||||
|
-NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:false)
|
||||||
|
-ResetInputs
|
||||||
|
{ "BigramsOnly" { ReadTokens > : 10000 }
|
||||||
|
|
||||||
|
-NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:true)
|
||||||
|
-ResetInputs
|
||||||
|
{ "FourgramsAndUnigrams" { ReadTokens > : 10000 }
|
||||||
|
|
||||||
|
-NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:false)
|
||||||
|
-ResetInputs
|
||||||
|
{ "FourgramsOnly" { ReadTokens > : 10000 }
|
||||||
|
|
||||||
|
-NewAnalyzer(standard.StandardAnalyzer)
|
||||||
|
-ResetInputs
|
||||||
|
{ "UnigramsOnly" { ReadTokens > : 10000 }
|
||||||
|
|
||||||
|
NewRound
|
||||||
|
|
||||||
|
} : 5
|
||||||
|
|
||||||
|
RepSumByNameRound
|
|
@ -0,0 +1,116 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
#
|
||||||
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
# contributor license agreements. See the NOTICE file distributed with
|
||||||
|
# this work for additional information regarding copyright ownership.
|
||||||
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
# (the "License"); you may not use this file except in compliance with
|
||||||
|
# the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# ------------------------------------------
|
||||||
|
# compare.shingle.benchmark.jira.tables.pl
|
||||||
|
#
|
||||||
|
# Takes as cmdline parameters two JIRA-formatted benchmark results, as produced
|
||||||
|
# by shingle.bm2jira.pl (located in the same directory as this script), and
|
||||||
|
# outputs a third JIRA-formatted comparison table.
|
||||||
|
#
|
||||||
|
# The difference is calculated as a percentage:
|
||||||
|
#
|
||||||
|
# 100 * (unpatched-elapsed - patched-elapsed / patched-elapsed)
|
||||||
|
#
|
||||||
|
# where (un)patched-elapsed values have had the no-shingle-filter
|
||||||
|
# (StandardAnalyzer) elapsed time subtracted from them.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Example shingle.bm2jira.pl output:
|
||||||
|
# ----------------------------------
|
||||||
|
# JAVA:
|
||||||
|
# java version "1.5.0_15"
|
||||||
|
# Java(TM) 2 Runtime Environment, Standard Edition (build 1.5.0_15-b04)
|
||||||
|
# Java HotSpot(TM) 64-Bit Server VM (build 1.5.0_15-b04, mixed mode)
|
||||||
|
#
|
||||||
|
# OS:
|
||||||
|
# cygwin
|
||||||
|
# WinVistaService Pack 2
|
||||||
|
# Service Pack 26060022202561
|
||||||
|
#
|
||||||
|
# ||Max Shingle Size||Unigrams?||Elapsed||
|
||||||
|
# |1 (Unigrams)|yes|2.19s|
|
||||||
|
# |2|no|4.74s|
|
||||||
|
# |2|yes|4.90s|
|
||||||
|
# |4|no|5.82s|
|
||||||
|
# |4|yes|5.97s|
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use warnings;
|
||||||
|
|
||||||
|
my $usage = "Usage: $0 <unpatched-file> <patched-file>\n";
|
||||||
|
|
||||||
|
die $usage unless ($#ARGV == 1 && -f $ARGV[0] && -f $ARGV[1]);
|
||||||
|
|
||||||
|
my %stats = ();
|
||||||
|
|
||||||
|
open UNPATCHED, "<$ARGV[0]" || die "ERROR opening '$ARGV[0]': $!";
|
||||||
|
my $table_encountered = 0;
|
||||||
|
my $standard_analyzer_elapsed = 0;
|
||||||
|
my %unpatched_stats = ();
|
||||||
|
my %patched_stats = ();
|
||||||
|
while (<UNPATCHED>) {
|
||||||
|
unless ($table_encountered) {
|
||||||
|
if (/\Q||Max Shingle Size||Unigrams?||Elapsed||\E/) {
|
||||||
|
$table_encountered = 1;
|
||||||
|
} else {
|
||||||
|
print;
|
||||||
|
}
|
||||||
|
} elsif (/\|([^|]+)\|([^|]+)\|([\d.]+)s\|/) {
|
||||||
|
my $max_shingle_size = $1;
|
||||||
|
my $output_unigrams = $2;
|
||||||
|
my $elapsed = $3;
|
||||||
|
if ($max_shingle_size =~ /Unigrams/) {
|
||||||
|
$standard_analyzer_elapsed = $elapsed;
|
||||||
|
} else {
|
||||||
|
$unpatched_stats{$max_shingle_size}{$output_unigrams} = $elapsed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close UNPATCHED;
|
||||||
|
|
||||||
|
open PATCHED, "<$ARGV[1]" || die "ERROR opening '$ARGV[1]': $!";
|
||||||
|
while (<PATCHED>) {
|
||||||
|
if (/\|([^|]+)\|([^|]+)\|([\d.]+)s\|/) {
|
||||||
|
my $max_shingle_size = $1;
|
||||||
|
my $output_unigrams = $2;
|
||||||
|
my $elapsed = $3;
|
||||||
|
if ($max_shingle_size =~ /Unigrams/) {
|
||||||
|
$standard_analyzer_elapsed = $elapsed
|
||||||
|
if ($elapsed < $standard_analyzer_elapsed);
|
||||||
|
} else {
|
||||||
|
$patched_stats{$max_shingle_size}{$output_unigrams} = $elapsed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close PATCHED;
|
||||||
|
|
||||||
|
print "||Max Shingle Size||Unigrams?||Unpatched||Patched||StandardAnalyzer||Improvement||\n";
|
||||||
|
for my $max_shingle_size (sort { $a <=> $b } keys %unpatched_stats) {
|
||||||
|
for my $output_unigrams (sort keys %{$unpatched_stats{$max_shingle_size}}) {
|
||||||
|
my $improvement
|
||||||
|
= ( $unpatched_stats{$max_shingle_size}{$output_unigrams}
|
||||||
|
- $patched_stats{$max_shingle_size}{$output_unigrams})
|
||||||
|
/ ( $patched_stats{$max_shingle_size}{$output_unigrams}
|
||||||
|
- $standard_analyzer_elapsed);
|
||||||
|
$improvement = int($improvement * 1000 + .5) / 10; # Round and truncate
|
||||||
|
printf "|$max_shingle_size|$output_unigrams"
|
||||||
|
."|$unpatched_stats{$max_shingle_size}{$output_unigrams}s"
|
||||||
|
."|$patched_stats{$max_shingle_size}{$output_unigrams}s"
|
||||||
|
."|${standard_analyzer_elapsed}s|%2.1f%%|\n", $improvement;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,73 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
#
|
||||||
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
# contributor license agreements. See the NOTICE file distributed with
|
||||||
|
# this work for additional information regarding copyright ownership.
|
||||||
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
# (the "License"); you may not use this file except in compliance with
|
||||||
|
# the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# ----------
|
||||||
|
# shingle.bm2jira.pl
|
||||||
|
#
|
||||||
|
# Converts Lucene contrib-benchmark output produced using the
|
||||||
|
# conf/shingle.alg file into a JIRA-formatted table.
|
||||||
|
#
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use warnings;
|
||||||
|
|
||||||
|
my %min_elapsed = ();
|
||||||
|
|
||||||
|
#Operation round runCnt recsPerRun rec/s elapsedSec avgUsedMem avgTotalMem
|
||||||
|
#BigramsAndUnigrams 0 1 255691 21,147.22 12.09 15,501,840 35,061,760
|
||||||
|
#BigramsOnly - - 0 - - 1 - - 127383 - 16,871.92 7.55 - 31,725,312 41,746,432
|
||||||
|
#FourgramsAndUnigrams
|
||||||
|
#FourgramsOnly
|
||||||
|
#UnigramsOnly
|
||||||
|
|
||||||
|
while (<>) {
|
||||||
|
if (/^((?:Uni|Bi|Four)grams\S+)[-\s]*([^\s{].*)/) {
|
||||||
|
my $operation = $1;
|
||||||
|
my $stats = $2;
|
||||||
|
my $max_shingle_size
|
||||||
|
= ($operation =~ /^Bigrams/ ? 2 : $operation =~ /^Unigrams/ ? 1 : 4);
|
||||||
|
my $output_unigrams
|
||||||
|
= ($operation =~ /(?:AndUnigrams|UnigramsOnly)$/ ? 'yes' : 'no');
|
||||||
|
my ($elapsed) = $stats =~ /(?:[\d,.]+[-\s]*){4}([.\d]+)/;
|
||||||
|
$min_elapsed{$max_shingle_size}{$output_unigrams} = $elapsed
|
||||||
|
unless (defined($min_elapsed{$max_shingle_size}{$output_unigrams})
|
||||||
|
&& $elapsed >= $min_elapsed{$max_shingle_size}{$output_unigrams});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Print out platform info
|
||||||
|
print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
|
||||||
|
if ($^O =~ /win/i) {
|
||||||
|
print "$^O\n";
|
||||||
|
eval {
|
||||||
|
require Win32;
|
||||||
|
print Win32::GetOSName(), "\n", Win32::GetOSVersion(), "\n";
|
||||||
|
};
|
||||||
|
die "Error loading Win32: $@" if ($@);
|
||||||
|
} else {
|
||||||
|
print `uname -a 2>&1`;
|
||||||
|
}
|
||||||
|
|
||||||
|
print "\n||Max Shingle Size||Unigrams?||Elapsed||\n";
|
||||||
|
|
||||||
|
for my $max_shingle_size (sort { $a <=> $b } keys %min_elapsed) {
|
||||||
|
for my $output_unigrams (sort keys %{$min_elapsed{$max_shingle_size}}) {
|
||||||
|
my $size = (1 == $max_shingle_size ? '1 (Unigrams)' : $max_shingle_size);
|
||||||
|
printf "|$size|$output_unigrams|\%2.2fs|\n",
|
||||||
|
$min_elapsed{$max_shingle_size}{$output_unigrams};
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,117 @@
|
||||||
|
package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.lang.reflect.Constructor;
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
import java.util.StringTokenizer;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
|
||||||
|
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||||
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Task to support benchmarking ShingleFilter / ShingleAnalyzerWrapper
|
||||||
|
* <p>
|
||||||
|
* <ul>
|
||||||
|
* <li> <code>NewShingleAnalyzer</code> (constructs with all defaults)
|
||||||
|
* <li> <code>NewShingleAnalyzer(analyzer:o.a.l.analysis.StandardAnalyzer,maxShingleSize:2,outputUnigrams:true)</code>
|
||||||
|
* </ul>
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public class NewShingleAnalyzerTask extends PerfTask {
|
||||||
|
|
||||||
|
private String analyzerClassName = "standard.StandardAnalyzer";
|
||||||
|
private static final String shingleAnalyzerClassName
|
||||||
|
= "org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper";
|
||||||
|
private int maxShingleSize = 2;
|
||||||
|
private boolean outputUnigrams = true;
|
||||||
|
|
||||||
|
public NewShingleAnalyzerTask(PerfRunData runData) {
|
||||||
|
super(runData);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void setAnalyzer() throws Exception {
|
||||||
|
Class<? extends Analyzer> clazz = null;
|
||||||
|
Analyzer wrappedAnalyzer;
|
||||||
|
try {
|
||||||
|
if (analyzerClassName == null || analyzerClassName.equals("")) {
|
||||||
|
analyzerClassName
|
||||||
|
= "org.apache.lucene.analysis.standard.StandardAnalyzer";
|
||||||
|
}
|
||||||
|
if (analyzerClassName.indexOf(".") == -1
|
||||||
|
|| analyzerClassName.startsWith("standard.")) {
|
||||||
|
//there is no package name, assume o.a.l.analysis
|
||||||
|
analyzerClassName = "org.apache.lucene.analysis." + analyzerClassName;
|
||||||
|
}
|
||||||
|
clazz = Class.forName(analyzerClassName).asSubclass(Analyzer.class);
|
||||||
|
// first try to use a ctor with version parameter (needed for many new
|
||||||
|
// Analyzers that have no default one anymore)
|
||||||
|
Constructor<? extends Analyzer> ctor = clazz.getConstructor(Version.class);
|
||||||
|
wrappedAnalyzer = ctor.newInstance(Version.LUCENE_CURRENT);
|
||||||
|
} catch (NoSuchMethodException e) {
|
||||||
|
// otherwise use default ctor
|
||||||
|
wrappedAnalyzer = clazz.newInstance();
|
||||||
|
}
|
||||||
|
ShingleAnalyzerWrapper analyzer
|
||||||
|
= new ShingleAnalyzerWrapper(wrappedAnalyzer, maxShingleSize);
|
||||||
|
analyzer.setOutputUnigrams(outputUnigrams);
|
||||||
|
getRunData().setAnalyzer(analyzer);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int doLogic() throws Exception {
|
||||||
|
try {
|
||||||
|
setAnalyzer();
|
||||||
|
System.out.println
|
||||||
|
("Changed Analyzer to: ShingleAnalyzerWrapper, wrapping ShingleFilter over"
|
||||||
|
+ analyzerClassName);
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException("Error creating Analyzer", e);
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setParams(String params) {
|
||||||
|
super.setParams(params);
|
||||||
|
StringTokenizer st = new StringTokenizer(params, ",");
|
||||||
|
while (st.hasMoreTokens()) {
|
||||||
|
String param = st.nextToken();
|
||||||
|
StringTokenizer expr = new StringTokenizer(param, ":");
|
||||||
|
String key = expr.nextToken();
|
||||||
|
String value = expr.nextToken();
|
||||||
|
if (key.equalsIgnoreCase("analyzer")) {
|
||||||
|
analyzerClassName = value;
|
||||||
|
} else if (key.equalsIgnoreCase("outputUnigrams")) {
|
||||||
|
outputUnigrams = Boolean.parseBoolean(value);
|
||||||
|
} else if (key.equalsIgnoreCase("maxShingleSize")) {
|
||||||
|
maxShingleSize = (int)Double.parseDouble(value);
|
||||||
|
} else {
|
||||||
|
throw new RuntimeException("Unknown parameter " + param);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean supportsParams() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
|
@ -975,6 +975,79 @@ public class TestPerfTasksLogic extends LuceneTestCase {
|
||||||
return algLines;
|
return algLines;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test that we can create ShingleAnalyzerWrappers.
|
||||||
|
*/
|
||||||
|
public void testShingleAnalyzer() throws Exception {
|
||||||
|
String text = "one,two,three, four five six";
|
||||||
|
|
||||||
|
// Default analyzer, maxShingleSize, and outputUnigrams
|
||||||
|
Benchmark benchmark = execBenchmark(getShingleConfig(""));
|
||||||
|
TokenStream stream = benchmark.getRunData().getAnalyzer().tokenStream
|
||||||
|
("bogus", new StringReader(text));
|
||||||
|
assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
|
||||||
|
new String[] {"one", "one two", "two", "two three",
|
||||||
|
"three", "three four", "four", "four five",
|
||||||
|
"five", "five six", "six"});
|
||||||
|
// Default analyzer, maxShingleSize = 3, and outputUnigrams = false
|
||||||
|
benchmark = execBenchmark
|
||||||
|
(getShingleConfig("maxShingleSize:3,outputUnigrams:false"));
|
||||||
|
assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
|
||||||
|
new String[] { "one two", "one two three", "two three",
|
||||||
|
"two three four", "three four",
|
||||||
|
"three four five", "four five",
|
||||||
|
"four five six", "five six" });
|
||||||
|
// WhitespaceAnalyzer, default maxShingleSize and outputUnigrams
|
||||||
|
benchmark = execBenchmark
|
||||||
|
(getShingleConfig("analyzer:WhitespaceAnalyzer"));
|
||||||
|
assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
|
||||||
|
new String[] { "one,two,three,", "one,two,three, four",
|
||||||
|
"four", "four five", "five", "five six",
|
||||||
|
"six" });
|
||||||
|
|
||||||
|
// WhitespaceAnalyzer, maxShingleSize=3 and outputUnigrams=false
|
||||||
|
benchmark = execBenchmark
|
||||||
|
(getShingleConfig
|
||||||
|
("outputUnigrams:false,maxShingleSize:3,analyzer:WhitespaceAnalyzer"));
|
||||||
|
assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
|
||||||
|
new String[] { "one,two,three, four",
|
||||||
|
"one,two,three, four five",
|
||||||
|
"four five", "four five six",
|
||||||
|
"five six" });
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertEqualShingle
|
||||||
|
(Analyzer analyzer, String text, String[] expected) throws Exception {
|
||||||
|
TokenStream stream = analyzer.tokenStream("bogus", new StringReader(text));
|
||||||
|
stream.reset();
|
||||||
|
TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
|
||||||
|
int termNum = 0;
|
||||||
|
while (stream.incrementToken()) {
|
||||||
|
assertTrue("Extra output term(s), starting with '"
|
||||||
|
+ new String(termAtt.termBuffer(), 0, termAtt.termLength()) + "'",
|
||||||
|
termNum < expected.length);
|
||||||
|
assertEquals("Mismatch in output term # " + termNum + " - ",
|
||||||
|
expected[termNum],
|
||||||
|
new String(termAtt.termBuffer(), 0, termAtt.termLength()));
|
||||||
|
++termNum;
|
||||||
|
}
|
||||||
|
assertEquals("Too few output terms", expected.length, termNum);
|
||||||
|
stream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String[] getShingleConfig(String params) {
|
||||||
|
String algLines[] = {
|
||||||
|
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
|
||||||
|
"docs.file=" + getReuters20LinesFile(),
|
||||||
|
"content.source.forever=false",
|
||||||
|
"directory=RAMDirectory",
|
||||||
|
"NewShingleAnalyzer(" + params + ")",
|
||||||
|
"CreateIndex",
|
||||||
|
"{ \"AddDocs\" AddDoc > : * "
|
||||||
|
};
|
||||||
|
return algLines;
|
||||||
|
}
|
||||||
|
|
||||||
private static String getReuters20LinesFile() {
|
private static String getReuters20LinesFile() {
|
||||||
return System.getProperty("lucene.common.dir").replace('\\','/') +
|
return System.getProperty("lucene.common.dir").replace('\\','/') +
|
||||||
"/contrib/benchmark/src/test/org/apache/lucene/benchmark/reuters.first20.lines.txt";
|
"/contrib/benchmark/src/test/org/apache/lucene/benchmark/reuters.first20.lines.txt";
|
||||||
|
|
Loading…
Reference in New Issue