LUCENE-1128 and 1129: Add highlighting support to benchmarking, plus fix minor traversalSize bug in ReadTask, also added a few new algorithms to try out

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@614885 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2008-01-24 14:39:44 +00:00
parent f75f490eb9
commit 1183763dbe
12 changed files with 616 additions and 31 deletions

View File

@ -4,6 +4,11 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$ $Id:$
1/23/2008
LUCENE-1129: ReadTask properly uses the traversalSize value
LUCENE-1128: Added support for benchmarking the highlighter
01/20/08 01/20/08
LUCENE-1139: various fixes LUCENE-1139: various fixes
- add merge.scheduler, merge.policy config properties - add merge.scheduler, merge.policy config properties
@ -12,6 +17,7 @@ $Id:$
- OptimizeTask now takes int param to call optimize(int maxNumSegments) - OptimizeTask now takes int param to call optimize(int maxNumSegments)
- CloseIndexTask now takes bool param to call close(false) (abort running merges) - CloseIndexTask now takes bool param to call close(false) (abort running merges)
01/03/08 01/03/08
LUCENE-1116: quality package improvements: LUCENE-1116: quality package improvements:
- add MRR computation; - add MRR computation;

View File

@ -109,6 +109,7 @@
<path id="classpath"> <path id="classpath">
<pathelement path="${common.dir}/build/classes/java"/> <pathelement path="${common.dir}/build/classes/java"/>
<pathelement path="${common.dir}/build/classes/demo"/> <pathelement path="${common.dir}/build/classes/demo"/>
<pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
<pathelement path="${basedir}/lib/${digester.jar}"/> <pathelement path="${basedir}/lib/${digester.jar}"/>
<pathelement path="${basedir}/lib/${collections.jar}"/> <pathelement path="${basedir}/lib/${collections.jar}"/>
<pathelement path="${basedir}/lib/${logging.jar}"/> <pathelement path="${basedir}/lib/${logging.jar}"/>
@ -163,9 +164,14 @@
<subant target="compile-demo"> <subant target="compile-demo">
<fileset dir="${common.dir}" includes="build.xml"/> <fileset dir="${common.dir}" includes="build.xml"/>
</subant> </subant>
</target> </target>
<target name="compile-highlighter">
<subant target="compile">
<fileset dir="${common.dir}/contrib/highlighter" includes="build.xml"/>
</subant>
</target>
<target name="init" depends="common.init,compile-demo,check-files"/> <target name="init" depends="common.init,compile-demo, compile-highlighter,check-files"/>
<!-- make sure online collections (reuters) are first downloaded --> <!-- make sure online collections (reuters) are first downloaded -->
<target name="test" depends="init,get-files"> <target name="test" depends="init,get-files">

View File

@ -0,0 +1,68 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.
ram.flush.mb=flush:32:32
compound=cmpnd:true:false
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory
doc.stored=true
doc.tokenized=true
doc.term.vector=true
doc.term.vector.offsets=true
doc.term.vector.positions=true
doc.add.log.step=2000
docs.dir=reuters-out
doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
# task at this depth or less would print when they start
task.max.depth.log=2
log.queries=true
# -------------------------------------------------------------------------------------
{ "Populate"
CreateIndex
{ "MAddDocs" AddDoc } : 20000
Optimize
CloseIndex
}
{ "Rounds"
ResetSystemSoft
OpenReader
{ "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[10],fields[body]) > : 1000
CloseReader
RepSumByPref MAddDocs
NewRound
} : 4
RepSumByNameRound
RepSumByName
RepSumByPrefRound MAddDocs

View File

@ -0,0 +1,69 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.
ram.flush.mb=flush:32:32
compound=cmpnd:true:false
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory
doc.stored=true
doc.tokenized=true
doc.term.vector=false
doc.term.vector.offsets=false
doc.term.vector.positions=false
doc.add.log.step=2000
docs.dir=reuters-out
doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
# task at this depth or less would print when they start
task.max.depth.log=2
log.queries=true
# -------------------------------------------------------------------------------------
{ "Populate"
CreateIndex
{ "MAddDocs" AddDoc } : 20000
Optimize
CloseIndex
}
{ "Rounds"
ResetSystemSoft
OpenReader
{ "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
CloseReader
OpenReader
{ "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
CloseReader
RepSumByPref SearchHlgtSameRdr
NewRound
} : 2
RepSumByNameRound
RepSumByName
RepSumByPrefRound MAddDocs

View File

@ -0,0 +1,69 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.
ram.flush.mb=flush:32:32
compound=cmpnd:true:false
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory
doc.stored=true
doc.tokenized=true
doc.term.vector=true
doc.term.vector.offsets=true
doc.term.vector.positions=true
doc.add.log.step=2000
docs.dir=reuters-out
doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
# task at this depth or less would print when they start
task.max.depth.log=2
log.queries=true
# -------------------------------------------------------------------------------------
{ "Populate"
CreateIndex
{ "MAddDocs" AddDoc } : 20000
Optimize
CloseIndex
}
{ "Rounds"
ResetSystemSoft
OpenReader
{ "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
CloseReader
OpenReader
{ "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
CloseReader
RepSumByPref SearchHlgtSameRdr
NewRound
} : 2
RepSumByNameRound
RepSumByName
RepSumByPrefRound MAddDocs

View File

@ -247,6 +247,10 @@ The following is an informal description of the supported syntax.
<li><b>SearchTravRetLoadFieldSelectorTask</b> takes a string <li><b>SearchTravRetLoadFieldSelectorTask</b> takes a string
parameter: a comma separated list of Fields to load. parameter: a comma separated list of Fields to load.
</li> </li>
<li><b>SearchTravRetHighlighterTask</b> takes a string
parameter: a comma separated list of parameters to define highlighting. See that
tasks javadocs for more information
</li>
</ul> </ul>
<br>Example - <font color="#FF0066">AddDoc(2000)</font> - would add a document <br>Example - <font color="#FF0066">AddDoc(2000)</font> - would add a document
of size 2000 (~bytes). of size 2000 (~bytes).

View File

@ -17,26 +17,31 @@ package org.apache.lucene.benchmark.byTask.tasks;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker; import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Hits; import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import java.io.IOException; import java.io.IOException;
import java.util.*;
/** /**
* Read index (abstract) task. * Read index (abstract) task.
* Sub classes implement withSearch(), withWarm(), withTraverse() and withRetrieve() * Sub classes implement withSearch(), withWarm(), withTraverse() and withRetrieve()
* methods to configure the actual action. * methods to configure the actual action.
* * <p/>
* <p>Note: All ReadTasks reuse the reader if it is already open. * <p>Note: All ReadTasks reuse the reader if it is already open.
* Otherwise a reader is opened at start and closed at the end. * Otherwise a reader is opened at start and closed at the end.
* * <p/>
* <p>Other side effects: none. * <p>Other side effects: none.
*/ */
public abstract class ReadTask extends PerfTask { public abstract class ReadTask extends PerfTask {
@ -48,7 +53,7 @@ public abstract class ReadTask extends PerfTask {
public int doLogic() throws Exception { public int doLogic() throws Exception {
int res = 0; int res = 0;
boolean closeReader = false; boolean closeReader = false;
// open reader or use existing one // open reader or use existing one
IndexReader ir = getRunData().getIndexReader(); IndexReader ir = getRunData().getIndexReader();
if (ir == null) { if (ir == null) {
@ -57,18 +62,18 @@ public abstract class ReadTask extends PerfTask {
closeReader = true; closeReader = true;
//res++; //this is confusing, comment it out //res++; //this is confusing, comment it out
} }
// optionally warm and add num docs traversed to count // optionally warm and add num docs traversed to count
if (withWarm()) { if (withWarm()) {
Document doc = null; Document doc = null;
for (int m = 0; m < ir.maxDoc(); m++) { for (int m = 0; m < ir.maxDoc(); m++) {
if (!ir.isDeleted(m)) { if (!ir.isDeleted(m)) {
doc = ir.document(m); doc = ir.document(m);
res += (doc==null ? 0 : 1); res += (doc == null ? 0 : 1);
} }
} }
} }
if (withSearch()) { if (withSearch()) {
res++; res++;
IndexSearcher searcher = new IndexSearcher(ir); IndexSearcher searcher = new IndexSearcher(ir);
@ -76,32 +81,53 @@ public abstract class ReadTask extends PerfTask {
Query q = queryMaker.makeQuery(); Query q = queryMaker.makeQuery();
Hits hits = searcher.search(q); Hits hits = searcher.search(q);
//System.out.println("searched: "+q); //System.out.println("searched: "+q);
if (withTraverse() && hits!=null) { if (withTraverse() && hits != null) {
int traversalSize = Math.min(hits.length(), traversalSize()); int traversalSize = Math.min(hits.length(), traversalSize());
if (traversalSize > 0) { if (traversalSize > 0) {
boolean retrieve = withRetrieve(); boolean retrieve = withRetrieve();
for (int m = 0; m < hits.length(); m++) { int numHighlight = Math.min(numToHighlight(), hits.length());
Analyzer analyzer = getRunData().getAnalyzer();
Highlighter highlighter = null;
int maxFrags = 1;
if (numHighlight > 0) {
highlighter = getHighlighter(q);
maxFrags = maxNumFragments();
}
boolean merge = isMergeContiguousFragments();
for (int m = 0; m < traversalSize; m++) {
int id = hits.id(m); int id = hits.id(m);
res++; res++;
if (retrieve) { if (retrieve) {
res += retrieveDoc(ir, id); Document document = retrieveDoc(ir, id);
res += document != null ? 1 : 0;
if (numHighlight > 0 && m < numHighlight) {
Collection/*<String>*/ fieldsToHighlight = getFieldsToHighlight(document);
for (Iterator iterator = fieldsToHighlight.iterator(); iterator.hasNext();) {
String field = (String) iterator.next();
String text = document.get(field);
TokenStream ts = TokenSources.getAnyTokenStream(ir, id, field, document, analyzer);
res += doHighlight(ts, text, highlighter, merge, maxFrags);
}
}
} }
} }
} }
} }
searcher.close(); searcher.close();
} }
if (closeReader) { if (closeReader) {
ir.close(); ir.close();
} }
return res; return res;
} }
protected int retrieveDoc(IndexReader ir, int id) throws IOException {
return (ir.document(id) == null ? 0 : 1);
protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
return ir.document(id);
} }
/** /**
@ -112,33 +138,82 @@ public abstract class ReadTask extends PerfTask {
/** /**
* Return true if search should be performed. * Return true if search should be performed.
*/ */
public abstract boolean withSearch (); public abstract boolean withSearch();
/** /**
* Return true if warming should be performed. * Return true if warming should be performed.
*/ */
public abstract boolean withWarm (); public abstract boolean withWarm();
/** /**
* Return true if, with search, results should be traversed. * Return true if, with search, results should be traversed.
*/ */
public abstract boolean withTraverse (); public abstract boolean withTraverse();
/** /**
* Specify the number of hits to traverse. Tasks should override this if they want to restrict the number * Specify the number of hits to traverse. Tasks should override this if they want to restrict the number
* of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0. * of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0.
* * <p/>
* Read task calculates the traversal as: Math.min(hits.length(), traversalSize()) * Read task calculates the traversal as: Math.min(hits.length(), traversalSize())
*
* @return Integer.MAX_VALUE * @return Integer.MAX_VALUE
*/ */
public int traversalSize() public int traversalSize() {
{
return Integer.MAX_VALUE; return Integer.MAX_VALUE;
} }
/** /**
* Return true if, with search & results traversing, docs should be retrieved. * Return true if, with search & results traversing, docs should be retrieved.
*/ */
public abstract boolean withRetrieve (); public abstract boolean withRetrieve();
/**
* Set to the number of documents to highlight.
*
* @return The number of the results to highlight. O means no docs will be highlighted.
*/
public int numToHighlight() {
return 0;
}
protected Highlighter getHighlighter(Query q){
return new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
}
/**
*
* @return the maxiumum number of highlighter fragments
*/
public int maxNumFragments(){
return 10;
}
/**
*
* @return true if the highlighter should merge contiguous fragments
*/
public boolean isMergeContiguousFragments(){
return false;
}
protected int doHighlight(TokenStream ts, String text, Highlighter highlighter, boolean mergeContiguous, int maxFragments) throws IOException {
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFragments);
return frag != null ? frag.length : 0;
}
/**
* Define the fields to highlight. Base implementation returns all fields
* @param document The Document
* @return A Collection of Field names (Strings)
*/
protected Collection/*<String>*/ getFieldsToHighlight(Document document) {
List/*<Fieldable>*/ fieldables = document.getFields();
Set/*<String>*/ result = new HashSet(fieldables.size());
for (Iterator iterator = fieldables.iterator(); iterator.hasNext();) {
Fieldable fieldable = (Fieldable) iterator.next();
result.add(fieldable.name());
}
return result;
}
} }

View File

@ -0,0 +1,126 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.Document;
import java.util.Set;
import java.util.Collection;
import java.util.HashSet;
import java.util.Collections;
/**
* Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents.
*
* Uses the {@link org.apache.lucene.search.highlight.SimpleHTMLFormatter} for formatting.
*
* <p>Note: This task reuses the reader if it is already open.
* Otherwise a reader is opened at start and closed at the end.
* </p>
*
* <p>Takes optional multivalued, comma separated param string as: size[&lt;traversal size&gt;],highlight[&lt;int&gt;],maxFrags[&lt;int&gt;],mergeContiguous[&lt;boolean&gt;],fields[name1;name2;...]</p>
* <ul>
* <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li>
* <li>highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())</li>
* <li>maxFrags - The maximum number of fragments to score by the highlighter</li>
* <li>mergeContiguous - true if contiguous fragments should be merged.</li>
* <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li>
* </ul>
* Example:
* <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
* </pre>
*
* Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well.
*
* <p>Other side effects: counts additional 1 (record) for each traversed hit,
* and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
*/
public class SearchTravRetHighlightTask extends SearchTravTask {
protected int numToHighlight = Integer.MAX_VALUE;
protected boolean mergeContiguous;
protected int maxFrags = 2;
protected Set paramFields = Collections.EMPTY_SET;
public SearchTravRetHighlightTask(PerfRunData runData) {
super(runData);
}
public void setup() throws Exception {
super.setup();
//check to make sure either the doc is being stored
PerfRunData data = getRunData();
if (data.getConfig().get("doc.stored", false) == false){
throw new Exception("doc.stored must be set to true");
}
}
public boolean withRetrieve() {
return true;
}
public int numToHighlight() {
return numToHighlight;
}
public boolean isMergeContiguousFragments() {
return mergeContiguous;
}
public int maxNumFragments() {
return maxFrags;
}
protected Collection/*<String>*/ getFieldsToHighlight(Document document) {
Collection result = super.getFieldsToHighlight(document);
//if stored is false, then result will be empty, in which case just get all the param fields
if (paramFields.isEmpty() == false && result.isEmpty() == false) {
result.retainAll(paramFields);
} else {
result = paramFields;
}
return result;
}
public void setParams(String params) {
String [] splits = params.split(",");
for (int i = 0; i < splits.length; i++) {
if (splits[i].startsWith("size[") == true){
traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1));
} else if (splits[i].startsWith("highlight[") == true){
numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1));
} else if (splits[i].startsWith("maxFrags[") == true){
maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1));
} else if (splits[i].startsWith("mergeContiguous[") == true){
mergeContiguous = Boolean.valueOf(splits[i].substring("mergeContiguous[".length(),splits[i].length() - 1)).booleanValue();
} else if (splits[i].startsWith("fields[") == true){
paramFields = new HashSet();
String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1);
String [] fieldSplits = fieldNames.split(";");
for (int j = 0; j < fieldSplits.length; j++) {
paramFields.add(fieldSplits[j]);
}
}
}
}
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.SetBasedFieldSelector; import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import java.util.StringTokenizer; import java.util.StringTokenizer;
@ -51,8 +52,8 @@ public class SearchTravRetLoadFieldSelectorTask extends SearchTravTask {
} }
protected int retrieveDoc(IndexReader ir, int id) throws IOException { protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
return (ir.document(id, fieldSelector) == null ? 0 : 1); return ir.document(id, fieldSelector);
} }
public void setParams(String params) { public void setParams(String params) {

View File

@ -120,7 +120,7 @@ public class Algorithm {
if ((char)stok.ttype == '*') { if ((char)stok.ttype == '*') {
((TaskSequence)prevTask).setRepetitions(TaskSequence.REPEAT_EXHAUST); ((TaskSequence)prevTask).setRepetitions(TaskSequence.REPEAT_EXHAUST);
} else { } else {
if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString()); if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expected repetitions number: - "+stok.toString());
((TaskSequence)prevTask).setRepetitions((int)stok.nval); ((TaskSequence)prevTask).setRepetitions((int)stok.nval);
} }
// check for rate specification (ops/min) // check for rate specification (ops/min)
@ -130,7 +130,7 @@ public class Algorithm {
} else { } else {
// get rate number // get rate number
stok.nextToken(); stok.nextToken();
if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted rate number: - "+stok.toString()); if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expected rate number: - "+stok.toString());
// check for unit - min or sec, sec is default // check for unit - min or sec, sec is default
stok.nextToken(); stok.nextToken();
if (stok.ttype!='/') { if (stok.ttype!='/') {
@ -138,14 +138,14 @@ public class Algorithm {
((TaskSequence)prevTask).setRate((int)stok.nval,false); // set rate per sec ((TaskSequence)prevTask).setRate((int)stok.nval,false); // set rate per sec
} else { } else {
stok.nextToken(); stok.nextToken();
if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expexted rate unit: 'min' or 'sec' - "+stok.toString()); if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString());
String unit = stok.sval.toLowerCase(); String unit = stok.sval.toLowerCase();
if ("min".equals(unit)) { if ("min".equals(unit)) {
((TaskSequence)prevTask).setRate((int)stok.nval,true); // set rate per min ((TaskSequence)prevTask).setRate((int)stok.nval,true); // set rate per min
} else if ("sec".equals(unit)) { } else if ("sec".equals(unit)) {
((TaskSequence)prevTask).setRate((int)stok.nval,false); // set rate per sec ((TaskSequence)prevTask).setRate((int)stok.nval,false); // set rate per sec
} else { } else {
throw new Exception("expexted rate unit: 'min' or 'sec' - "+stok.toString()); throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString());
} }
} }
} }

View File

@ -27,7 +27,9 @@ import java.util.Iterator;
import org.apache.lucene.benchmark.byTask.feeds.DocData; import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker; import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask; import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
import org.apache.lucene.benchmark.byTask.stats.TaskStats; import org.apache.lucene.benchmark.byTask.stats.TaskStats;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
@ -94,6 +96,109 @@ public class TestPerfTasksLogic extends TestCase {
ir.close(); ir.close();
} }
public void testHighlighting() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"doc.stored=true",
"doc.maker="+Reuters20DocMaker.class.getName(),
"query.maker=" + ReutersQueryMaker.class.getName(),
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 1000",
"Optimize",
"CloseIndex",
"OpenReader",
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
"CloseReader",
};
// 2. we test this value later
CountingHighlighterTestTask.numHighlightedResults = 0;
CountingHighlighterTestTask.numDocsRetrieved = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 4. test specific checks after the benchmark run completed.
assertEquals("TestSearchTask was supposed to be called!",147,CountingHighlighterTestTask.numDocsRetrieved);
//pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
//we probably should use a different doc/query maker, but...
assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
// now we should be able to open the index for write.
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false);
iw.close();
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
ir.close();
}
public void testHighlightingTV() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"doc.stored=true",//doc storage is required in order to have text to highlight
"doc.term.vector.offsets=true",
"doc.maker="+Reuters20DocMaker.class.getName(),
"query.maker=" + ReutersQueryMaker.class.getName(),
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 1000",
"Optimize",
"CloseIndex",
"OpenReader",
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
"CloseReader",
};
// 2. we test this value later
CountingHighlighterTestTask.numHighlightedResults = 0;
CountingHighlighterTestTask.numDocsRetrieved = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 4. test specific checks after the benchmark run completed.
assertEquals("TestSearchTask was supposed to be called!",147,CountingHighlighterTestTask.numDocsRetrieved);
//pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
//we probably should use a different doc/query maker, but...
assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
// now we should be able to open the index for write.
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false);
iw.close();
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
ir.close();
}
public void testHighlightingNoTvNoStore() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"doc.stored=false",
"doc.maker="+Reuters20DocMaker.class.getName(),
"query.maker=" + ReutersQueryMaker.class.getName(),
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 1000",
"Optimize",
"CloseIndex",
"OpenReader",
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
"CloseReader",
};
// 2. we test this value later
CountingHighlighterTestTask.numHighlightedResults = 0;
CountingHighlighterTestTask.numDocsRetrieved = 0;
// 3. execute the algorithm (required in every "logic" test)
try {
Benchmark benchmark = execBenchmark(algLines);
assertTrue("CountingHighlighterTest should have thrown an exception", false);
} catch (Exception e) {
assertTrue(true);
}
}
/** /**
* Test Exhasting Doc Maker logic * Test Exhasting Doc Maker logic
*/ */

View File

@ -0,0 +1,56 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.byTask.tasks;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
/**
* Test Search task which counts number of searches.
*/
public class CountingHighlighterTestTask extends SearchTravRetHighlightTask {
public static int numHighlightedResults = 0;
public static int numDocsRetrieved = 0;
public CountingHighlighterTestTask(PerfRunData runData) {
super(runData);
}
protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
Document document = ir.document(id);
if (document != null) {
numDocsRetrieved++;
}
return document;
}
protected int doHighlight(TokenStream ts, String text, Highlighter highlighter, boolean mergeContiguous, int maxFragments) throws IOException {
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFragments);
numHighlightedResults += frag != null ? frag.length : 0;
return frag != null ? frag.length : 0;
}
}