mirror of https://github.com/apache/lucene.git
LUCENE-1128 and 1129: Add highlighting support to benchmarking, plus fix minor traversalSize bug in ReadTask, also added a few new algorithms to try out
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@614885 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f75f490eb9
commit
1183763dbe
|
@ -4,6 +4,11 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
|
|||
|
||||
$Id:$
|
||||
|
||||
|
||||
1/23/2008
|
||||
LUCENE-1129: ReadTask properly uses the traversalSize value
|
||||
LUCENE-1128: Added support for benchmarking the highlighter
|
||||
|
||||
01/20/08
|
||||
LUCENE-1139: various fixes
|
||||
- add merge.scheduler, merge.policy config properties
|
||||
|
@ -12,6 +17,7 @@ $Id:$
|
|||
- OptimizeTask now takes int param to call optimize(int maxNumSegments)
|
||||
- CloseIndexTask now takes bool param to call close(false) (abort running merges)
|
||||
|
||||
|
||||
01/03/08
|
||||
LUCENE-1116: quality package improvements:
|
||||
- add MRR computation;
|
||||
|
|
|
@ -109,6 +109,7 @@
|
|||
<path id="classpath">
|
||||
<pathelement path="${common.dir}/build/classes/java"/>
|
||||
<pathelement path="${common.dir}/build/classes/demo"/>
|
||||
<pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
|
||||
<pathelement path="${basedir}/lib/${digester.jar}"/>
|
||||
<pathelement path="${basedir}/lib/${collections.jar}"/>
|
||||
<pathelement path="${basedir}/lib/${logging.jar}"/>
|
||||
|
@ -163,9 +164,14 @@
|
|||
<subant target="compile-demo">
|
||||
<fileset dir="${common.dir}" includes="build.xml"/>
|
||||
</subant>
|
||||
</target>
|
||||
</target>
|
||||
<target name="compile-highlighter">
|
||||
<subant target="compile">
|
||||
<fileset dir="${common.dir}/contrib/highlighter" includes="build.xml"/>
|
||||
</subant>
|
||||
</target>
|
||||
|
||||
<target name="init" depends="common.init,compile-demo,check-files"/>
|
||||
<target name="init" depends="common.init,compile-demo, compile-highlighter,check-files"/>
|
||||
|
||||
<!-- make sure online collections (reuters) are first downloaded -->
|
||||
<target name="test" depends="init,get-files">
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
#/**
|
||||
# * Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# * contributor license agreements. See the NOTICE file distributed with
|
||||
# * this work for additional information regarding copyright ownership.
|
||||
# * The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# * (the "License"); you may not use this file except in compliance with
|
||||
# * the License. You may obtain a copy of the License at
|
||||
# *
|
||||
# * http://www.apache.org/licenses/LICENSE-2.0
|
||||
# *
|
||||
# * Unless required by applicable law or agreed to in writing, software
|
||||
# * distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# * See the License for the specific language governing permissions and
|
||||
# * limitations under the License.
|
||||
# */
|
||||
# -------------------------------------------------------------------------------------
|
||||
# multi val params are iterated by NewRound's, added to reports, start with column name.
|
||||
|
||||
ram.flush.mb=flush:32:32
|
||||
compound=cmpnd:true:false
|
||||
|
||||
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
|
||||
directory=FSDirectory
|
||||
|
||||
doc.stored=true
|
||||
doc.tokenized=true
|
||||
doc.term.vector=true
|
||||
doc.term.vector.offsets=true
|
||||
doc.term.vector.positions=true
|
||||
doc.add.log.step=2000
|
||||
|
||||
docs.dir=reuters-out
|
||||
|
||||
doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
|
||||
|
||||
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
|
||||
|
||||
# task at this depth or less would print when they start
|
||||
task.max.depth.log=2
|
||||
|
||||
log.queries=true
|
||||
# -------------------------------------------------------------------------------------
|
||||
{ "Populate"
|
||||
CreateIndex
|
||||
{ "MAddDocs" AddDoc } : 20000
|
||||
Optimize
|
||||
CloseIndex
|
||||
}
|
||||
{ "Rounds"
|
||||
|
||||
ResetSystemSoft
|
||||
|
||||
|
||||
OpenReader
|
||||
{ "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[10],fields[body]) > : 1000
|
||||
|
||||
CloseReader
|
||||
|
||||
RepSumByPref MAddDocs
|
||||
|
||||
NewRound
|
||||
|
||||
} : 4
|
||||
|
||||
RepSumByNameRound
|
||||
RepSumByName
|
||||
RepSumByPrefRound MAddDocs
|
|
@ -0,0 +1,69 @@
|
|||
#/**
|
||||
# * Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# * contributor license agreements. See the NOTICE file distributed with
|
||||
# * this work for additional information regarding copyright ownership.
|
||||
# * The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# * (the "License"); you may not use this file except in compliance with
|
||||
# * the License. You may obtain a copy of the License at
|
||||
# *
|
||||
# * http://www.apache.org/licenses/LICENSE-2.0
|
||||
# *
|
||||
# * Unless required by applicable law or agreed to in writing, software
|
||||
# * distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# * See the License for the specific language governing permissions and
|
||||
# * limitations under the License.
|
||||
# */
|
||||
# -------------------------------------------------------------------------------------
|
||||
# multi val params are iterated by NewRound's, added to reports, start with column name.
|
||||
|
||||
ram.flush.mb=flush:32:32
|
||||
compound=cmpnd:true:false
|
||||
|
||||
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
|
||||
directory=FSDirectory
|
||||
|
||||
doc.stored=true
|
||||
doc.tokenized=true
|
||||
doc.term.vector=false
|
||||
doc.term.vector.offsets=false
|
||||
doc.term.vector.positions=false
|
||||
doc.add.log.step=2000
|
||||
|
||||
docs.dir=reuters-out
|
||||
|
||||
doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
|
||||
|
||||
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
|
||||
|
||||
# task at this depth or less would print when they start
|
||||
task.max.depth.log=2
|
||||
|
||||
log.queries=true
|
||||
# -------------------------------------------------------------------------------------
|
||||
{ "Populate"
|
||||
CreateIndex
|
||||
{ "MAddDocs" AddDoc } : 20000
|
||||
Optimize
|
||||
CloseIndex
|
||||
}
|
||||
{ "Rounds"
|
||||
|
||||
ResetSystemSoft
|
||||
OpenReader
|
||||
{ "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
|
||||
CloseReader
|
||||
OpenReader
|
||||
{ "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
|
||||
|
||||
CloseReader
|
||||
|
||||
RepSumByPref SearchHlgtSameRdr
|
||||
|
||||
NewRound
|
||||
|
||||
} : 2
|
||||
|
||||
RepSumByNameRound
|
||||
RepSumByName
|
||||
RepSumByPrefRound MAddDocs
|
|
@ -0,0 +1,69 @@
|
|||
#/**
|
||||
# * Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# * contributor license agreements. See the NOTICE file distributed with
|
||||
# * this work for additional information regarding copyright ownership.
|
||||
# * The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# * (the "License"); you may not use this file except in compliance with
|
||||
# * the License. You may obtain a copy of the License at
|
||||
# *
|
||||
# * http://www.apache.org/licenses/LICENSE-2.0
|
||||
# *
|
||||
# * Unless required by applicable law or agreed to in writing, software
|
||||
# * distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# * See the License for the specific language governing permissions and
|
||||
# * limitations under the License.
|
||||
# */
|
||||
# -------------------------------------------------------------------------------------
|
||||
# multi val params are iterated by NewRound's, added to reports, start with column name.
|
||||
|
||||
ram.flush.mb=flush:32:32
|
||||
compound=cmpnd:true:false
|
||||
|
||||
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
|
||||
directory=FSDirectory
|
||||
|
||||
doc.stored=true
|
||||
doc.tokenized=true
|
||||
doc.term.vector=true
|
||||
doc.term.vector.offsets=true
|
||||
doc.term.vector.positions=true
|
||||
doc.add.log.step=2000
|
||||
|
||||
docs.dir=reuters-out
|
||||
|
||||
doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
|
||||
|
||||
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
|
||||
|
||||
# task at this depth or less would print when they start
|
||||
task.max.depth.log=2
|
||||
|
||||
log.queries=true
|
||||
# -------------------------------------------------------------------------------------
|
||||
{ "Populate"
|
||||
CreateIndex
|
||||
{ "MAddDocs" AddDoc } : 20000
|
||||
Optimize
|
||||
CloseIndex
|
||||
}
|
||||
{ "Rounds"
|
||||
|
||||
ResetSystemSoft
|
||||
OpenReader
|
||||
{ "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
|
||||
CloseReader
|
||||
OpenReader
|
||||
{ "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
|
||||
|
||||
CloseReader
|
||||
|
||||
RepSumByPref SearchHlgtSameRdr
|
||||
|
||||
NewRound
|
||||
|
||||
} : 2
|
||||
|
||||
RepSumByNameRound
|
||||
RepSumByName
|
||||
RepSumByPrefRound MAddDocs
|
|
@ -247,6 +247,10 @@ The following is an informal description of the supported syntax.
|
|||
<li><b>SearchTravRetLoadFieldSelectorTask</b> takes a string
|
||||
parameter: a comma separated list of Fields to load.
|
||||
</li>
|
||||
<li><b>SearchTravRetHighlighterTask</b> takes a string
|
||||
parameter: a comma separated list of parameters to define highlighting. See that
|
||||
tasks javadocs for more information
|
||||
</li>
|
||||
</ul>
|
||||
<br>Example - <font color="#FF0066">AddDoc(2000)</font> - would add a document
|
||||
of size 2000 (~bytes).
|
||||
|
|
|
@ -17,26 +17,31 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.Hits;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.highlight.*;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Read index (abstract) task.
|
||||
* Sub classes implement withSearch(), withWarm(), withTraverse() and withRetrieve()
|
||||
* methods to configure the actual action.
|
||||
*
|
||||
* <p>Note: All ReadTasks reuse the reader if it is already open.
|
||||
* <p/>
|
||||
* <p>Note: All ReadTasks reuse the reader if it is already open.
|
||||
* Otherwise a reader is opened at start and closed at the end.
|
||||
*
|
||||
* <p/>
|
||||
* <p>Other side effects: none.
|
||||
*/
|
||||
public abstract class ReadTask extends PerfTask {
|
||||
|
@ -48,7 +53,7 @@ public abstract class ReadTask extends PerfTask {
|
|||
public int doLogic() throws Exception {
|
||||
int res = 0;
|
||||
boolean closeReader = false;
|
||||
|
||||
|
||||
// open reader or use existing one
|
||||
IndexReader ir = getRunData().getIndexReader();
|
||||
if (ir == null) {
|
||||
|
@ -57,18 +62,18 @@ public abstract class ReadTask extends PerfTask {
|
|||
closeReader = true;
|
||||
//res++; //this is confusing, comment it out
|
||||
}
|
||||
|
||||
|
||||
// optionally warm and add num docs traversed to count
|
||||
if (withWarm()) {
|
||||
Document doc = null;
|
||||
for (int m = 0; m < ir.maxDoc(); m++) {
|
||||
if (!ir.isDeleted(m)) {
|
||||
doc = ir.document(m);
|
||||
res += (doc==null ? 0 : 1);
|
||||
res += (doc == null ? 0 : 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (withSearch()) {
|
||||
res++;
|
||||
IndexSearcher searcher = new IndexSearcher(ir);
|
||||
|
@ -76,32 +81,53 @@ public abstract class ReadTask extends PerfTask {
|
|||
Query q = queryMaker.makeQuery();
|
||||
Hits hits = searcher.search(q);
|
||||
//System.out.println("searched: "+q);
|
||||
|
||||
if (withTraverse() && hits!=null) {
|
||||
|
||||
if (withTraverse() && hits != null) {
|
||||
int traversalSize = Math.min(hits.length(), traversalSize());
|
||||
if (traversalSize > 0) {
|
||||
boolean retrieve = withRetrieve();
|
||||
for (int m = 0; m < hits.length(); m++) {
|
||||
int numHighlight = Math.min(numToHighlight(), hits.length());
|
||||
Analyzer analyzer = getRunData().getAnalyzer();
|
||||
Highlighter highlighter = null;
|
||||
int maxFrags = 1;
|
||||
if (numHighlight > 0) {
|
||||
highlighter = getHighlighter(q);
|
||||
maxFrags = maxNumFragments();
|
||||
}
|
||||
boolean merge = isMergeContiguousFragments();
|
||||
for (int m = 0; m < traversalSize; m++) {
|
||||
int id = hits.id(m);
|
||||
res++;
|
||||
if (retrieve) {
|
||||
res += retrieveDoc(ir, id);
|
||||
Document document = retrieveDoc(ir, id);
|
||||
res += document != null ? 1 : 0;
|
||||
if (numHighlight > 0 && m < numHighlight) {
|
||||
Collection/*<String>*/ fieldsToHighlight = getFieldsToHighlight(document);
|
||||
for (Iterator iterator = fieldsToHighlight.iterator(); iterator.hasNext();) {
|
||||
String field = (String) iterator.next();
|
||||
String text = document.get(field);
|
||||
TokenStream ts = TokenSources.getAnyTokenStream(ir, id, field, document, analyzer);
|
||||
res += doHighlight(ts, text, highlighter, merge, maxFrags);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
searcher.close();
|
||||
}
|
||||
|
||||
|
||||
if (closeReader) {
|
||||
ir.close();
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
protected int retrieveDoc(IndexReader ir, int id) throws IOException {
|
||||
return (ir.document(id) == null ? 0 : 1);
|
||||
|
||||
|
||||
protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
|
||||
return ir.document(id);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -112,33 +138,82 @@ public abstract class ReadTask extends PerfTask {
|
|||
/**
|
||||
* Return true if search should be performed.
|
||||
*/
|
||||
public abstract boolean withSearch ();
|
||||
public abstract boolean withSearch();
|
||||
|
||||
/**
|
||||
* Return true if warming should be performed.
|
||||
*/
|
||||
public abstract boolean withWarm ();
|
||||
|
||||
public abstract boolean withWarm();
|
||||
|
||||
/**
|
||||
* Return true if, with search, results should be traversed.
|
||||
*/
|
||||
public abstract boolean withTraverse ();
|
||||
public abstract boolean withTraverse();
|
||||
|
||||
/**
|
||||
* Specify the number of hits to traverse. Tasks should override this if they want to restrict the number
|
||||
* of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0.
|
||||
*
|
||||
* <p/>
|
||||
* Read task calculates the traversal as: Math.min(hits.length(), traversalSize())
|
||||
*
|
||||
* @return Integer.MAX_VALUE
|
||||
*/
|
||||
public int traversalSize()
|
||||
{
|
||||
public int traversalSize() {
|
||||
return Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if, with search & results traversing, docs should be retrieved.
|
||||
*/
|
||||
public abstract boolean withRetrieve ();
|
||||
public abstract boolean withRetrieve();
|
||||
|
||||
/**
|
||||
* Set to the number of documents to highlight.
|
||||
*
|
||||
* @return The number of the results to highlight. O means no docs will be highlighted.
|
||||
*/
|
||||
public int numToHighlight() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
protected Highlighter getHighlighter(Query q){
|
||||
return new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return the maxiumum number of highlighter fragments
|
||||
*/
|
||||
public int maxNumFragments(){
|
||||
return 10;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return true if the highlighter should merge contiguous fragments
|
||||
*/
|
||||
public boolean isMergeContiguousFragments(){
|
||||
return false;
|
||||
}
|
||||
|
||||
protected int doHighlight(TokenStream ts, String text, Highlighter highlighter, boolean mergeContiguous, int maxFragments) throws IOException {
|
||||
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFragments);
|
||||
return frag != null ? frag.length : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Define the fields to highlight. Base implementation returns all fields
|
||||
* @param document The Document
|
||||
* @return A Collection of Field names (Strings)
|
||||
*/
|
||||
protected Collection/*<String>*/ getFieldsToHighlight(Document document) {
|
||||
List/*<Fieldable>*/ fieldables = document.getFields();
|
||||
Set/*<String>*/ result = new HashSet(fieldables.size());
|
||||
for (Iterator iterator = fieldables.iterator(); iterator.hasNext();) {
|
||||
Fieldable fieldable = (Fieldable) iterator.next();
|
||||
result.add(fieldable.name());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,126 @@
|
|||
package org.apache.lucene.benchmark.byTask.tasks;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Collections;
|
||||
|
||||
/**
|
||||
* Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents.
|
||||
*
|
||||
* Uses the {@link org.apache.lucene.search.highlight.SimpleHTMLFormatter} for formatting.
|
||||
*
|
||||
* <p>Note: This task reuses the reader if it is already open.
|
||||
* Otherwise a reader is opened at start and closed at the end.
|
||||
* </p>
|
||||
*
|
||||
* <p>Takes optional multivalued, comma separated param string as: size[<traversal size>],highlight[<int>],maxFrags[<int>],mergeContiguous[<boolean>],fields[name1;name2;...]</p>
|
||||
* <ul>
|
||||
* <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li>
|
||||
* <li>highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())</li>
|
||||
* <li>maxFrags - The maximum number of fragments to score by the highlighter</li>
|
||||
* <li>mergeContiguous - true if contiguous fragments should be merged.</li>
|
||||
* <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li>
|
||||
* </ul>
|
||||
* Example:
|
||||
* <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
|
||||
* </pre>
|
||||
*
|
||||
* Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well.
|
||||
*
|
||||
* <p>Other side effects: counts additional 1 (record) for each traversed hit,
|
||||
* and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
|
||||
*/
|
||||
public class SearchTravRetHighlightTask extends SearchTravTask {
|
||||
|
||||
protected int numToHighlight = Integer.MAX_VALUE;
|
||||
protected boolean mergeContiguous;
|
||||
protected int maxFrags = 2;
|
||||
protected Set paramFields = Collections.EMPTY_SET;
|
||||
|
||||
|
||||
public SearchTravRetHighlightTask(PerfRunData runData) {
|
||||
super(runData);
|
||||
}
|
||||
|
||||
public void setup() throws Exception {
|
||||
super.setup();
|
||||
//check to make sure either the doc is being stored
|
||||
PerfRunData data = getRunData();
|
||||
if (data.getConfig().get("doc.stored", false) == false){
|
||||
throw new Exception("doc.stored must be set to true");
|
||||
}
|
||||
}
|
||||
|
||||
public boolean withRetrieve() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public int numToHighlight() {
|
||||
return numToHighlight;
|
||||
}
|
||||
|
||||
public boolean isMergeContiguousFragments() {
|
||||
return mergeContiguous;
|
||||
}
|
||||
|
||||
public int maxNumFragments() {
|
||||
return maxFrags;
|
||||
}
|
||||
|
||||
protected Collection/*<String>*/ getFieldsToHighlight(Document document) {
|
||||
Collection result = super.getFieldsToHighlight(document);
|
||||
//if stored is false, then result will be empty, in which case just get all the param fields
|
||||
if (paramFields.isEmpty() == false && result.isEmpty() == false) {
|
||||
result.retainAll(paramFields);
|
||||
} else {
|
||||
result = paramFields;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public void setParams(String params) {
|
||||
String [] splits = params.split(",");
|
||||
for (int i = 0; i < splits.length; i++) {
|
||||
if (splits[i].startsWith("size[") == true){
|
||||
traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1));
|
||||
} else if (splits[i].startsWith("highlight[") == true){
|
||||
numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1));
|
||||
} else if (splits[i].startsWith("maxFrags[") == true){
|
||||
maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1));
|
||||
} else if (splits[i].startsWith("mergeContiguous[") == true){
|
||||
mergeContiguous = Boolean.valueOf(splits[i].substring("mergeContiguous[".length(),splits[i].length() - 1)).booleanValue();
|
||||
} else if (splits[i].startsWith("fields[") == true){
|
||||
paramFields = new HashSet();
|
||||
String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1);
|
||||
String [] fieldSplits = fieldNames.split(";");
|
||||
for (int j = 0; j < fieldSplits.length; j++) {
|
||||
paramFields.add(fieldSplits[j]);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
|||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.document.FieldSelector;
|
||||
import org.apache.lucene.document.SetBasedFieldSelector;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
import java.util.StringTokenizer;
|
||||
|
@ -51,8 +52,8 @@ public class SearchTravRetLoadFieldSelectorTask extends SearchTravTask {
|
|||
}
|
||||
|
||||
|
||||
protected int retrieveDoc(IndexReader ir, int id) throws IOException {
|
||||
return (ir.document(id, fieldSelector) == null ? 0 : 1);
|
||||
protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
|
||||
return ir.document(id, fieldSelector);
|
||||
}
|
||||
|
||||
public void setParams(String params) {
|
||||
|
|
|
@ -120,7 +120,7 @@ public class Algorithm {
|
|||
if ((char)stok.ttype == '*') {
|
||||
((TaskSequence)prevTask).setRepetitions(TaskSequence.REPEAT_EXHAUST);
|
||||
} else {
|
||||
if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString());
|
||||
if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expected repetitions number: - "+stok.toString());
|
||||
((TaskSequence)prevTask).setRepetitions((int)stok.nval);
|
||||
}
|
||||
// check for rate specification (ops/min)
|
||||
|
@ -130,7 +130,7 @@ public class Algorithm {
|
|||
} else {
|
||||
// get rate number
|
||||
stok.nextToken();
|
||||
if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted rate number: - "+stok.toString());
|
||||
if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expected rate number: - "+stok.toString());
|
||||
// check for unit - min or sec, sec is default
|
||||
stok.nextToken();
|
||||
if (stok.ttype!='/') {
|
||||
|
@ -138,14 +138,14 @@ public class Algorithm {
|
|||
((TaskSequence)prevTask).setRate((int)stok.nval,false); // set rate per sec
|
||||
} else {
|
||||
stok.nextToken();
|
||||
if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expexted rate unit: 'min' or 'sec' - "+stok.toString());
|
||||
if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString());
|
||||
String unit = stok.sval.toLowerCase();
|
||||
if ("min".equals(unit)) {
|
||||
((TaskSequence)prevTask).setRate((int)stok.nval,true); // set rate per min
|
||||
} else if ("sec".equals(unit)) {
|
||||
((TaskSequence)prevTask).setRate((int)stok.nval,false); // set rate per sec
|
||||
} else {
|
||||
throw new Exception("expexted rate unit: 'min' or 'sec' - "+stok.toString());
|
||||
throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,7 +27,9 @@ import java.util.Iterator;
|
|||
import org.apache.lucene.benchmark.byTask.feeds.DocData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
|
||||
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
|
@ -94,6 +96,109 @@ public class TestPerfTasksLogic extends TestCase {
|
|||
ir.close();
|
||||
}
|
||||
|
||||
public void testHighlighting() throws Exception {
|
||||
// 1. alg definition (required in every "logic" test)
|
||||
String algLines[] = {
|
||||
"doc.stored=true",
|
||||
"doc.maker="+Reuters20DocMaker.class.getName(),
|
||||
"query.maker=" + ReutersQueryMaker.class.getName(),
|
||||
"ResetSystemErase",
|
||||
"CreateIndex",
|
||||
"{ AddDoc } : 1000",
|
||||
"Optimize",
|
||||
"CloseIndex",
|
||||
"OpenReader",
|
||||
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
|
||||
"CloseReader",
|
||||
};
|
||||
|
||||
// 2. we test this value later
|
||||
CountingHighlighterTestTask.numHighlightedResults = 0;
|
||||
CountingHighlighterTestTask.numDocsRetrieved = 0;
|
||||
// 3. execute the algorithm (required in every "logic" test)
|
||||
Benchmark benchmark = execBenchmark(algLines);
|
||||
|
||||
// 4. test specific checks after the benchmark run completed.
|
||||
assertEquals("TestSearchTask was supposed to be called!",147,CountingHighlighterTestTask.numDocsRetrieved);
|
||||
//pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
|
||||
//we probably should use a different doc/query maker, but...
|
||||
assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
|
||||
|
||||
assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
|
||||
// now we should be able to open the index for write.
|
||||
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false);
|
||||
iw.close();
|
||||
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
|
||||
assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
|
||||
ir.close();
|
||||
}
|
||||
|
||||
public void testHighlightingTV() throws Exception {
|
||||
// 1. alg definition (required in every "logic" test)
|
||||
String algLines[] = {
|
||||
"doc.stored=true",//doc storage is required in order to have text to highlight
|
||||
"doc.term.vector.offsets=true",
|
||||
"doc.maker="+Reuters20DocMaker.class.getName(),
|
||||
"query.maker=" + ReutersQueryMaker.class.getName(),
|
||||
"ResetSystemErase",
|
||||
"CreateIndex",
|
||||
"{ AddDoc } : 1000",
|
||||
"Optimize",
|
||||
"CloseIndex",
|
||||
"OpenReader",
|
||||
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
|
||||
"CloseReader",
|
||||
};
|
||||
|
||||
// 2. we test this value later
|
||||
CountingHighlighterTestTask.numHighlightedResults = 0;
|
||||
CountingHighlighterTestTask.numDocsRetrieved = 0;
|
||||
// 3. execute the algorithm (required in every "logic" test)
|
||||
Benchmark benchmark = execBenchmark(algLines);
|
||||
|
||||
// 4. test specific checks after the benchmark run completed.
|
||||
assertEquals("TestSearchTask was supposed to be called!",147,CountingHighlighterTestTask.numDocsRetrieved);
|
||||
//pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
|
||||
//we probably should use a different doc/query maker, but...
|
||||
assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
|
||||
|
||||
assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
|
||||
// now we should be able to open the index for write.
|
||||
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false);
|
||||
iw.close();
|
||||
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
|
||||
assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
|
||||
ir.close();
|
||||
}
|
||||
|
||||
public void testHighlightingNoTvNoStore() throws Exception {
|
||||
// 1. alg definition (required in every "logic" test)
|
||||
String algLines[] = {
|
||||
"doc.stored=false",
|
||||
"doc.maker="+Reuters20DocMaker.class.getName(),
|
||||
"query.maker=" + ReutersQueryMaker.class.getName(),
|
||||
"ResetSystemErase",
|
||||
"CreateIndex",
|
||||
"{ AddDoc } : 1000",
|
||||
"Optimize",
|
||||
"CloseIndex",
|
||||
"OpenReader",
|
||||
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
|
||||
"CloseReader",
|
||||
};
|
||||
|
||||
// 2. we test this value later
|
||||
CountingHighlighterTestTask.numHighlightedResults = 0;
|
||||
CountingHighlighterTestTask.numDocsRetrieved = 0;
|
||||
// 3. execute the algorithm (required in every "logic" test)
|
||||
try {
|
||||
Benchmark benchmark = execBenchmark(algLines);
|
||||
assertTrue("CountingHighlighterTest should have thrown an exception", false);
|
||||
} catch (Exception e) {
|
||||
assertTrue(true);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Exhasting Doc Maker logic
|
||||
*/
|
||||
|
|
|
@ -0,0 +1,56 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.benchmark.byTask.tasks;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.search.highlight.Highlighter;
|
||||
import org.apache.lucene.search.highlight.TextFragment;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Test Search task which counts number of searches.
|
||||
*/
|
||||
public class CountingHighlighterTestTask extends SearchTravRetHighlightTask {
|
||||
|
||||
public static int numHighlightedResults = 0;
|
||||
public static int numDocsRetrieved = 0;
|
||||
|
||||
public CountingHighlighterTestTask(PerfRunData runData) {
|
||||
super(runData);
|
||||
}
|
||||
|
||||
protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
|
||||
Document document = ir.document(id);
|
||||
if (document != null) {
|
||||
numDocsRetrieved++;
|
||||
}
|
||||
return document;
|
||||
}
|
||||
|
||||
protected int doHighlight(TokenStream ts, String text, Highlighter highlighter, boolean mergeContiguous, int maxFragments) throws IOException {
|
||||
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFragments);
|
||||
numHighlightedResults += frag != null ? frag.length : 0;
|
||||
return frag != null ? frag.length : 0;
|
||||
}
|
||||
|
||||
|
||||
}
|
Loading…
Reference in New Issue