LUCENE-1128 and 1129: Add highlighting support to benchmarking, plus fix minor traversalSize bug in ReadTask, also added a few new algorithms to try out

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@614885 13f79535-47bb-0310-9956-ffa450edef68
2008-01-24 14:39:44 +00:00 · 2008-01-24 14:39:44 +00:00 · 1183763dbe
parent f75f490eb9
commit 1183763dbe
12 changed files with 616 additions and 31 deletions
--- a/contrib/benchmark/CHANGES.txt
+++ b/contrib/benchmark/CHANGES.txt
@ -4,6 +4,11 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety

 $Id:$

+
+1/23/2008
+  LUCENE-1129: ReadTask properly uses the traversalSize value
+  LUCENE-1128: Added support for benchmarking the highlighter
+
 01/20/08
  LUCENE-1139: various fixes
  - add merge.scheduler, merge.policy config properties
@ -12,6 +17,7 @@ $Id:$
  - OptimizeTask now takes int param to call optimize(int maxNumSegments)
  - CloseIndexTask now takes bool param to call close(false) (abort running merges)

+
 01/03/08
  LUCENE-1116: quality package improvements:
  - add MRR computation; 
--- a/contrib/benchmark/build.xml
+++ b/contrib/benchmark/build.xml
@ -109,6 +109,7 @@
    <path id="classpath">
        <pathelement path="${common.dir}/build/classes/java"/>
        <pathelement path="${common.dir}/build/classes/demo"/>
+        <pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
        <pathelement path="${basedir}/lib/${digester.jar}"/>
        <pathelement path="${basedir}/lib/${collections.jar}"/>
        <pathelement path="${basedir}/lib/${logging.jar}"/>
@ -163,9 +164,14 @@
      <subant target="compile-demo">
         <fileset dir="${common.dir}" includes="build.xml"/>
      </subant>
-    </target> 
+    </target>
+    <target name="compile-highlighter">
+      <subant target="compile">
+         <fileset dir="${common.dir}/contrib/highlighter" includes="build.xml"/>
+      </subant>
+    </target>

-    <target name="init" depends="common.init,compile-demo,check-files"/>
+    <target name="init" depends="common.init,compile-demo, compile-highlighter,check-files"/>

    <!-- make sure online collections (reuters) are first downloaded -->
    <target name="test" depends="init,get-files">
--- a/contrib/benchmark/conf/highlight-profile.alg
+++ b/contrib/benchmark/conf/highlight-profile.alg
@ -0,0 +1,68 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+# multi val params are iterated by NewRound's, added to reports, start with column name.
+
+ram.flush.mb=flush:32:32
+compound=cmpnd:true:false
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+
+doc.stored=true
+doc.tokenized=true
+doc.term.vector=true
+doc.term.vector.offsets=true
+doc.term.vector.positions=true
+doc.add.log.step=2000
+
+docs.dir=reuters-out
+
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+
+query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
+
+# task at this depth or less would print when they start
+task.max.depth.log=2
+
+log.queries=true
+# -------------------------------------------------------------------------------------
+{ "Populate"
+        CreateIndex
+        { "MAddDocs" AddDoc } : 20000
+        Optimize
+        CloseIndex
+    }
+{ "Rounds"
+
+    ResetSystemSoft
+
+
+    OpenReader
+      { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[10],fields[body]) > : 1000
+
+    CloseReader
+
+    RepSumByPref MAddDocs
+
+    NewRound
+
+} : 4
+
+RepSumByNameRound
+RepSumByName
+RepSumByPrefRound MAddDocs
--- a/contrib/benchmark/conf/standard-highlights-notv.alg
+++ b/contrib/benchmark/conf/standard-highlights-notv.alg
@ -0,0 +1,69 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+# multi val params are iterated by NewRound's, added to reports, start with column name.
+
+ram.flush.mb=flush:32:32
+compound=cmpnd:true:false
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+
+doc.stored=true
+doc.tokenized=true
+doc.term.vector=false
+doc.term.vector.offsets=false
+doc.term.vector.positions=false
+doc.add.log.step=2000
+
+docs.dir=reuters-out
+
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+
+query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
+
+# task at this depth or less would print when they start
+task.max.depth.log=2
+
+log.queries=true
+# -------------------------------------------------------------------------------------
+{ "Populate"
+        CreateIndex
+        { "MAddDocs" AddDoc } : 20000
+        Optimize
+        CloseIndex
+}
+{ "Rounds"
+
+    ResetSystemSoft
+    OpenReader
+      { "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
+    CloseReader
+    OpenReader
+      { "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
+
+    CloseReader
+
+    RepSumByPref SearchHlgtSameRdr
+
+    NewRound
+
+} : 2
+
+RepSumByNameRound
+RepSumByName
+RepSumByPrefRound MAddDocs
--- a/contrib/benchmark/conf/standard-highlights-tv.alg
+++ b/contrib/benchmark/conf/standard-highlights-tv.alg
@ -0,0 +1,69 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+# multi val params are iterated by NewRound's, added to reports, start with column name.
+
+ram.flush.mb=flush:32:32
+compound=cmpnd:true:false
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+
+doc.stored=true
+doc.tokenized=true
+doc.term.vector=true
+doc.term.vector.offsets=true
+doc.term.vector.positions=true
+doc.add.log.step=2000
+
+docs.dir=reuters-out
+
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+
+query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
+
+# task at this depth or less would print when they start
+task.max.depth.log=2
+
+log.queries=true
+# -------------------------------------------------------------------------------------
+{ "Populate"
+        CreateIndex
+        { "MAddDocs" AddDoc } : 20000
+        Optimize
+        CloseIndex
+}
+{ "Rounds"
+
+    ResetSystemSoft
+    OpenReader
+      { "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
+    CloseReader
+    OpenReader
+      { "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
+
+    CloseReader
+
+    RepSumByPref SearchHlgtSameRdr
+
+    NewRound
+
+} : 2
+
+RepSumByNameRound
+RepSumByName
+RepSumByPrefRound MAddDocs
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
@ -247,6 +247,10 @@ The following is an informal description of the supported syntax.
   <li><b>SearchTravRetLoadFieldSelectorTask</b> takes a string
              parameter: a comma separated list of Fields to load.
   </li>
+   <li><b>SearchTravRetHighlighterTask</b> takes a string
+              parameter: a comma separated list of parameters to define highlighting.  See that
+     tasks javadocs for more information
+   </li>
 </ul>
 <br>Example - <font color="#FF0066">AddDoc(2000)</font> - would add a document
 of size 2000 (~bytes).
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
@ -17,26 +17,31 @@ package org.apache.lucene.benchmark.byTask.tasks;
 * limitations under the License.
 */

+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.search.Hits;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.highlight.*;
 import org.apache.lucene.store.Directory;

 import java.io.IOException;
+import java.util.*;


 /**
 * Read index (abstract) task.
 * Sub classes implement withSearch(), withWarm(), withTraverse() and withRetrieve()
 * methods to configure the actual action.
- * 
- * <p>Note: All ReadTasks reuse the reader if it is already open. 
+ * <p/>
+ * <p>Note: All ReadTasks reuse the reader if it is already open.
 * Otherwise a reader is opened at start and closed at the end.
- *  
+ * <p/>
 * <p>Other side effects: none.
 */
 public abstract class ReadTask extends PerfTask {
@ -48,7 +53,7 @@ public abstract class ReadTask extends PerfTask {
  public int doLogic() throws Exception {
    int res = 0;
    boolean closeReader = false;
-    
+
    // open reader or use existing one
    IndexReader ir = getRunData().getIndexReader();
    if (ir == null) {
@ -57,18 +62,18 @@ public abstract class ReadTask extends PerfTask {
      closeReader = true;
      //res++; //this is confusing, comment it out
    }
-    
+
    // optionally warm and add num docs traversed to count
    if (withWarm()) {
      Document doc = null;
      for (int m = 0; m < ir.maxDoc(); m++) {
        if (!ir.isDeleted(m)) {
          doc = ir.document(m);
-          res += (doc==null ? 0 : 1);
+          res += (doc == null ? 0 : 1);
        }
      }
    }
-    
+
    if (withSearch()) {
      res++;
      IndexSearcher searcher = new IndexSearcher(ir);
@ -76,32 +81,53 @@ public abstract class ReadTask extends PerfTask {
      Query q = queryMaker.makeQuery();
      Hits hits = searcher.search(q);
      //System.out.println("searched: "+q);
-      
-      if (withTraverse() && hits!=null) {
+
+      if (withTraverse() && hits != null) {
        int traversalSize = Math.min(hits.length(), traversalSize());
        if (traversalSize > 0) {
          boolean retrieve = withRetrieve();
-          for (int m = 0; m < hits.length(); m++) {
+          int numHighlight = Math.min(numToHighlight(), hits.length());
+          Analyzer analyzer = getRunData().getAnalyzer();
+          Highlighter highlighter = null;
+          int maxFrags = 1;
+          if (numHighlight > 0) {
+            highlighter = getHighlighter(q);
+            maxFrags = maxNumFragments();
+          }
+          boolean merge = isMergeContiguousFragments();
+          for (int m = 0; m < traversalSize; m++) {
            int id = hits.id(m);
            res++;
            if (retrieve) {
-              res += retrieveDoc(ir, id);
+              Document document = retrieveDoc(ir, id);
+              res += document != null ? 1 : 0;
+              if (numHighlight > 0 && m < numHighlight) {
+                Collection/*<String>*/ fieldsToHighlight = getFieldsToHighlight(document);
+                for (Iterator iterator = fieldsToHighlight.iterator(); iterator.hasNext();) {
+                  String field = (String) iterator.next();
+                  String text = document.get(field);
+                  TokenStream ts = TokenSources.getAnyTokenStream(ir, id, field, document, analyzer);
+                  res += doHighlight(ts, text, highlighter, merge, maxFrags);
+                }
+              }
            }
          }
        }
      }
-      
+
      searcher.close();
    }
-    
+
    if (closeReader) {
      ir.close();
    }
    return res;
  }

-  protected int retrieveDoc(IndexReader ir, int id) throws IOException {
-    return (ir.document(id) == null ? 0 : 1);
+
+
+  protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
+    return ir.document(id);
  }

  /**
@ -112,33 +138,82 @@ public abstract class ReadTask extends PerfTask {
  /**
   * Return true if search should be performed.
   */
-  public abstract boolean withSearch ();
+  public abstract boolean withSearch();

  /**
   * Return true if warming should be performed.
   */
-  public abstract boolean withWarm ();
-  
+  public abstract boolean withWarm();
+
  /**
   * Return true if, with search, results should be traversed.
   */
-  public abstract boolean withTraverse ();
+  public abstract boolean withTraverse();

  /**
   * Specify the number of hits to traverse.  Tasks should override this if they want to restrict the number
   * of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0.
-   *
+   * <p/>
   * Read task calculates the traversal as: Math.min(hits.length(), traversalSize())
+   *
   * @return Integer.MAX_VALUE
   */
-  public int traversalSize()
-  {
+  public int traversalSize() {
    return Integer.MAX_VALUE;
  }

  /**
   * Return true if, with search & results traversing, docs should be retrieved.
   */
-  public abstract boolean withRetrieve ();
+  public abstract boolean withRetrieve();
+
+  /**
+   * Set to the number of documents to highlight.
+   *
+   * @return The number of the results to highlight.  O means no docs will be highlighted.
+   */
+  public int numToHighlight() {
+    return 0;
+  }
+
+  protected Highlighter getHighlighter(Query q){
+    return new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
+  }
+
+  /**
+   *
+   * @return the maxiumum number of highlighter fragments
+   */
+  public int maxNumFragments(){
+    return 10;
+  }
+
+  /**
+   *
+   * @return true if the highlighter should merge contiguous fragments
+   */
+  public boolean isMergeContiguousFragments(){
+    return false;
+  }
+
+  protected int doHighlight(TokenStream ts, String text,  Highlighter highlighter, boolean mergeContiguous, int maxFragments) throws IOException {
+    TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFragments);
+    return frag != null ? frag.length : 0;
+  }
+
+  /**
+   * Define the fields to highlight.  Base implementation returns all fields
+   * @param document The Document
+   * @return A Collection of Field names (Strings)
+   */
+  protected Collection/*<String>*/ getFieldsToHighlight(Document document) {
+    List/*<Fieldable>*/ fieldables = document.getFields();
+    Set/*<String>*/ result = new HashSet(fieldables.size());
+    for (Iterator iterator = fieldables.iterator(); iterator.hasNext();) {
+      Fieldable fieldable = (Fieldable) iterator.next();
+      result.add(fieldable.name());
+    }
+    return result;
+  }

 }
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
@ -0,0 +1,126 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.document.Document;
+
+import java.util.Set;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Collections;
+
+/**
+ * Search and Traverse and Retrieve docs task.  Highlight the fields in the retrieved documents.
+ *
+ * Uses the {@link org.apache.lucene.search.highlight.SimpleHTMLFormatter} for formatting.
+ *
+ * <p>Note: This task reuses the reader if it is already open.
+ * Otherwise a reader is opened at start and closed at the end.
+ * </p>
+ *
+ * <p>Takes optional multivalued, comma separated param string as: size[&lt;traversal size&gt;],highlight[&lt;int&gt;],maxFrags[&lt;int&gt;],mergeContiguous[&lt;boolean&gt;],fields[name1;name2;...]</p>
+ * <ul>
+ * <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li>
+ * <li>highlight - The number of the hits to highlight.  Will always be less than or equal to traversal size.  Default is Integer.MAX_VALUE (i.e. hits.length())</li>
+ * <li>maxFrags - The maximum number of fragments to score by the highlighter</li>
+ * <li>mergeContiguous - true if contiguous fragments should be merged.</li>
+ * <li>fields - The fields to highlight.  If not specified all fields will be highlighted (or at least attempted)</li>
+ * </ul>
+ * Example:
+ * <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
+ * </pre>
+ *
+ * Documents must be stored in order for this task to work.  Additionally, term vector positions can be used as well.
+ *
+ * <p>Other side effects: counts additional 1 (record) for each traversed hit,
+ * and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
+ */
+public class SearchTravRetHighlightTask extends SearchTravTask {
+
+  protected int numToHighlight = Integer.MAX_VALUE;
+  protected boolean mergeContiguous;
+  protected int maxFrags = 2;
+  protected Set paramFields = Collections.EMPTY_SET;
+  
+
+  public SearchTravRetHighlightTask(PerfRunData runData) {
+    super(runData);
+  }
+
+  public void setup() throws Exception {
+    super.setup();
+    //check to make sure either the doc is being stored
+    PerfRunData data = getRunData();
+    if (data.getConfig().get("doc.stored", false) == false){
+      throw new Exception("doc.stored must be set to true");
+    }
+  }
+
+  public boolean withRetrieve() {
+    return true;
+  }
+
+  public int numToHighlight() {
+    return numToHighlight;
+  }
+
+  public boolean isMergeContiguousFragments() {
+    return mergeContiguous;
+  }
+
+  public int maxNumFragments() {
+    return maxFrags;
+  }
+
+  protected Collection/*<String>*/ getFieldsToHighlight(Document document) {
+    Collection result = super.getFieldsToHighlight(document);
+    //if stored is false, then result will be empty, in which case just get all the param fields
+    if (paramFields.isEmpty() == false && result.isEmpty() == false) {
+      result.retainAll(paramFields);
+    } else {
+      result = paramFields;
+    }
+    return result;
+  }
+
+  public void setParams(String params) {
+    String [] splits = params.split(",");
+    for (int i = 0; i < splits.length; i++) {
+      if (splits[i].startsWith("size[") == true){
+        traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1));
+      } else if (splits[i].startsWith("highlight[") == true){
+        numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1));
+      } else if (splits[i].startsWith("maxFrags[") == true){
+        maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1));
+      } else if (splits[i].startsWith("mergeContiguous[") == true){
+        mergeContiguous = Boolean.valueOf(splits[i].substring("mergeContiguous[".length(),splits[i].length() - 1)).booleanValue();
+      } else if (splits[i].startsWith("fields[") == true){
+        paramFields = new HashSet();
+        String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1);
+        String [] fieldSplits = fieldNames.split(";");
+        for (int j = 0; j < fieldSplits.length; j++) {
+          paramFields.add(fieldSplits[j]);          
+        }
+
+      }
+    }
+  }
+
+
+}
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java
@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.document.FieldSelector;
 import org.apache.lucene.document.SetBasedFieldSelector;
+import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexReader;

 import java.util.StringTokenizer;
@ -51,8 +52,8 @@ public class SearchTravRetLoadFieldSelectorTask extends SearchTravTask {
  }


-  protected int retrieveDoc(IndexReader ir, int id) throws IOException {
-    return (ir.document(id, fieldSelector) == null ? 0 : 1);
+  protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
+    return ir.document(id, fieldSelector);
  }

  public void setParams(String params) {
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java
@ -120,7 +120,7 @@ public class Algorithm {
              if ((char)stok.ttype == '*') {
                ((TaskSequence)prevTask).setRepetitions(TaskSequence.REPEAT_EXHAUST);
              } else {
-                if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString());
+                if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expected repetitions number: - "+stok.toString());
                ((TaskSequence)prevTask).setRepetitions((int)stok.nval);
              }
              // check for rate specification (ops/min)
@ -130,7 +130,7 @@ public class Algorithm {
              } else {
                // get rate number
                stok.nextToken();
-                if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted rate number: - "+stok.toString());
+                if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expected rate number: - "+stok.toString());
                // check for unit - min or sec, sec is default
                stok.nextToken();
                if (stok.ttype!='/') {
@ -138,14 +138,14 @@ public class Algorithm {
                  ((TaskSequence)prevTask).setRate((int)stok.nval,false); // set rate per sec
                } else {
                  stok.nextToken();
-                  if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expexted rate unit: 'min' or 'sec' - "+stok.toString());
+                  if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString());
                  String unit = stok.sval.toLowerCase();
                  if ("min".equals(unit)) {
                    ((TaskSequence)prevTask).setRate((int)stok.nval,true); // set rate per min
                  } else if ("sec".equals(unit)) {
                    ((TaskSequence)prevTask).setRate((int)stok.nval,false); // set rate per sec
                  } else {
-                    throw new Exception("expexted rate unit: 'min' or 'sec' - "+stok.toString());
+                    throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString());
                  }
                }
              }
--- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
+++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
@ -27,7 +27,9 @@ import java.util.Iterator;
 import org.apache.lucene.benchmark.byTask.feeds.DocData;
 import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
 import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
+import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
 import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
+import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
 import org.apache.lucene.benchmark.byTask.stats.TaskStats;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
@ -94,6 +96,109 @@ public class TestPerfTasksLogic extends TestCase {
    ir.close();
  }

+  public void testHighlighting() throws Exception {
+    // 1. alg definition (required in every "logic" test)
+    String algLines[] = {
+        "doc.stored=true",
+        "doc.maker="+Reuters20DocMaker.class.getName(),
+        "query.maker=" + ReutersQueryMaker.class.getName(),
+        "ResetSystemErase",
+        "CreateIndex",
+        "{ AddDoc } : 1000",
+        "Optimize",
+        "CloseIndex",
+        "OpenReader",
+        "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
+        "CloseReader",
+    };
+
+    // 2. we test this value later
+    CountingHighlighterTestTask.numHighlightedResults = 0;
+    CountingHighlighterTestTask.numDocsRetrieved = 0;
+    // 3. execute the algorithm  (required in every "logic" test)
+    Benchmark benchmark = execBenchmark(algLines);
+
+    // 4. test specific checks after the benchmark run completed.
+    assertEquals("TestSearchTask was supposed to be called!",147,CountingHighlighterTestTask.numDocsRetrieved);
+    //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
+    //we probably should use a different doc/query maker, but...
+    assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
+
+    assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
+    // now we should be able to open the index for write.
+    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false);
+    iw.close();
+    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
+    assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
+    ir.close();
+  }
+
+  public void testHighlightingTV() throws Exception {
+    // 1. alg definition (required in every "logic" test)
+    String algLines[] = {
+        "doc.stored=true",//doc storage is required in order to have text to highlight
+        "doc.term.vector.offsets=true",
+        "doc.maker="+Reuters20DocMaker.class.getName(),
+        "query.maker=" + ReutersQueryMaker.class.getName(),
+        "ResetSystemErase",
+        "CreateIndex",
+        "{ AddDoc } : 1000",
+        "Optimize",
+        "CloseIndex",
+        "OpenReader",
+        "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
+        "CloseReader",
+    };
+
+    // 2. we test this value later
+    CountingHighlighterTestTask.numHighlightedResults = 0;
+    CountingHighlighterTestTask.numDocsRetrieved = 0;
+    // 3. execute the algorithm  (required in every "logic" test)
+    Benchmark benchmark = execBenchmark(algLines);
+
+    // 4. test specific checks after the benchmark run completed.
+    assertEquals("TestSearchTask was supposed to be called!",147,CountingHighlighterTestTask.numDocsRetrieved);
+    //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
+    //we probably should use a different doc/query maker, but...
+    assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
+
+    assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
+    // now we should be able to open the index for write.
+    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false);
+    iw.close();
+    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
+    assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
+    ir.close();
+  }
+
+  public void testHighlightingNoTvNoStore() throws Exception {
+    // 1. alg definition (required in every "logic" test)
+    String algLines[] = {
+        "doc.stored=false",
+        "doc.maker="+Reuters20DocMaker.class.getName(),
+        "query.maker=" + ReutersQueryMaker.class.getName(),
+        "ResetSystemErase",
+        "CreateIndex",
+        "{ AddDoc } : 1000",
+        "Optimize",
+        "CloseIndex",
+        "OpenReader",
+        "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
+        "CloseReader",
+    };
+
+    // 2. we test this value later
+    CountingHighlighterTestTask.numHighlightedResults = 0;
+    CountingHighlighterTestTask.numDocsRetrieved = 0;
+    // 3. execute the algorithm  (required in every "logic" test)
+    try {
+      Benchmark benchmark = execBenchmark(algLines);
+      assertTrue("CountingHighlighterTest should have thrown an exception", false);
+    } catch (Exception e) {
+      assertTrue(true);
+    }
+  }
+
  /**
   * Test Exhasting Doc Maker logic
   */
--- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java
+++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java
@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.benchmark.byTask.tasks;
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.search.highlight.Highlighter;
+import org.apache.lucene.search.highlight.TextFragment;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexReader;
+
+import java.io.IOException;
+
+/**
+ * Test Search task which counts number of searches.
+ */
+public class CountingHighlighterTestTask extends SearchTravRetHighlightTask {
+
+  public static int numHighlightedResults = 0;
+  public static int numDocsRetrieved = 0;
+
+  public CountingHighlighterTestTask(PerfRunData runData) {
+    super(runData);
+  }
+
+  protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
+    Document document = ir.document(id);
+    if (document != null) {
+      numDocsRetrieved++;
+    }
+    return document;
+  }
+
+  protected int doHighlight(TokenStream ts, String text,  Highlighter highlighter, boolean mergeContiguous, int maxFragments) throws IOException {
+    TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFragments);
+    numHighlightedResults += frag != null ? frag.length : 0;
+    return frag != null ? frag.length : 0;
+  }
+
+
+}