diff --git a/contrib/benchmark/CHANGES.txt b/contrib/benchmark/CHANGES.txt index 9f8c29f5977..7410f8c22fd 100644 --- a/contrib/benchmark/CHANGES.txt +++ b/contrib/benchmark/CHANGES.txt @@ -4,6 +4,11 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety $Id:$ + +1/23/2008 + LUCENE-1129: ReadTask properly uses the traversalSize value + LUCENE-1128: Added support for benchmarking the highlighter + 01/20/08 LUCENE-1139: various fixes - add merge.scheduler, merge.policy config properties @@ -12,6 +17,7 @@ $Id:$ - OptimizeTask now takes int param to call optimize(int maxNumSegments) - CloseIndexTask now takes bool param to call close(false) (abort running merges) + 01/03/08 LUCENE-1116: quality package improvements: - add MRR computation; diff --git a/contrib/benchmark/build.xml b/contrib/benchmark/build.xml index 95f3913aa0b..46925397741 100644 --- a/contrib/benchmark/build.xml +++ b/contrib/benchmark/build.xml @@ -109,6 +109,7 @@ + @@ -163,9 +164,14 @@ - + + + + + + - + diff --git a/contrib/benchmark/conf/highlight-profile.alg b/contrib/benchmark/conf/highlight-profile.alg new file mode 100644 index 00000000000..9e7d55e1de4 --- /dev/null +++ b/contrib/benchmark/conf/highlight-profile.alg @@ -0,0 +1,68 @@ +#/** +# * Licensed to the Apache Software Foundation (ASF) under one or more +# * contributor license agreements. See the NOTICE file distributed with +# * this work for additional information regarding copyright ownership. +# * The ASF licenses this file to You under the Apache License, Version 2.0 +# * (the "License"); you may not use this file except in compliance with +# * the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ +# ------------------------------------------------------------------------------------- +# multi val params are iterated by NewRound's, added to reports, start with column name. + +ram.flush.mb=flush:32:32 +compound=cmpnd:true:false + +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer +directory=FSDirectory + +doc.stored=true +doc.tokenized=true +doc.term.vector=true +doc.term.vector.offsets=true +doc.term.vector.positions=true +doc.add.log.step=2000 + +docs.dir=reuters-out + +doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker + +query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker + +# task at this depth or less would print when they start +task.max.depth.log=2 + +log.queries=true +# ------------------------------------------------------------------------------------- +{ "Populate" + CreateIndex + { "MAddDocs" AddDoc } : 20000 + Optimize + CloseIndex + } +{ "Rounds" + + ResetSystemSoft + + + OpenReader + { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[10],fields[body]) > : 1000 + + CloseReader + + RepSumByPref MAddDocs + + NewRound + +} : 4 + +RepSumByNameRound +RepSumByName +RepSumByPrefRound MAddDocs diff --git a/contrib/benchmark/conf/standard-highlights-notv.alg b/contrib/benchmark/conf/standard-highlights-notv.alg new file mode 100644 index 00000000000..a39b67e1713 --- /dev/null +++ b/contrib/benchmark/conf/standard-highlights-notv.alg @@ -0,0 +1,69 @@ +#/** +# * Licensed to the Apache Software Foundation (ASF) under one or more +# * contributor license agreements. See the NOTICE file distributed with +# * this work for additional information regarding copyright ownership. +# * The ASF licenses this file to You under the Apache License, Version 2.0 +# * (the "License"); you may not use this file except in compliance with +# * the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ +# ------------------------------------------------------------------------------------- +# multi val params are iterated by NewRound's, added to reports, start with column name. + +ram.flush.mb=flush:32:32 +compound=cmpnd:true:false + +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer +directory=FSDirectory + +doc.stored=true +doc.tokenized=true +doc.term.vector=false +doc.term.vector.offsets=false +doc.term.vector.positions=false +doc.add.log.step=2000 + +docs.dir=reuters-out + +doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker + +query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker + +# task at this depth or less would print when they start +task.max.depth.log=2 + +log.queries=true +# ------------------------------------------------------------------------------------- +{ "Populate" + CreateIndex + { "MAddDocs" AddDoc } : 20000 + Optimize + CloseIndex +} +{ "Rounds" + + ResetSystemSoft + OpenReader + { "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000 + CloseReader + OpenReader + { "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000 + + CloseReader + + RepSumByPref SearchHlgtSameRdr + + NewRound + +} : 2 + +RepSumByNameRound +RepSumByName +RepSumByPrefRound MAddDocs diff --git a/contrib/benchmark/conf/standard-highlights-tv.alg b/contrib/benchmark/conf/standard-highlights-tv.alg new file mode 100644 index 00000000000..f6e59d46011 --- /dev/null +++ b/contrib/benchmark/conf/standard-highlights-tv.alg @@ -0,0 +1,69 @@ +#/** +# * Licensed to the Apache Software Foundation (ASF) under one or more +# * contributor license agreements. See the NOTICE file distributed with +# * this work for additional information regarding copyright ownership. +# * The ASF licenses this file to You under the Apache License, Version 2.0 +# * (the "License"); you may not use this file except in compliance with +# * the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ +# ------------------------------------------------------------------------------------- +# multi val params are iterated by NewRound's, added to reports, start with column name. + +ram.flush.mb=flush:32:32 +compound=cmpnd:true:false + +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer +directory=FSDirectory + +doc.stored=true +doc.tokenized=true +doc.term.vector=true +doc.term.vector.offsets=true +doc.term.vector.positions=true +doc.add.log.step=2000 + +docs.dir=reuters-out + +doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker + +query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker + +# task at this depth or less would print when they start +task.max.depth.log=2 + +log.queries=true +# ------------------------------------------------------------------------------------- +{ "Populate" + CreateIndex + { "MAddDocs" AddDoc } : 20000 + Optimize + CloseIndex +} +{ "Rounds" + + ResetSystemSoft + OpenReader + { "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000 + CloseReader + OpenReader + { "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000 + + CloseReader + + RepSumByPref SearchHlgtSameRdr + + NewRound + +} : 2 + +RepSumByNameRound +RepSumByName +RepSumByPrefRound MAddDocs diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html index 520c0f3ad01..bc21ae78902 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html @@ -247,6 +247,10 @@ The following is an informal description of the supported syntax.
  • SearchTravRetLoadFieldSelectorTask takes a string parameter: a comma separated list of Fields to load.
  • +
  • SearchTravRetHighlighterTask takes a string + parameter: a comma separated list of parameters to define highlighting. See that + tasks javadocs for more information +

  • Example - AddDoc(2000) - would add a document of size 2000 (~bytes). diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java index 71ec5af64d3..8336b06f7f3 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java @@ -17,26 +17,31 @@ package org.apache.lucene.benchmark.byTask.tasks; * limitations under the License. */ +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.QueryMaker; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Fieldable; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; +import org.apache.lucene.search.highlight.*; import org.apache.lucene.store.Directory; import java.io.IOException; +import java.util.*; /** * Read index (abstract) task. * Sub classes implement withSearch(), withWarm(), withTraverse() and withRetrieve() * methods to configure the actual action. - * - *

    Note: All ReadTasks reuse the reader if it is already open. + *

    + *

    Note: All ReadTasks reuse the reader if it is already open. * Otherwise a reader is opened at start and closed at the end. - * + *

    *

    Other side effects: none. */ public abstract class ReadTask extends PerfTask { @@ -48,7 +53,7 @@ public abstract class ReadTask extends PerfTask { public int doLogic() throws Exception { int res = 0; boolean closeReader = false; - + // open reader or use existing one IndexReader ir = getRunData().getIndexReader(); if (ir == null) { @@ -57,18 +62,18 @@ public abstract class ReadTask extends PerfTask { closeReader = true; //res++; //this is confusing, comment it out } - + // optionally warm and add num docs traversed to count if (withWarm()) { Document doc = null; for (int m = 0; m < ir.maxDoc(); m++) { if (!ir.isDeleted(m)) { doc = ir.document(m); - res += (doc==null ? 0 : 1); + res += (doc == null ? 0 : 1); } } } - + if (withSearch()) { res++; IndexSearcher searcher = new IndexSearcher(ir); @@ -76,32 +81,53 @@ public abstract class ReadTask extends PerfTask { Query q = queryMaker.makeQuery(); Hits hits = searcher.search(q); //System.out.println("searched: "+q); - - if (withTraverse() && hits!=null) { + + if (withTraverse() && hits != null) { int traversalSize = Math.min(hits.length(), traversalSize()); if (traversalSize > 0) { boolean retrieve = withRetrieve(); - for (int m = 0; m < hits.length(); m++) { + int numHighlight = Math.min(numToHighlight(), hits.length()); + Analyzer analyzer = getRunData().getAnalyzer(); + Highlighter highlighter = null; + int maxFrags = 1; + if (numHighlight > 0) { + highlighter = getHighlighter(q); + maxFrags = maxNumFragments(); + } + boolean merge = isMergeContiguousFragments(); + for (int m = 0; m < traversalSize; m++) { int id = hits.id(m); res++; if (retrieve) { - res += retrieveDoc(ir, id); + Document document = retrieveDoc(ir, id); + res += document != null ? 1 : 0; + if (numHighlight > 0 && m < numHighlight) { + Collection/**/ fieldsToHighlight = getFieldsToHighlight(document); + for (Iterator iterator = fieldsToHighlight.iterator(); iterator.hasNext();) { + String field = (String) iterator.next(); + String text = document.get(field); + TokenStream ts = TokenSources.getAnyTokenStream(ir, id, field, document, analyzer); + res += doHighlight(ts, text, highlighter, merge, maxFrags); + } + } } } } } - + searcher.close(); } - + if (closeReader) { ir.close(); } return res; } - protected int retrieveDoc(IndexReader ir, int id) throws IOException { - return (ir.document(id) == null ? 0 : 1); + + + protected Document retrieveDoc(IndexReader ir, int id) throws IOException { + return ir.document(id); } /** @@ -112,33 +138,82 @@ public abstract class ReadTask extends PerfTask { /** * Return true if search should be performed. */ - public abstract boolean withSearch (); + public abstract boolean withSearch(); /** * Return true if warming should be performed. */ - public abstract boolean withWarm (); - + public abstract boolean withWarm(); + /** * Return true if, with search, results should be traversed. */ - public abstract boolean withTraverse (); + public abstract boolean withTraverse(); /** * Specify the number of hits to traverse. Tasks should override this if they want to restrict the number * of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0. - * + *

    * Read task calculates the traversal as: Math.min(hits.length(), traversalSize()) + * * @return Integer.MAX_VALUE */ - public int traversalSize() - { + public int traversalSize() { return Integer.MAX_VALUE; } /** * Return true if, with search & results traversing, docs should be retrieved. */ - public abstract boolean withRetrieve (); + public abstract boolean withRetrieve(); + + /** + * Set to the number of documents to highlight. + * + * @return The number of the results to highlight. O means no docs will be highlighted. + */ + public int numToHighlight() { + return 0; + } + + protected Highlighter getHighlighter(Query q){ + return new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q)); + } + + /** + * + * @return the maxiumum number of highlighter fragments + */ + public int maxNumFragments(){ + return 10; + } + + /** + * + * @return true if the highlighter should merge contiguous fragments + */ + public boolean isMergeContiguousFragments(){ + return false; + } + + protected int doHighlight(TokenStream ts, String text, Highlighter highlighter, boolean mergeContiguous, int maxFragments) throws IOException { + TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFragments); + return frag != null ? frag.length : 0; + } + + /** + * Define the fields to highlight. Base implementation returns all fields + * @param document The Document + * @return A Collection of Field names (Strings) + */ + protected Collection/**/ getFieldsToHighlight(Document document) { + List/**/ fieldables = document.getFields(); + Set/**/ result = new HashSet(fieldables.size()); + for (Iterator iterator = fieldables.iterator(); iterator.hasNext();) { + Fieldable fieldable = (Fieldable) iterator.next(); + result.add(fieldable.name()); + } + return result; + } } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java new file mode 100644 index 00000000000..51388f8d1cb --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java @@ -0,0 +1,126 @@ +package org.apache.lucene.benchmark.byTask.tasks; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.document.Document; + +import java.util.Set; +import java.util.Collection; +import java.util.HashSet; +import java.util.Collections; + +/** + * Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents. + * + * Uses the {@link org.apache.lucene.search.highlight.SimpleHTMLFormatter} for formatting. + * + *

    Note: This task reuses the reader if it is already open. + * Otherwise a reader is opened at start and closed at the end. + *

    + * + *

    Takes optional multivalued, comma separated param string as: size[<traversal size>],highlight[<int>],maxFrags[<int>],mergeContiguous[<boolean>],fields[name1;name2;...]

    + *
      + *
    • traversal size - The number of hits to traverse, otherwise all will be traversed
    • + *
    • highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())
    • + *
    • maxFrags - The maximum number of fragments to score by the highlighter
    • + *
    • mergeContiguous - true if contiguous fragments should be merged.
    • + *
    • fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)
    • + *
    + * Example: + *
    "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
    + * 
    + * + * Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well. + * + *

    Other side effects: counts additional 1 (record) for each traversed hit, + * and 1 more for each retrieved (non null) document and 1 for each fragment returned.

    + */ +public class SearchTravRetHighlightTask extends SearchTravTask { + + protected int numToHighlight = Integer.MAX_VALUE; + protected boolean mergeContiguous; + protected int maxFrags = 2; + protected Set paramFields = Collections.EMPTY_SET; + + + public SearchTravRetHighlightTask(PerfRunData runData) { + super(runData); + } + + public void setup() throws Exception { + super.setup(); + //check to make sure either the doc is being stored + PerfRunData data = getRunData(); + if (data.getConfig().get("doc.stored", false) == false){ + throw new Exception("doc.stored must be set to true"); + } + } + + public boolean withRetrieve() { + return true; + } + + public int numToHighlight() { + return numToHighlight; + } + + public boolean isMergeContiguousFragments() { + return mergeContiguous; + } + + public int maxNumFragments() { + return maxFrags; + } + + protected Collection/**/ getFieldsToHighlight(Document document) { + Collection result = super.getFieldsToHighlight(document); + //if stored is false, then result will be empty, in which case just get all the param fields + if (paramFields.isEmpty() == false && result.isEmpty() == false) { + result.retainAll(paramFields); + } else { + result = paramFields; + } + return result; + } + + public void setParams(String params) { + String [] splits = params.split(","); + for (int i = 0; i < splits.length; i++) { + if (splits[i].startsWith("size[") == true){ + traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1)); + } else if (splits[i].startsWith("highlight[") == true){ + numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1)); + } else if (splits[i].startsWith("maxFrags[") == true){ + maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1)); + } else if (splits[i].startsWith("mergeContiguous[") == true){ + mergeContiguous = Boolean.valueOf(splits[i].substring("mergeContiguous[".length(),splits[i].length() - 1)).booleanValue(); + } else if (splits[i].startsWith("fields[") == true){ + paramFields = new HashSet(); + String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1); + String [] fieldSplits = fieldNames.split(";"); + for (int j = 0; j < fieldSplits.length; j++) { + paramFields.add(fieldSplits[j]); + } + + } + } + } + + +} \ No newline at end of file diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java index a42045cae7d..0e1c3699f9f 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java @@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.byTask.tasks; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.SetBasedFieldSelector; +import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import java.util.StringTokenizer; @@ -51,8 +52,8 @@ public class SearchTravRetLoadFieldSelectorTask extends SearchTravTask { } - protected int retrieveDoc(IndexReader ir, int id) throws IOException { - return (ir.document(id, fieldSelector) == null ? 0 : 1); + protected Document retrieveDoc(IndexReader ir, int id) throws IOException { + return ir.document(id, fieldSelector); } public void setParams(String params) { diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java index a8f6e3a2933..a303e049e1b 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java @@ -120,7 +120,7 @@ public class Algorithm { if ((char)stok.ttype == '*') { ((TaskSequence)prevTask).setRepetitions(TaskSequence.REPEAT_EXHAUST); } else { - if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString()); + if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expected repetitions number: - "+stok.toString()); ((TaskSequence)prevTask).setRepetitions((int)stok.nval); } // check for rate specification (ops/min) @@ -130,7 +130,7 @@ public class Algorithm { } else { // get rate number stok.nextToken(); - if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted rate number: - "+stok.toString()); + if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expected rate number: - "+stok.toString()); // check for unit - min or sec, sec is default stok.nextToken(); if (stok.ttype!='/') { @@ -138,14 +138,14 @@ public class Algorithm { ((TaskSequence)prevTask).setRate((int)stok.nval,false); // set rate per sec } else { stok.nextToken(); - if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expexted rate unit: 'min' or 'sec' - "+stok.toString()); + if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString()); String unit = stok.sval.toLowerCase(); if ("min".equals(unit)) { ((TaskSequence)prevTask).setRate((int)stok.nval,true); // set rate per min } else if ("sec".equals(unit)) { ((TaskSequence)prevTask).setRate((int)stok.nval,false); // set rate per sec } else { - throw new Exception("expexted rate unit: 'min' or 'sec' - "+stok.toString()); + throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString()); } } } diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java index 765b889eec3..9b955145ec7 100755 --- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java +++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java @@ -27,7 +27,9 @@ import java.util.Iterator; import org.apache.lucene.benchmark.byTask.feeds.DocData; import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker; +import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker; import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask; +import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask; import org.apache.lucene.benchmark.byTask.stats.TaskStats; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; @@ -94,6 +96,109 @@ public class TestPerfTasksLogic extends TestCase { ir.close(); } + public void testHighlighting() throws Exception { + // 1. alg definition (required in every "logic" test) + String algLines[] = { + "doc.stored=true", + "doc.maker="+Reuters20DocMaker.class.getName(), + "query.maker=" + ReutersQueryMaker.class.getName(), + "ResetSystemErase", + "CreateIndex", + "{ AddDoc } : 1000", + "Optimize", + "CloseIndex", + "OpenReader", + "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200", + "CloseReader", + }; + + // 2. we test this value later + CountingHighlighterTestTask.numHighlightedResults = 0; + CountingHighlighterTestTask.numDocsRetrieved = 0; + // 3. execute the algorithm (required in every "logic" test) + Benchmark benchmark = execBenchmark(algLines); + + // 4. test specific checks after the benchmark run completed. + assertEquals("TestSearchTask was supposed to be called!",147,CountingHighlighterTestTask.numDocsRetrieved); + //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs + //we probably should use a different doc/query maker, but... + assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0); + + assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory())); + // now we should be able to open the index for write. + IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false); + iw.close(); + IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); + assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs()); + ir.close(); + } + + public void testHighlightingTV() throws Exception { + // 1. alg definition (required in every "logic" test) + String algLines[] = { + "doc.stored=true",//doc storage is required in order to have text to highlight + "doc.term.vector.offsets=true", + "doc.maker="+Reuters20DocMaker.class.getName(), + "query.maker=" + ReutersQueryMaker.class.getName(), + "ResetSystemErase", + "CreateIndex", + "{ AddDoc } : 1000", + "Optimize", + "CloseIndex", + "OpenReader", + "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200", + "CloseReader", + }; + + // 2. we test this value later + CountingHighlighterTestTask.numHighlightedResults = 0; + CountingHighlighterTestTask.numDocsRetrieved = 0; + // 3. execute the algorithm (required in every "logic" test) + Benchmark benchmark = execBenchmark(algLines); + + // 4. test specific checks after the benchmark run completed. + assertEquals("TestSearchTask was supposed to be called!",147,CountingHighlighterTestTask.numDocsRetrieved); + //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs + //we probably should use a different doc/query maker, but... + assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0); + + assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory())); + // now we should be able to open the index for write. + IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false); + iw.close(); + IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); + assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs()); + ir.close(); + } + + public void testHighlightingNoTvNoStore() throws Exception { + // 1. alg definition (required in every "logic" test) + String algLines[] = { + "doc.stored=false", + "doc.maker="+Reuters20DocMaker.class.getName(), + "query.maker=" + ReutersQueryMaker.class.getName(), + "ResetSystemErase", + "CreateIndex", + "{ AddDoc } : 1000", + "Optimize", + "CloseIndex", + "OpenReader", + "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200", + "CloseReader", + }; + + // 2. we test this value later + CountingHighlighterTestTask.numHighlightedResults = 0; + CountingHighlighterTestTask.numDocsRetrieved = 0; + // 3. execute the algorithm (required in every "logic" test) + try { + Benchmark benchmark = execBenchmark(algLines); + assertTrue("CountingHighlighterTest should have thrown an exception", false); + } catch (Exception e) { + assertTrue(true); + } + } + /** * Test Exhasting Doc Maker logic */ diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java new file mode 100644 index 00000000000..1bce22acccf --- /dev/null +++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java @@ -0,0 +1,56 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.benchmark.byTask.tasks; + +import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.search.highlight.Highlighter; +import org.apache.lucene.search.highlight.TextFragment; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; + +import java.io.IOException; + +/** + * Test Search task which counts number of searches. + */ +public class CountingHighlighterTestTask extends SearchTravRetHighlightTask { + + public static int numHighlightedResults = 0; + public static int numDocsRetrieved = 0; + + public CountingHighlighterTestTask(PerfRunData runData) { + super(runData); + } + + protected Document retrieveDoc(IndexReader ir, int id) throws IOException { + Document document = ir.document(id); + if (document != null) { + numDocsRetrieved++; + } + return document; + } + + protected int doHighlight(TokenStream ts, String text, Highlighter highlighter, boolean mergeContiguous, int maxFragments) throws IOException { + TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFragments); + numHighlightedResults += frag != null ? frag.length : 0; + return frag != null ? frag.length : 0; + } + + +} \ No newline at end of file