diff --git a/contrib/benchmark/CHANGES.txt b/contrib/benchmark/CHANGES.txt index c0fdbc8f38c..c9b6a446e5f 100644 --- a/contrib/benchmark/CHANGES.txt +++ b/contrib/benchmark/CHANGES.txt @@ -4,6 +4,12 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety $Id:$ +3/22/07 + +-Moved withRetrieve() call out of the loop in ReadTask +-Added SearchTravRetLoadFieldSelectorTask to help benchmark some of the FieldSelector capabilities +-Added options to store content bytes on the Reuters Doc (and others, but Reuters is the only one w/ it enabled) + 3/21/07 Tests (for benchmarking code correctness) were added - LUCENE-840. diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java index cc8ad4ffaec..9fa6e24f14e 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java @@ -17,18 +17,18 @@ package org.apache.lucene.benchmark.byTask.feeds; * limitations under the License. */ +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.benchmark.byTask.utils.Format; +import org.apache.lucene.document.DateTools; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + import java.io.File; import java.util.ArrayList; import java.util.Date; import java.util.Iterator; import java.util.Properties; -import org.apache.lucene.document.DateTools; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.benchmark.byTask.utils.Config; -import org.apache.lucene.benchmark.byTask.utils.Format; - /** * Create documents for the test. @@ -44,6 +44,7 @@ public abstract class BasicDocMaker implements DocMaker { Date date; String title; String body; + byte [] bytes; Properties props; } @@ -123,6 +124,10 @@ public abstract class BasicDocMaker implements DocMaker { } doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal)); } + if (docData.bytes != null && docData.bytes.length != 0) + { + doc.add(new Field("bytes", docData.bytes, Field.Store.YES)); + } if (docData.props!=null) { for (Iterator it = docData.props.keySet().iterator(); it.hasNext(); ) { String key = (String) it.next(); diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java index 7a0943c9f63..4219e429af5 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java @@ -17,6 +17,8 @@ package org.apache.lucene.benchmark.byTask.feeds; * limitations under the License. */ +import org.apache.lucene.benchmark.byTask.utils.Config; + import java.io.BufferedReader; import java.io.File; import java.io.FileReader; @@ -25,11 +27,14 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Locale; -import org.apache.lucene.benchmark.byTask.utils.Config; - /** * A DocMaker using the Reuters collection for its input. + * + * Config properties: + * docs.dir=<path to the docs dir| Default: reuters-out> + * reuters.doc.maker.store.bytes=true|false Default: false + * */ public class ReutersDocMaker extends BasicDocMaker { @@ -38,7 +43,7 @@ public class ReutersDocMaker extends BasicDocMaker { private ArrayList inputFiles = new ArrayList(); private int nextFile = 0; private int iteration=0; - + private boolean storeBytes = false; /* (non-Javadoc) * @see SimpleDocMaker#setConfig(java.util.Properties) */ @@ -46,6 +51,8 @@ public class ReutersDocMaker extends BasicDocMaker { super.setConfig(config); String d = config.get("docs.dir","reuters-out"); dataDir = new File(new File("work"),d); + storeBytes = config.get("reuters.doc.maker.store.bytes", false); + collectFiles(dataDir,inputFiles); if (inputFiles.size()==0) { throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath()); @@ -89,6 +96,10 @@ public class ReutersDocMaker extends BasicDocMaker { dd.name = name; dd.title = title; dd.body = bodyBuf.toString(); + if (storeBytes == true) + { + dd.bytes = dd.body.getBytes("UTF-8"); + } return dd; } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java index f0c1078855a..653c51b6283 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java @@ -26,6 +26,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.store.Directory; +import java.io.IOException; /** @@ -76,17 +77,15 @@ public abstract class ReadTask extends PerfTask { Hits hits = searcher.search(q); //System.out.println("searched: "+q); - if (withTraverse() && hits!=null) { - Document doc = null; - int traversalSize = Math.min(hits.length(), traversalSize()); - if (traversalSize > 0) { + if (withTraverse() && hits!=null) { + int traversalSize = Math.min(hits.length(), traversalSize()); + if (traversalSize > 0) { + boolean retrieve = withRetrieve(); for (int m = 0; m < hits.length(); m++) { int id = hits.id(m); res++; - - if (withRetrieve()) { - doc = ir.document(id); - res += (doc==null ? 0 : 1); + if (retrieve) { + res += retrieveDoc(ir, id); } } } @@ -101,6 +100,10 @@ public abstract class ReadTask extends PerfTask { return res; } + protected int retrieveDoc(IndexReader ir, int id) throws IOException { + return (ir.document(id) == null ? 0 : 1); + } + /** * Return query maker used for this task. */ @@ -122,18 +125,18 @@ public abstract class ReadTask extends PerfTask { public abstract boolean withTraverse (); /** - * Specify the number of hits to traverse. Tasks should override this if they want to restrict the number - * of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0. - * - * Read task calculates the traversal as: Math.min(hits.length(), traversalSize()) - * @return Integer.MAX_VALUE - */ - public int traversalSize() - { - return Integer.MAX_VALUE; - } - - /** + * Specify the number of hits to traverse. Tasks should override this if they want to restrict the number + * of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0. + * + * Read task calculates the traversal as: Math.min(hits.length(), traversalSize()) + * @return Integer.MAX_VALUE + */ + public int traversalSize() + { + return Integer.MAX_VALUE; + } + + /** * Return true if, with search & results traversing, docs should be retrieved. */ public abstract boolean withRetrieve (); diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java new file mode 100644 index 00000000000..aa4f4453f30 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java @@ -0,0 +1,55 @@ +package org.apache.lucene.benchmark.byTask.tasks; +/** + * Created by IntelliJ IDEA. + * User: Grant Ingersoll + * Date: Mar 22, 2007 + * Time: 10:04:49 PM + * $Id:$ + * Copyright 2007. Center For Natural Language Processing + */ + +import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.document.SetBasedFieldSelector; +import org.apache.lucene.index.IndexReader; + +import java.util.StringTokenizer; +import java.util.Set; +import java.util.HashSet; +import java.util.Collections; +import java.io.IOException; + +/** + * Search and Travrese and Retrieve docs task using a SetBasedFieldSelector. + * + *

Note: This task reuses the reader if it is already open. + * Otherwise a reader is opened at start and closed at the end. + * + * Takes optional param: comma separated list of Fields to load. + */ +public class SearchTravRetLoadFieldSelectorTask extends SearchTravTask { + + protected FieldSelector fieldSelector; + public SearchTravRetLoadFieldSelectorTask(PerfRunData runData) { + super(runData); + + } + + public boolean withRetrieve() { + return true; + } + + + protected int retrieveDoc(IndexReader ir, int id) throws IOException { + return (ir.document(id, fieldSelector) == null ? 0 : 1); + } + + public void setParams(String params) { + Set fieldsToLoad = new HashSet(); + for (StringTokenizer tokenizer = new StringTokenizer(params, ","); tokenizer.hasMoreTokens();) { + String s = tokenizer.nextToken(); + fieldsToLoad.add(s); + } + fieldSelector = new SetBasedFieldSelector(fieldsToLoad, Collections.EMPTY_SET); + } +}