LUCENE-837:

Added optional bytes field to store on the Document. Enabled ReutersDocMaker w/ the ability to store byte data in a field. If the param is set (see the javadocs) it will store the contents of the body as a UTF-8 byte array. Then, the SearchTravRetLoadFieldSelectorTask (whew) can take in parameters specifying what fields to load (others are ignored by default) git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@521569 13f79535-47bb-0310-9956-ffa450edef68
2007-03-23 03:48:12 +00:00 · 2007-03-23 03:48:12 +00:00 · f4fffb3491
parent 34b560603a
commit f4fffb3491
5 changed files with 109 additions and 29 deletions
--- a/contrib/benchmark/CHANGES.txt
+++ b/contrib/benchmark/CHANGES.txt
@ -4,6 +4,12 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety

 $Id:$

+3/22/07
+
+-Moved withRetrieve() call out of the loop in ReadTask
+-Added SearchTravRetLoadFieldSelectorTask to help benchmark some of the FieldSelector capabilities
+-Added options to store content bytes on the Reuters Doc (and others, but Reuters is the only one w/ it enabled)
+
 3/21/07

 Tests (for benchmarking code correctness) were added - LUCENE-840.
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
@ -17,18 +17,18 @@ package org.apache.lucene.benchmark.byTask.feeds;
 * limitations under the License.
 */

+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.benchmark.byTask.utils.Format;
+import org.apache.lucene.document.DateTools;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
 import java.io.File;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.Iterator;
 import java.util.Properties;

-import org.apache.lucene.document.DateTools;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.benchmark.byTask.utils.Format;
-

 /**
 * Create documents for the test.
@ -44,6 +44,7 @@ public abstract class BasicDocMaker implements DocMaker {
    Date date;
    String title;
    String body;
+    byte [] bytes;
    Properties props;
  }
  
@ -123,6 +124,10 @@ public abstract class BasicDocMaker implements DocMaker {
      }
      doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
    }
+    if (docData.bytes != null && docData.bytes.length != 0)
+    {
+      doc.add(new Field("bytes", docData.bytes, Field.Store.YES));
+    }
    if (docData.props!=null) {
      for (Iterator it = docData.props.keySet().iterator(); it.hasNext(); ) {
        String key = (String) it.next();
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java
@ -17,6 +17,8 @@ package org.apache.lucene.benchmark.byTask.feeds;
 * limitations under the License.
 */

+import org.apache.lucene.benchmark.byTask.utils.Config;
+
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileReader;
@ -25,11 +27,14 @@ import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Locale;

-import org.apache.lucene.benchmark.byTask.utils.Config;
-

 /**
 * A DocMaker using the Reuters collection for its input.
+ *
+ * Config properties:
+ * docs.dir=&lt;path to the docs dir| Default: reuters-out&gt;
+ * reuters.doc.maker.store.bytes=true|false Default: false
+ *
 */
 public class ReutersDocMaker extends BasicDocMaker {

@ -38,7 +43,7 @@ public class ReutersDocMaker extends BasicDocMaker {
  private ArrayList inputFiles = new ArrayList();
  private int nextFile = 0;
  private int iteration=0;
-  
+  private boolean storeBytes = false;
  /* (non-Javadoc)
   * @see SimpleDocMaker#setConfig(java.util.Properties)
   */
@ -46,6 +51,8 @@ public class ReutersDocMaker extends BasicDocMaker {
    super.setConfig(config);
    String d = config.get("docs.dir","reuters-out");
    dataDir = new File(new File("work"),d);
+    storeBytes = config.get("reuters.doc.maker.store.bytes", false);
+
    collectFiles(dataDir,inputFiles);
    if (inputFiles.size()==0) {
      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
@ -89,6 +96,10 @@ public class ReutersDocMaker extends BasicDocMaker {
    dd.name = name;
    dd.title = title;
    dd.body = bodyBuf.toString();
+    if (storeBytes == true)
+    {
+      dd.bytes = dd.body.getBytes("UTF-8");
+    }
    return dd;
  }

--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
@ -26,6 +26,7 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.Directory;

+import java.io.IOException;


 /**
@ -76,17 +77,15 @@ public abstract class ReadTask extends PerfTask {
      Hits hits = searcher.search(q);
      //System.out.println("searched: "+q);
      
-      if (withTraverse() && hits!=null) {
-        Document doc = null;
-        int traversalSize = Math.min(hits.length(), traversalSize());
-        if (traversalSize > 0) {
+      if (withTraverse() && hits!=null) {
+        int traversalSize = Math.min(hits.length(), traversalSize());
+        if (traversalSize > 0) {
+          boolean retrieve = withRetrieve();
          for (int m = 0; m < hits.length(); m++) {
            int id = hits.id(m);
            res++;
-
-            if (withRetrieve()) {
-              doc = ir.document(id);
-              res += (doc==null ? 0 : 1);
+            if (retrieve) {
+              res += retrieveDoc(ir, id);
            }
          }
        }
@ -101,6 +100,10 @@ public abstract class ReadTask extends PerfTask {
    return res;
  }

+  protected int retrieveDoc(IndexReader ir, int id) throws IOException {
+    return (ir.document(id) == null ? 0 : 1);
+  }
+
  /**
   * Return query maker used for this task.
   */
@ -122,18 +125,18 @@ public abstract class ReadTask extends PerfTask {
  public abstract boolean withTraverse ();

  /**
-   * Specify the number of hits to traverse.  Tasks should override this if they want to restrict the number
-   * of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0.
-   *
-   * Read task calculates the traversal as: Math.min(hits.length(), traversalSize())
-   * @return Integer.MAX_VALUE
-   */
-  public int traversalSize()
-  {
-    return Integer.MAX_VALUE;
-  }
-
-  /**
+   * Specify the number of hits to traverse.  Tasks should override this if they want to restrict the number
+   * of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0.
+   *
+   * Read task calculates the traversal as: Math.min(hits.length(), traversalSize())
+   * @return Integer.MAX_VALUE
+   */
+  public int traversalSize()
+  {
+    return Integer.MAX_VALUE;
+  }
+
+  /**
   * Return true if, with search & results traversing, docs should be retrieved.
   */
  public abstract boolean withRetrieve ();
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java
@ -0,0 +1,55 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+/**
+ * Created by IntelliJ IDEA.
+ * User: Grant Ingersoll
+ * Date: Mar 22, 2007
+ * Time: 10:04:49 PM
+ * $Id:$
+ * Copyright 2007.  Center For Natural Language Processing
+ */
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.SetBasedFieldSelector;
+import org.apache.lucene.index.IndexReader;
+
+import java.util.StringTokenizer;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Collections;
+import java.io.IOException;
+
+/**
+ * Search and Travrese and Retrieve docs task using a SetBasedFieldSelector.
+ *
+ * <p>Note: This task reuses the reader if it is already open.
+ * Otherwise a reader is opened at start and closed at the end.
+ *
+ * Takes optional param: comma separated list of Fields to load.
+ */
+public class SearchTravRetLoadFieldSelectorTask extends SearchTravTask {
+
+  protected FieldSelector fieldSelector;
+  public SearchTravRetLoadFieldSelectorTask(PerfRunData runData) {
+    super(runData);
+    
+  }
+
+  public boolean withRetrieve() {
+    return true;
+  }
+
+
+  protected int retrieveDoc(IndexReader ir, int id) throws IOException {
+    return (ir.document(id, fieldSelector) == null ? 0 : 1);
+  }
+
+  public void setParams(String params) {
+    Set fieldsToLoad = new HashSet();
+    for (StringTokenizer tokenizer = new StringTokenizer(params, ","); tokenizer.hasMoreTokens();) {
+      String s = tokenizer.nextToken();
+      fieldsToLoad.add(s);
+    }
+    fieldSelector = new SetBasedFieldSelector(fieldsToLoad, Collections.EMPTY_SET);
+  }
+}