LUCENE-837:

Added an optional bytes field to store on the Document. Enabled ReutersDocMaker to store byte data in a field: if the param is set (see the javadocs), it stores the contents of the body as a UTF-8 byte array. The SearchTravRetLoadFieldSelectorTask (whew) can then take parameters specifying which fields to load (others are ignored by default).

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@521569 13f79535-47bb-0310-9956-ffa450edef68
2007-03-23 03:48:12 +00:00 · 2007-03-23 03:48:12 +00:00 · f4fffb3491
parent 34b560603a
commit f4fffb3491
5 changed files with 109 additions and 29 deletions
--- a/contrib/benchmark/CHANGES.txt
+++ b/contrib/benchmark/CHANGES.txt
@ -4,6 +4,12 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
 $Id:$
 3/22/07
 -Moved withRetrieve() call out of the loop in ReadTask
 -Added SearchTravRetLoadFieldSelectorTask to help benchmark some of the FieldSelector capabilities
 -Added options to store content bytes on the Reuters Doc (and others, but Reuters is the only one w/ it enabled)
 3/21/07
 Tests (for benchmarking code correctness) were added - LUCENE-840.
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
@ -17,18 +17,18 @@ package org.apache.lucene.benchmark.byTask.feeds;
 * limitations under the License.
 */
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.benchmark.byTask.utils.Format;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import java.io.File;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.Iterator;
 import java.util.Properties;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.benchmark.byTask.utils.Format;
 /**
 * Create documents for the test.
@ -44,6 +44,7 @@ public abstract class BasicDocMaker implements DocMaker {
    Date date;
    String title;
    String body;
    byte [] bytes;
    Properties props;
  }
@ -123,6 +124,10 @@ public abstract class BasicDocMaker implements DocMaker {
      }
      doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
    }
    if (docData.bytes != null && docData.bytes.length != 0)
    {
      doc.add(new Field("bytes", docData.bytes, Field.Store.YES));
    }
    if (docData.props!=null) {
      for (Iterator it = docData.props.keySet().iterator(); it.hasNext(); ) {
        String key = (String) it.next();
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java
@ -17,6 +17,8 @@ package org.apache.lucene.benchmark.byTask.feeds;
 * limitations under the License.
 */
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileReader;
@ -25,11 +27,14 @@ import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Locale;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 /**
 * A DocMaker using the Reuters collection for its input.
 *
 * Config properties:
 * docs.dir=&lt;path to the docs dir| Default: reuters-out&gt;
 * reuters.doc.maker.store.bytes=true|false Default: false
 *
 */
 public class ReutersDocMaker extends BasicDocMaker {
@ -38,7 +43,7 @@ public class ReutersDocMaker extends BasicDocMaker {
  private ArrayList inputFiles = new ArrayList();
  private int nextFile = 0;
  private int iteration=0;
-  
+  private boolean storeBytes = false;
  /* (non-Javadoc)
   * @see SimpleDocMaker#setConfig(java.util.Properties)
   */
@ -46,6 +51,8 @@ public class ReutersDocMaker extends BasicDocMaker {
    super.setConfig(config);
    String d = config.get("docs.dir","reuters-out");
    dataDir = new File(new File("work"),d);
    storeBytes = config.get("reuters.doc.maker.store.bytes", false);
    collectFiles(dataDir,inputFiles);
    if (inputFiles.size()==0) {
      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
@ -89,6 +96,10 @@ public class ReutersDocMaker extends BasicDocMaker {
    dd.name = name;
    dd.title = title;
    dd.body = bodyBuf.toString();
    if (storeBytes == true)
    {
      dd.bytes = dd.body.getBytes("UTF-8");
    }
    return dd;
  }
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
@ -26,6 +26,7 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.Directory;
 import java.io.IOException;
 /**
@ -77,16 +78,14 @@ public abstract class ReadTask extends PerfTask {
      //System.out.println("searched: "+q);
      if (withTraverse() && hits!=null) {
        Document doc = null;
        int traversalSize = Math.min(hits.length(), traversalSize());
        if (traversalSize > 0) {
          boolean retrieve = withRetrieve();
          for (int m = 0; m < hits.length(); m++) {
            int id = hits.id(m);
            res++;
-
+            if (retrieve) {
-            if (withRetrieve()) {
+              res += retrieveDoc(ir, id);
              doc = ir.document(id);
              res += (doc==null ? 0 : 1);
            }
          }
        }
@ -101,6 +100,10 @@ public abstract class ReadTask extends PerfTask {
    return res;
  }
  /**
   * Retrieve a single document from the reader and report whether it was found.
   * Kept as a separate protected method so that subclasses can change how the
   * document is loaded (SearchTravRetLoadFieldSelectorTask overrides it).
   *
   * @param ir reader to retrieve the document from.
   * @param id id of the document to retrieve.
   * @return 1 if the reader returned a non-null document, 0 otherwise.
   * @throws IOException declared for the reader call.
   */
  protected int retrieveDoc(IndexReader ir, int id) throws IOException {
    if (ir.document(id) != null) {
      return 1;
    }
    return 0;
  }
  /**
   * Return query maker used for this task.
   */
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java
@ -0,0 +1,55 @@
 package org.apache.lucene.benchmark.byTask.tasks;
 /**
 * Created by IntelliJ IDEA.
 * User: Grant Ingersoll
 * Date: Mar 22, 2007
 * Time: 10:04:49 PM
 * $Id:$
 * Copyright 2007.  Center For Natural Language Processing
 */
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.document.FieldSelector;
 import org.apache.lucene.document.SetBasedFieldSelector;
 import org.apache.lucene.index.IndexReader;
 import java.util.StringTokenizer;
 import java.util.Set;
 import java.util.HashSet;
 import java.util.Collections;
 import java.io.IOException;
 /**
 * Search and Traverse and Retrieve docs task using a SetBasedFieldSelector.
 *
 * <p>Note: This task reuses the reader if it is already open.
 * Otherwise a reader is opened at start and closed at the end.
 *
 * <p>Takes optional param: comma separated list of Fields to load.
 * Whitespace around each field name is ignored, so "body, title" and
 * "body,title" select the same fields.
 */
 public class SearchTravRetLoadFieldSelectorTask extends SearchTravTask {
  /**
   * Selector handed to the reader on every retrieval. It is only assigned in
   * {@link #setParams(String)}, so it is null until params have been set.
   */
  protected FieldSelector fieldSelector;
  public SearchTravRetLoadFieldSelectorTask(PerfRunData runData) {
    super(runData);
  }
  /**
   * This task always retrieves the documents it traverses.
   */
  public boolean withRetrieve() {
    return true;
  }
  /**
   * Retrieve a document through the configured field selector.
   *
   * @param ir reader to retrieve the document from.
   * @param id id of the document to retrieve.
   * @return 1 if the reader returned a non-null document, 0 otherwise.
   */
  protected int retrieveDoc(IndexReader ir, int id) throws IOException {
    return (ir.document(id, fieldSelector) == null ? 0 : 1);
  }
  /**
   * Build the field selector from a comma separated list of field names.
   *
   * @param params comma separated field names to load; a null value is ignored
   *               and leaves the current selector untouched.
   */
  public void setParams(String params) {
    if (params == null) {
      // Nothing to parse; new StringTokenizer(null, ...) would throw a NullPointerException.
      return;
    }
    Set fieldsToLoad = new HashSet();
    for (StringTokenizer tokenizer = new StringTokenizer(params, ","); tokenizer.hasMoreTokens();) {
      // Trim so that "a, b" selects field "b" rather than the non-matching " b".
      String s = tokenizer.nextToken().trim();
      if (s.length() > 0) {
        fieldsToLoad.add(s);
      }
    }
    // The second set is left empty (presumably the lazily loaded fields - confirm against SetBasedFieldSelector).
    fieldSelector = new SetBasedFieldSelector(fieldsToLoad, Collections.EMPTY_SET);
  }
 }