LUCENE-837:

Added optional bytes field to store on the Document.  Enabled ReutersDocMaker w/ the ability to store byte data in a field.  If the param is set (see the javadocs) it will store the contents of the body as a UTF-8 byte array.

Then, the SearchTravRetLoadFieldSelectorTask (whew) can take in parameters specifying what fields to load (others are ignored by default)

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@521569 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2007-03-23 03:48:12 +00:00
parent 34b560603a
commit f4fffb3491
5 changed files with 109 additions and 29 deletions

View File

@ -4,6 +4,12 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$
3/22/07
-Moved withRetrieve() call out of the loop in ReadTask
-Added SearchTravRetLoadFieldSelectorTask to help benchmark some of the FieldSelector capabilities
-Added options to store content bytes on the Reuters Doc (and others, but Reuters is the only one w/ it enabled)
3/21/07
Tests (for benchmarking code correctness) were added - LUCENE-840.

View File

@ -17,18 +17,18 @@ package org.apache.lucene.benchmark.byTask.feeds;
* limitations under the License.
*/
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.Format;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.File;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Properties;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.Format;
/**
* Create documents for the test.
@ -44,6 +44,7 @@ public abstract class BasicDocMaker implements DocMaker {
Date date;
String title;
String body;
byte [] bytes;
Properties props;
}
@ -123,6 +124,10 @@ public abstract class BasicDocMaker implements DocMaker {
}
doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
}
if (docData.bytes != null && docData.bytes.length != 0)
{
doc.add(new Field("bytes", docData.bytes, Field.Store.YES));
}
if (docData.props!=null) {
for (Iterator it = docData.props.keySet().iterator(); it.hasNext(); ) {
String key = (String) it.next();

View File

@ -17,6 +17,8 @@ package org.apache.lucene.benchmark.byTask.feeds;
* limitations under the License.
*/
import org.apache.lucene.benchmark.byTask.utils.Config;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
@ -25,11 +27,14 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Locale;
import org.apache.lucene.benchmark.byTask.utils.Config;
/**
* A DocMaker using the Reuters collection for its input.
*
* Config properties:
* docs.dir=<path to the docs dir| Default: reuters-out>
* reuters.doc.maker.store.bytes=true|false Default: false
*
*/
public class ReutersDocMaker extends BasicDocMaker {
@ -38,7 +43,7 @@ public class ReutersDocMaker extends BasicDocMaker {
private ArrayList inputFiles = new ArrayList();
private int nextFile = 0;
private int iteration=0;
private boolean storeBytes = false;
/* (non-Javadoc)
* @see SimpleDocMaker#setConfig(java.util.Properties)
*/
@ -46,6 +51,8 @@ public class ReutersDocMaker extends BasicDocMaker {
super.setConfig(config);
String d = config.get("docs.dir","reuters-out");
dataDir = new File(new File("work"),d);
storeBytes = config.get("reuters.doc.maker.store.bytes", false);
collectFiles(dataDir,inputFiles);
if (inputFiles.size()==0) {
throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
@ -89,6 +96,10 @@ public class ReutersDocMaker extends BasicDocMaker {
dd.name = name;
dd.title = title;
dd.body = bodyBuf.toString();
if (storeBytes == true)
{
dd.bytes = dd.body.getBytes("UTF-8");
}
return dd;
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import java.io.IOException;
/**
@ -76,17 +77,15 @@ public abstract class ReadTask extends PerfTask {
Hits hits = searcher.search(q);
//System.out.println("searched: "+q);
if (withTraverse() && hits!=null) {
Document doc = null;
int traversalSize = Math.min(hits.length(), traversalSize());
if (traversalSize > 0) {
if (withTraverse() && hits!=null) {
int traversalSize = Math.min(hits.length(), traversalSize());
if (traversalSize > 0) {
boolean retrieve = withRetrieve();
for (int m = 0; m < hits.length(); m++) {
int id = hits.id(m);
res++;
if (withRetrieve()) {
doc = ir.document(id);
res += (doc==null ? 0 : 1);
if (retrieve) {
res += retrieveDoc(ir, id);
}
}
}
@ -101,6 +100,10 @@ public abstract class ReadTask extends PerfTask {
return res;
}
protected int retrieveDoc(IndexReader ir, int id) throws IOException {
return (ir.document(id) == null ? 0 : 1);
}
/**
* Return query maker used for this task.
*/
@ -122,18 +125,18 @@ public abstract class ReadTask extends PerfTask {
public abstract boolean withTraverse ();
/**
* Specify the number of hits to traverse. Tasks should override this if they want to restrict the number
* of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0.
*
* Read task calculates the traversal as: Math.min(hits.length(), traversalSize())
* @return Integer.MAX_VALUE
*/
public int traversalSize()
{
return Integer.MAX_VALUE;
}
/**
* Specify the number of hits to traverse. Tasks should override this if they want to restrict the number
* of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0.
*
* Read task calculates the traversal as: Math.min(hits.length(), traversalSize())
* @return Integer.MAX_VALUE
*/
public int traversalSize()
{
return Integer.MAX_VALUE;
}
/**
* Return true if, with search & results traversing, docs should be retrieved.
*/
public abstract boolean withRetrieve ();

View File

@ -0,0 +1,55 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Created by IntelliJ IDEA.
* User: Grant Ingersoll
* Date: Mar 22, 2007
* Time: 10:04:49 PM
* $Id:$
* Copyright 2007. Center For Natural Language Processing
*/
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.index.IndexReader;
import java.util.StringTokenizer;
import java.util.Set;
import java.util.HashSet;
import java.util.Collections;
import java.io.IOException;
/**
* Search and Travrese and Retrieve docs task using a SetBasedFieldSelector.
*
* <p>Note: This task reuses the reader if it is already open.
* Otherwise a reader is opened at start and closed at the end.
*
* Takes optional param: comma separated list of Fields to load.
*/
public class SearchTravRetLoadFieldSelectorTask extends SearchTravTask {
protected FieldSelector fieldSelector;
public SearchTravRetLoadFieldSelectorTask(PerfRunData runData) {
super(runData);
}
public boolean withRetrieve() {
return true;
}
protected int retrieveDoc(IndexReader ir, int id) throws IOException {
return (ir.document(id, fieldSelector) == null ? 0 : 1);
}
public void setParams(String params) {
Set fieldsToLoad = new HashSet();
for (StringTokenizer tokenizer = new StringTokenizer(params, ","); tokenizer.hasMoreTokens();) {
String s = tokenizer.nextToken();
fieldsToLoad.add(s);
}
fieldSelector = new SetBasedFieldSelector(fieldsToLoad, Collections.EMPTY_SET);
}
}