LUCENE-837:

Added an optional bytes field that can be stored on the Document.  ReutersDocMaker can now store byte data in a field: if the reuters.doc.maker.store.bytes param is set to true (see the javadocs), it stores the contents of the body as a UTF-8 byte array.

In addition, the SearchTravRetLoadFieldSelectorTask (whew) accepts a parameter specifying which fields to load, as a comma-separated list (all other fields are ignored by default).

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@521569 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2007-03-23 03:48:12 +00:00
parent 34b560603a
commit f4fffb3491
5 changed files with 109 additions and 29 deletions

View File

@ -4,6 +4,12 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$ $Id:$
3/22/07
-Moved withRetrieve() call out of the loop in ReadTask
-Added SearchTravRetLoadFieldSelectorTask to help benchmark some of the FieldSelector capabilities
-Added options to store content bytes on the Reuters Doc (and others, but Reuters is the only one w/ it enabled)
3/21/07 3/21/07
Tests (for benchmarking code correctness) were added - LUCENE-840. Tests (for benchmarking code correctness) were added - LUCENE-840.

View File

@ -17,18 +17,18 @@ package org.apache.lucene.benchmark.byTask.feeds;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.Format;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.File; import java.io.File;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.Properties; import java.util.Properties;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.Format;
/** /**
* Create documents for the test. * Create documents for the test.
@ -44,6 +44,7 @@ public abstract class BasicDocMaker implements DocMaker {
Date date; Date date;
String title; String title;
String body; String body;
byte [] bytes;
Properties props; Properties props;
} }
@ -123,6 +124,10 @@ public abstract class BasicDocMaker implements DocMaker {
} }
doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal)); doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
} }
if (docData.bytes != null && docData.bytes.length != 0)
{
doc.add(new Field("bytes", docData.bytes, Field.Store.YES));
}
if (docData.props!=null) { if (docData.props!=null) {
for (Iterator it = docData.props.keySet().iterator(); it.hasNext(); ) { for (Iterator it = docData.props.keySet().iterator(); it.hasNext(); ) {
String key = (String) it.next(); String key = (String) it.next();

View File

@ -17,6 +17,8 @@ package org.apache.lucene.benchmark.byTask.feeds;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.benchmark.byTask.utils.Config;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File; import java.io.File;
import java.io.FileReader; import java.io.FileReader;
@ -25,11 +27,14 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Locale; import java.util.Locale;
import org.apache.lucene.benchmark.byTask.utils.Config;
/** /**
* A DocMaker using the Reuters collection for its input. * A DocMaker using the Reuters collection for its input.
*
* Config properties:
* docs.dir=<path to the docs dir> | Default: reuters-out
* reuters.doc.maker.store.bytes=true|false Default: false
*
*/ */
public class ReutersDocMaker extends BasicDocMaker { public class ReutersDocMaker extends BasicDocMaker {
@ -38,7 +43,7 @@ public class ReutersDocMaker extends BasicDocMaker {
private ArrayList inputFiles = new ArrayList(); private ArrayList inputFiles = new ArrayList();
private int nextFile = 0; private int nextFile = 0;
private int iteration=0; private int iteration=0;
private boolean storeBytes = false;
/* (non-Javadoc) /* (non-Javadoc)
* @see SimpleDocMaker#setConfig(java.util.Properties) * @see SimpleDocMaker#setConfig(java.util.Properties)
*/ */
@ -46,6 +51,8 @@ public class ReutersDocMaker extends BasicDocMaker {
super.setConfig(config); super.setConfig(config);
String d = config.get("docs.dir","reuters-out"); String d = config.get("docs.dir","reuters-out");
dataDir = new File(new File("work"),d); dataDir = new File(new File("work"),d);
storeBytes = config.get("reuters.doc.maker.store.bytes", false);
collectFiles(dataDir,inputFiles); collectFiles(dataDir,inputFiles);
if (inputFiles.size()==0) { if (inputFiles.size()==0) {
throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath()); throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
@ -89,6 +96,10 @@ public class ReutersDocMaker extends BasicDocMaker {
dd.name = name; dd.name = name;
dd.title = title; dd.title = title;
dd.body = bodyBuf.toString(); dd.body = bodyBuf.toString();
if (storeBytes == true)
{
dd.bytes = dd.body.getBytes("UTF-8");
}
return dd; return dd;
} }

View File

@ -26,6 +26,7 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import java.io.IOException;
/** /**
@ -77,16 +78,14 @@ public abstract class ReadTask extends PerfTask {
//System.out.println("searched: "+q); //System.out.println("searched: "+q);
if (withTraverse() && hits!=null) { if (withTraverse() && hits!=null) {
Document doc = null;
int traversalSize = Math.min(hits.length(), traversalSize()); int traversalSize = Math.min(hits.length(), traversalSize());
if (traversalSize > 0) { if (traversalSize > 0) {
boolean retrieve = withRetrieve();
for (int m = 0; m < hits.length(); m++) { for (int m = 0; m < hits.length(); m++) {
int id = hits.id(m); int id = hits.id(m);
res++; res++;
if (retrieve) {
if (withRetrieve()) { res += retrieveDoc(ir, id);
doc = ir.document(id);
res += (doc==null ? 0 : 1);
} }
} }
} }
@ -101,6 +100,10 @@ public abstract class ReadTask extends PerfTask {
return res; return res;
} }
/**
 * Loads the document with the given id from the reader and reports
 * whether anything was returned.
 *
 * Protected so that a subclass can substitute its own way of loading
 * the document.
 *
 * @param ir reader to load the document from
 * @param id id of the document to load
 * @return 1 if the reader returned a document, 0 if it returned null
 * @throws IOException if the reader fails while loading the document
 */
protected int retrieveDoc(IndexReader ir, int id) throws IOException {
  if (ir.document(id) == null) {
    return 0;
  }
  return 1;
}
/** /**
* Return query maker used for this task. * Return query maker used for this task.
*/ */

View File

@ -0,0 +1,55 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Created by IntelliJ IDEA.
* User: Grant Ingersoll
* Date: Mar 22, 2007
* Time: 10:04:49 PM
* $Id:$
* Copyright 2007. Center For Natural Language Processing
*/
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.index.IndexReader;
import java.util.StringTokenizer;
import java.util.Set;
import java.util.HashSet;
import java.util.Collections;
import java.io.IOException;
/**
* Search and Traverse and Retrieve docs task using a SetBasedFieldSelector.
*
* <p>Note: This task reuses the reader if it is already open.
* Otherwise a reader is opened at start and closed at the end.
*
* Takes optional param: comma separated list of Fields to load.
*/
public class SearchTravRetLoadFieldSelectorTask extends SearchTravTask {

  /**
   * Selector passed to the reader on every document retrieval.
   * It is only assigned in {@link #setParams(String)}; until that is
   * called it is null.
   */
  protected FieldSelector fieldSelector;

  /**
   * @param runData run data handed through to the parent task
   */
  public SearchTravRetLoadFieldSelectorTask(PerfRunData runData) {
    super(runData);
  }

  /**
   * This task always retrieves the documents it traverses.
   *
   * @return always true
   */
  public boolean withRetrieve() {
    return true;
  }

  /**
   * Loads the document through the configured field selector instead of
   * loading it without one.
   *
   * @param ir reader to load the document from
   * @param id id of the document to load
   * @return 1 if the reader returned a document, 0 if it returned null
   * @throws IOException if the reader fails while loading the document
   */
  protected int retrieveDoc(IndexReader ir, int id) throws IOException {
    return (ir.document(id, fieldSelector) == null ? 0 : 1);
  }

  /**
   * Builds the field selector from a comma separated list of field names.
   *
   * Whitespace around each name is dropped and empty entries are skipped,
   * so "title, body" selects the fields "title" and "body". A null
   * argument is treated like an empty list rather than failing.
   *
   * @param params comma separated names of the fields to load; may be null
   */
  public void setParams(String params) {
    Set fieldsToLoad = new HashSet();
    if (params != null) {
      for (StringTokenizer tokenizer = new StringTokenizer(params, ","); tokenizer.hasMoreTokens();) {
        // Trim so that a space after a comma does not become part of the field name.
        String s = tokenizer.nextToken().trim();
        if (s.length() > 0) {
          fieldsToLoad.add(s);
        }
      }
    }
    // The second set (fields to load lazily) is intentionally left empty.
    fieldSelector = new SetBasedFieldSelector(fieldsToLoad, Collections.EMPTY_SET);
  }
}