mirror of https://github.com/apache/lucene.git
LUCENE-837:
Added optional bytes field to store on the Document. Enabled ReutersDocMaker w/ the ability to store byte data in a field. If the param is set (see the javadocs) it will store the contents of the body as a UTF-8 byte array. Then, the SearchTravRetLoadFieldSelectorTask (whew) can take in parameters specifying what fields to load (others are ignored by default) git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@521569 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
34b560603a
commit
f4fffb3491
|
@ -4,6 +4,12 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
|
||||||
|
|
||||||
$Id:$
|
$Id:$
|
||||||
|
|
||||||
|
3/22/07
|
||||||
|
|
||||||
|
-Moved withRetrieve() call out of the loop in ReadTask
|
||||||
|
-Added SearchTravRetLoadFieldSelectorTask to help benchmark some of the FieldSelector capabilities
|
||||||
|
-Added options to store content bytes on the Reuters Doc (and others, but Reuters is the only one w/ it enabled)
|
||||||
|
|
||||||
3/21/07
|
3/21/07
|
||||||
|
|
||||||
Tests (for benchmarking code correctness) were added - LUCENE-840.
|
Tests (for benchmarking code correctness) were added - LUCENE-840.
|
||||||
|
|
|
@ -17,18 +17,18 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.benchmark.byTask.utils.Format;
|
||||||
|
import org.apache.lucene.document.DateTools;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
|
||||||
import org.apache.lucene.document.DateTools;
|
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
import org.apache.lucene.document.Field;
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Format;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create documents for the test.
|
* Create documents for the test.
|
||||||
|
@ -44,6 +44,7 @@ public abstract class BasicDocMaker implements DocMaker {
|
||||||
Date date;
|
Date date;
|
||||||
String title;
|
String title;
|
||||||
String body;
|
String body;
|
||||||
|
byte [] bytes;
|
||||||
Properties props;
|
Properties props;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -123,6 +124,10 @@ public abstract class BasicDocMaker implements DocMaker {
|
||||||
}
|
}
|
||||||
doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
|
doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
|
||||||
}
|
}
|
||||||
|
if (docData.bytes != null && docData.bytes.length != 0)
|
||||||
|
{
|
||||||
|
doc.add(new Field("bytes", docData.bytes, Field.Store.YES));
|
||||||
|
}
|
||||||
if (docData.props!=null) {
|
if (docData.props!=null) {
|
||||||
for (Iterator it = docData.props.keySet().iterator(); it.hasNext(); ) {
|
for (Iterator it = docData.props.keySet().iterator(); it.hasNext(); ) {
|
||||||
String key = (String) it.next();
|
String key = (String) it.next();
|
||||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
|
@ -25,11 +27,14 @@ import java.text.SimpleDateFormat;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A DocMaker using the Reuters collection for its input.
|
* A DocMaker using the Reuters collection for its input.
|
||||||
|
*
|
||||||
|
* Config properties:
|
||||||
|
* docs.dir=<path to the docs dir| Default: reuters-out>
|
||||||
|
* reuters.doc.maker.store.bytes=true|false Default: false
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
public class ReutersDocMaker extends BasicDocMaker {
|
public class ReutersDocMaker extends BasicDocMaker {
|
||||||
|
|
||||||
|
@ -38,7 +43,7 @@ public class ReutersDocMaker extends BasicDocMaker {
|
||||||
private ArrayList inputFiles = new ArrayList();
|
private ArrayList inputFiles = new ArrayList();
|
||||||
private int nextFile = 0;
|
private int nextFile = 0;
|
||||||
private int iteration=0;
|
private int iteration=0;
|
||||||
|
private boolean storeBytes = false;
|
||||||
/* (non-Javadoc)
|
/* (non-Javadoc)
|
||||||
* @see SimpleDocMaker#setConfig(java.util.Properties)
|
* @see SimpleDocMaker#setConfig(java.util.Properties)
|
||||||
*/
|
*/
|
||||||
|
@ -46,6 +51,8 @@ public class ReutersDocMaker extends BasicDocMaker {
|
||||||
super.setConfig(config);
|
super.setConfig(config);
|
||||||
String d = config.get("docs.dir","reuters-out");
|
String d = config.get("docs.dir","reuters-out");
|
||||||
dataDir = new File(new File("work"),d);
|
dataDir = new File(new File("work"),d);
|
||||||
|
storeBytes = config.get("reuters.doc.maker.store.bytes", false);
|
||||||
|
|
||||||
collectFiles(dataDir,inputFiles);
|
collectFiles(dataDir,inputFiles);
|
||||||
if (inputFiles.size()==0) {
|
if (inputFiles.size()==0) {
|
||||||
throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
|
throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
|
||||||
|
@ -89,6 +96,10 @@ public class ReutersDocMaker extends BasicDocMaker {
|
||||||
dd.name = name;
|
dd.name = name;
|
||||||
dd.title = title;
|
dd.title = title;
|
||||||
dd.body = bodyBuf.toString();
|
dd.body = bodyBuf.toString();
|
||||||
|
if (storeBytes == true)
|
||||||
|
{
|
||||||
|
dd.bytes = dd.body.getBytes("UTF-8");
|
||||||
|
}
|
||||||
return dd;
|
return dd;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -77,16 +78,14 @@ public abstract class ReadTask extends PerfTask {
|
||||||
//System.out.println("searched: "+q);
|
//System.out.println("searched: "+q);
|
||||||
|
|
||||||
if (withTraverse() && hits!=null) {
|
if (withTraverse() && hits!=null) {
|
||||||
Document doc = null;
|
|
||||||
int traversalSize = Math.min(hits.length(), traversalSize());
|
int traversalSize = Math.min(hits.length(), traversalSize());
|
||||||
if (traversalSize > 0) {
|
if (traversalSize > 0) {
|
||||||
|
boolean retrieve = withRetrieve();
|
||||||
for (int m = 0; m < hits.length(); m++) {
|
for (int m = 0; m < hits.length(); m++) {
|
||||||
int id = hits.id(m);
|
int id = hits.id(m);
|
||||||
res++;
|
res++;
|
||||||
|
if (retrieve) {
|
||||||
if (withRetrieve()) {
|
res += retrieveDoc(ir, id);
|
||||||
doc = ir.document(id);
|
|
||||||
res += (doc==null ? 0 : 1);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -101,6 +100,10 @@ public abstract class ReadTask extends PerfTask {
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected int retrieveDoc(IndexReader ir, int id) throws IOException {
|
||||||
|
return (ir.document(id) == null ? 0 : 1);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return query maker used for this task.
|
* Return query maker used for this task.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -0,0 +1,55 @@
|
||||||
|
package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: Grant Ingersoll
|
||||||
|
* Date: Mar 22, 2007
|
||||||
|
* Time: 10:04:49 PM
|
||||||
|
* $Id:$
|
||||||
|
* Copyright 2007. Center For Natural Language Processing
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
|
import org.apache.lucene.document.FieldSelector;
|
||||||
|
import org.apache.lucene.document.SetBasedFieldSelector;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
|
||||||
|
import java.util.StringTokenizer;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Search and Traverse and Retrieve docs task using a SetBasedFieldSelector.
|
||||||
|
*
|
||||||
|
* <p>Note: This task reuses the reader if it is already open.
|
||||||
|
* Otherwise a reader is opened at start and closed at the end.
|
||||||
|
*
|
||||||
|
* Takes optional param: comma separated list of Fields to load.
|
||||||
|
*/
|
||||||
|
public class SearchTravRetLoadFieldSelectorTask extends SearchTravTask {
|
||||||
|
|
||||||
|
protected FieldSelector fieldSelector;
|
||||||
|
public SearchTravRetLoadFieldSelectorTask(PerfRunData runData) {
|
||||||
|
super(runData);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean withRetrieve() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
protected int retrieveDoc(IndexReader ir, int id) throws IOException {
|
||||||
|
return (ir.document(id, fieldSelector) == null ? 0 : 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setParams(String params) {
|
||||||
|
Set fieldsToLoad = new HashSet();
|
||||||
|
for (StringTokenizer tokenizer = new StringTokenizer(params, ","); tokenizer.hasMoreTokens();) {
|
||||||
|
String s = tokenizer.nextToken();
|
||||||
|
fieldsToLoad.add(s);
|
||||||
|
}
|
||||||
|
fieldSelector = new SetBasedFieldSelector(fieldsToLoad, Collections.EMPTY_SET);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue