Applied 788 and 790 from Doron Cohen. Ran both the micro-standard and the task runs and results look reasonable.

Thanks, Doron

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@506093 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2007-02-11 18:59:22 +00:00
parent 881ab33b81
commit bb66099414
9 changed files with 75 additions and 259 deletions

View File

@ -8,4 +8,7 @@ $Id:$
1. Committed Doron Cohen's benchmarking contribution, which provides an easily expandable task based approach to benchmarking. See the javadocs for information. (Doron Cohen via Grant Ingersoll)
2. Added this file.
2. Added this file.
3. 2/11/07: LUCENE-790 and 788: Fixed Locale issue with date formatter. Fixed some minor issues with benchmarking by task. Added a dependency
on the Lucene demo to the build classpath. (Doron Cohen, Grant Ingersoll)

View File

@ -8,6 +8,7 @@
<import file="../contrib-build.xml"/>
<property name="working.dir" value="work"/>
<target name="check-files">
<available file="temp/news20.tar.gz" property="news20.exists"/>
@ -84,8 +85,11 @@
<property name="collections.jar" value="commons-collections-3.1.jar"/>
<property name="logging.jar" value="commons-logging-1.0.4.jar"/>
<property name="bean-utils.jar" value="commons-beanutils-1.7.0.jar"/>
<property name="lucene-demos.jar" location="${common.dir}/build/lucene-demos-${version}.jar"/>
<path id="classpath">
<pathelement path="${lucene.jar}"/>
<pathelement path="${lucene-demos.jar}"/>
<pathelement path="${basedir}/lib/${digester.jar}"/>
<pathelement path="${basedir}/lib/${collections.jar}"/>
<pathelement path="${basedir}/lib/${logging.jar}"/>

View File

@ -27,8 +27,17 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
* Run the benchmark algorithm.
* <p>Usage: java Benchmark algorithm-file
* <ol>
* <li>Read algorithm.
* <li> Run the algorithm.
* <li>Read algorithm.</li>
* <li> Run the algorithm.</li>
* </ol>
* Things to be added/fixed in "Benchmarking by tasks":
* <ol>
* <li>TODO - report into Excel and/or graphed view.</li>
* <li>TODO - perf comparison between Lucene releases over the years.</li>
* <li>TODO - perf report adequate to include in Lucene nightly build site? (so we can easily track performance changes.)</li>
* <li>TODO - add overall time control for repeated execution (vs. current by-count only).</li>
* <li>TODO - query maker that is based on index statistics.</li>
* <li>TODO - properties documentation - each task should document the properties it relies on.</li>
* </ol>
*/
public class Benchmark {

View File

@ -23,24 +23,21 @@ import java.io.FileReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.util.Locale;
import org.apache.lucene.benchmark.byTask.utils.Config;
/**
* A DocMaker using the Reuters collection for its input.
*/
public class ReutersDocMaker extends SimpleDocMaker {
public class ReutersDocMaker extends BasicDocMaker {
private DateFormat dateFormat;
private File dataDir = null;
private ArrayList txtFiles = new ArrayList();
private ArrayList inputFiles = new ArrayList();
private int nextFile = 0;
private int round=0;
private int count = 0;
private int iteration=0;
/* (non-Javadoc)
* @see SimpleDocMaker#setConfig(java.util.Properties)
@ -49,48 +46,28 @@ public class ReutersDocMaker extends SimpleDocMaker {
super.setConfig(config);
String d = config.get("docs.dir","reuters-out");
dataDir = new File(new File("work"),d);
addFiles(dataDir);
if (txtFiles.size()==0) {
collectFiles(dataDir,inputFiles);
if (inputFiles.size()==0) {
throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
}
// date format: 30-MAR-1987 14:22:36.87
dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS");
dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
dateFormat.setLenient(true);
}
/**
 * Recursively collects every readable, non-directory entry under <code>f</code>
 * into <code>txtFiles</code> and adds each collected file's length to the
 * unique-bytes total. Unreadable entries are skipped silently.
 *
 * @param f a file or directory to scan
 */
private void addFiles(File f) {
  if (!f.canRead()) {
    return;
  }
  if (f.isDirectory()) {
    // listFiles() returns null (not an empty array) when the directory
    // cannot be listed, e.g. on an I/O error; treat that as "no entries"
    // instead of failing with a NullPointerException.
    File files[] = f.listFiles();
    if (files == null) {
      return;
    }
    for (int i = 0; i < files.length; i++) {
      addFiles(files[i]);
    }
    return;
  }
  txtFiles.add(f);
  addUniqueBytes(f.length());
}
/* (non-Javadoc)
* @see SimpleDocMaker#makeDocument()
*/
public Document makeDocument() throws Exception {
protected DocData getNextDocData() throws Exception {
File f = null;
String name = null;
synchronized (this) {
f = (File) txtFiles.get(nextFile++);
name = f.getCanonicalPath()+"_"+round;
if (nextFile >= txtFiles.size()) {
f = (File) inputFiles.get(nextFile++);
name = f.getCanonicalPath()+"_"+iteration;
if (nextFile >= inputFiles.size()) {
// exhausted files, start a new round
nextFile = 0;
round++;
iteration++;
}
}
Document doc = new Document();
doc.add(new Field("name",name,storeVal,indexVal,termVecVal));
BufferedReader reader = new BufferedReader(new FileReader(f));
String line = null;
//First line is the date, 3rd is the title, rest is body
@ -98,27 +75,23 @@ public class ReutersDocMaker extends SimpleDocMaker {
reader.readLine();//skip an empty line
String title = reader.readLine();
reader.readLine();//skip an empty line
StringBuffer body = new StringBuffer(1024);
StringBuffer bodyBuf = new StringBuffer(1024);
while ((line = reader.readLine()) != null) {
body.append(line).append(' ');
bodyBuf.append(line).append(' ');
}
Date date = dateFormat.parse(dateStr.trim());
doc.add(new Field("date", DateTools.dateToString(date, DateTools.Resolution.SECOND),
Field.Store.YES, Field.Index.UN_TOKENIZED));
if (title != null) {
doc.add(new Field("title", title, storeVal,indexVal,termVecVal));
}
if (body.length() > 0) {
doc.add(new Field("body", body.toString(), storeVal,indexVal,termVecVal));
}
count++;
addBytes(f.length());
return doc;
DocData dd = new DocData();
dd.date = dateFormat.parse(dateStr.trim());
dd.name = name;
dd.title = title;
dd.body = bodyBuf.toString();
return dd;
}
/*
* (non-Javadoc)
* @see DocMaker#resetInputs()
@ -126,8 +99,7 @@ public class ReutersDocMaker extends SimpleDocMaker {
public synchronized void resetInputs() {
super.resetInputs();
nextFile = 0;
round = 0;
count = 0;
iteration = 0;
}
/*
@ -135,22 +107,7 @@ public class ReutersDocMaker extends SimpleDocMaker {
* @see DocMaker#numUniqueTexts()
*/
public int numUniqueTexts() {
return txtFiles.size();
return inputFiles.size();
}
/*
* (non-Javadoc)
* @see DocMaker#getCount()
*/
public int getCount() {
  // running counter kept in the count field of this maker
  final int current = this.count;
  return current;
}
/*
* (non-Javadoc)
* @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument(int)
*/
public Document makeDocument(int size) throws Exception {
  // Fixed-size documents are not available from this maker. Signal that with
  // the standard unchecked type rather than a raw Exception; callers that
  // catch Exception still see it, and the declared signature is unchanged.
  throw new UnsupportedOperationException(this+".makeDocument (int size) is not supported!");
}
}

View File

@ -17,29 +17,13 @@ package org.apache.lucene.benchmark.byTask.feeds;
* limitations under the License.
*/
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.Format;
/**
* Create documents for the test
*/
public class SimpleDocMaker implements DocMaker {
public class SimpleDocMaker extends BasicDocMaker {
static final String BODY_FIELD = "body";
private int docID = 0;
private long numBytes = 0;
private long numUniqueBytes = 0;
protected Config config;
private int nextDocTextPosition = 0; // for creating docs of fixed size.
protected Field.Store storeVal = Field.Store.NO;
protected Field.Index indexVal = Field.Index.TOKENIZED;
protected Field.TermVector termVecVal = Field.TermVector.NO;
static final String DOC_TEXT = // from a public first aid info at http://firstaid.ie.eu.org
"Well it may be a little dramatic but sometimes it true. " +
"If you call the emergency medical services to an incident, " +
@ -52,100 +36,18 @@ public class SimpleDocMaker implements DocMaker {
"ones and the stranger whose life may depend on you being in the " +
"right place at the right time with the right knowledge.";
private static int DOC_TEXT_LENGTH = DOC_TEXT.length();
/*
* (non-Javadoc)
* @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument()
*/
public Document makeDocument () throws Exception {
  // a size of 0 asks the sized variant for a document without a fixed size
  final int noFixedSize = 0;
  return makeDocument(noFixedSize);
}
/*
* (non-Javadoc)
* @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument(int)
*/
public Document makeDocument(int size) throws Exception {
  // every document gets a fresh id, stored in its own "docid" field
  final int id = newdocid();
  final Document result = new Document();
  final Field idField = new Field("docid", "doc"+id, storeVal, indexVal, termVecVal);
  result.add(idField);

  // body = fixed prefix + id + generated text of the requested size
  final String text = createDocText(size);
  final String body = "synthetic body text"+id+" "+text;
  result.add(new Field(BODY_FIELD, body, storeVal, indexVal, termVecVal));

  // only the generated text is counted, not the prefix
  addBytes(text.length()); // should multiply by 2 here?
  return result;
}
/**
 * Reserves the next <code>fixedDocSize</code> characters of the cyclic sample
 * text and advances the shared read position past them.
 *
 * @param fixedDocSize number of characters to reserve
 * @return a three-element array: start offset, end offset, and the number of
 *         times the reservation wrapped past the end of the sample text
 */
private synchronized int[] nextDocText(int fixedDocSize) {
  final int start = nextDocTextPosition;
  int end = start;
  int wrapCount = 0;
  int reserved = 0;
  while (reserved < fixedDocSize) {
    // characters left between the current end and the end of the sample text
    final int tail = DOC_TEXT_LENGTH - end;
    if (reserved + tail <= fixedDocSize) {
      // take the whole tail and continue from the beginning of the text
      reserved += tail;
      end = 0;
      wrapCount++;
    } else {
      // the tail is more than is still needed: take just the remainder
      final int remainder = fixedDocSize - reserved;
      reserved += remainder;
      end += remainder;
    }
  }
  nextDocTextPosition = end;
  return new int[] {start, end, wrapCount};
}
/**
 * Builds the text for one document.
 *
 * @param fixedDocSize requested length; zero or negative means no fixed size
 * @return the whole sample text when no fixed size is requested, otherwise
 *         text assembled from the cyclic sample text over the span reserved
 *         by <code>nextDocText</code>
 */
private String createDocText(int fixedDocSize) {
  if (fixedDocSize <= 0) {
    // no fixed doc size requirement
    return DOC_TEXT;
  }
  // create a document with fixed doc size
  final int[] span = nextDocText(fixedDocSize);
  int begin = span[0];
  final int end = span[1];
  final StringBuffer text = new StringBuffer();
  // each wrap contributes everything from the current start to the end of
  // the sample text; later pieces start at offset 0
  for (int wrapsLeft = span[2]; wrapsLeft > 0; wrapsLeft--) {
    text.append(DOC_TEXT.substring(begin));
    begin = 0;
  }
  text.append(DOC_TEXT.substring(begin, end));
  return text.toString();
}
// return a new docid
private synchronized int newdocid() {
  // hand out the current id and advance the counter for the next caller
  final int assigned = docID;
  docID = assigned + 1;
  return assigned;
}
/* (non-Javadoc)
* @see DocMaker#setConfig(Config)
*/
public void setConfig(Config config) {
  this.config = config;

  // read all three switches first, then translate them to Field settings
  final boolean wantStored = config.get("doc.stored",false);
  final boolean wantTokenized = config.get("doc.tokenized",true);
  final boolean wantTermVector = config.get("doc.term.vector",false);

  if (wantStored) {
    storeVal = Field.Store.YES;
  } else {
    storeVal = Field.Store.NO;
  }

  if (wantTokenized) {
    indexVal = Field.Index.TOKENIZED;
  } else {
    indexVal = Field.Index.UN_TOKENIZED;
  }

  if (wantTermVector) {
    termVecVal = Field.TermVector.YES;
  } else {
    termVecVal = Field.TermVector.NO;
  }
}
/*
* (non-Javadoc)
* @see DocMaker#resetInputs()
*/
public synchronized void resetInputs() {
printDocStatistics();
super.resetInputs();
docID = 0;
numBytes = 0;
}
/*
@ -156,72 +58,12 @@ public class SimpleDocMaker implements DocMaker {
return 0; // not applicable
}
/*
* (non-Javadoc)
* @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#numUniqueBytes()
*/
public long numUniqueBytes() {
  // total accumulated through addUniqueBytes
  final long total = this.numUniqueBytes;
  return total;
}
/*
* (non-Javadoc)
* @see DocMaker#getCount()
*/
public int getCount() {
  // the next doc id to hand out is also the number of ids handed out so far
  final int handedOut = this.docID;
  return handedOut;
}
/*
* (non-Javadoc)
* @see DocMaker#getByteCount()
*/
public long getByteCount() {
  // total accumulated through addBytes
  final long total = this.numBytes;
  return total;
}
/**
 * Adds <code>n</code> to the running total reported by <code>numUniqueBytes()</code>.
 *
 * @param n number of bytes to add
 */
protected void addUniqueBytes (long n) {
  numUniqueBytes = numUniqueBytes + n;
}
/**
 * Adds <code>n</code> to the running total reported by <code>getByteCount()</code>.
 *
 * @param n number of bytes to add
 */
protected void addBytes (long n) {
  numBytes = numBytes + n;
}
/*
* (non-Javadoc)
* @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#printDocStatistics()
*/
private int lastPrintedNumUniqueTexts = 0;
private long lastPrintedNumUniqueBytes = 0;
private int printNum = 0;
public void printDocStatistics() {
boolean print = false;
String col = " ";
StringBuffer sb = new StringBuffer();
String newline = System.getProperty("line.separator");
sb.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline);
int nut = numUniqueTexts();
if (nut > lastPrintedNumUniqueTexts) {
print = true;
sb.append("total bytes of unique texts: ").append(Format.format(0,nut,col)).append(newline);
lastPrintedNumUniqueTexts = nut;
}
long nub = numUniqueBytes();
if (nub > lastPrintedNumUniqueBytes) {
print = true;
sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline);
lastPrintedNumUniqueBytes = nub;
}
if (getCount()>0) {
print = true;
sb.append("num files added since last inputs reset: ").append(Format.format(0,getCount(),col)).append(newline);
sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getByteCount(),col)).append(newline);
}
if (print) {
System.out.println(sb.append(newline).toString());
printNum++;
}
protected DocData getNextDocData() {
  // every document carries the same sample text; only the name differs
  final DocData data = new DocData();
  final String docName = "doc"+newdocid();
  data.name = docName;
  data.body = DOC_TEXT;

  // account for the text just handed out
  final int textLength = DOC_TEXT.length();
  addBytes(textLength);
  return data;
}
}

View File

@ -44,7 +44,7 @@ Contained packages:
</tr>
<tr>
<td><a href="feeds/package-summary.html">feeds</a></td>
<td>Sources foe benchmark inputs: documents and queries.</td>
<td>Sources for benchmark inputs: documents and queries.</td>
</tr>
<tr>
<td><a href="utils/package-summary.html">utils</a></td>
@ -92,7 +92,7 @@ Easiest way to run a benchmarks is using the predefined ant task:
<br>- would run the <code>compound-penalty.alg</code> "algorithm".
</li>
<li>ant run-task -Dtask.alg=[full-path-to-your-alg-file]
<br>- would run the <code>your perf test</code> "algorithm".
<br>- would run <code>your perf test</code> "algorithm".
</li>
<li>java org.apache.lucene.benchmark.byTask.programmatic.Sample
<br>- would run a performance test programmatically - without using an alg file.
@ -109,7 +109,7 @@ otherwise, you can extend the framework to meet your needs, as explained herein.
<p>
Each benchmark run has a DocMaker and a QueryMaker. These two should usually match, so
that "meaningful" queries are used for a certain collection.
Properties defined at the header of the alg file define which "makers" should be used.
Properties set at the header of the alg file define which "makers" should be used.
You can also specify your own makers, implementing the DocMaker and QueryMaker interfaces.
</p>
@ -275,8 +275,8 @@ regular index/search work tasks, report tasks, and control tasks.
<br>This increments a global "round counter". All task runs that would start now would
record the new, updated round counter as their round number. This would appear in reports.
In particular, see <font color="#FF0066">RepSumByNameRound</font> above.
<br>An additional effect of NewRound, is that numeric and boolean properties defined in the
.properties file as a sequence of values, e.g. <font color="#FF0066">merge.factor=mrg:10:100:10:100</font> would
<br>An additional effect of NewRound, is that numeric and boolean properties defined (at the head
of the .alg file) as a sequence of values, e.g. <font color="#FF0066">merge.factor=mrg:10:100:10:100</font> would
increment (cyclic) to the next value.
Note: this would also be reflected in the reports, in this case under a column that would be named "mrg".
</li>
@ -368,7 +368,7 @@ Some of the currently defined properties are:
(Make sure it is no shorter than any value in the sequence).
<ul>
<li><font color="#FF0066">max.buffered</font>
<br>Example: buffered=buf.10.10.100.100 -
<br>Example: max.buffered=buf:10:10:100:100 -
this would define using maxBufferedDocs of 10 in iterations 0 and 1,
and 100 in iterations 2 and 3.
</li>

View File

@ -51,6 +51,10 @@ public class Sample {
// task to report
RepSumByNameTask rep = new RepSumByNameTask(runData);
top.addTask(rep);
// print algorithm
System.out.println(top.toString());
// execute
top.doLogic();
}

View File

@ -210,6 +210,12 @@ public class TaskSequence extends PerfTask {
*/
public void setNoChildReport() {
  letChildReport = false;
  // push the same setting down into every nested sequence; other kinds of
  // child task are left alone
  final Iterator children = tasks.iterator();
  while (children.hasNext()) {
    final PerfTask child = (PerfTask) children.next();
    if (!(child instanceof TaskSequence)) {
      continue;
    }
    final TaskSequence nested = (TaskSequence) child;
    nested.setNoChildReport();
  }
}
/**

View File

@ -1,19 +1,5 @@
package org.apache.lucene.benchmark.standard;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStream;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.benchmark.AbstractBenchmarker;
@ -33,6 +19,11 @@ import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.FSDirectory;
import java.io.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.*;
/**
* Copyright 2005 The Apache Software Foundation
*
@ -66,7 +57,7 @@ public class StandardBenchmarker extends AbstractBenchmarker implements Benchmar
public static final String INDEX_DIR = "index";
//30-MAR-1987 14:22:36.87
private static DateFormat format = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS");
private static DateFormat format = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
//DateFormat.getDateTimeInstance(DateFormat.MEDIUM, DateFormat.SHORT);
static{
format.setLenient(true);