LUCENE-1139: various additions/fixes to contrib/benchmark

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@613536 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2008-01-20 11:31:38 +00:00
parent 7f4bdd5a80
commit 354a3175d5
8 changed files with 328 additions and 65 deletions

contrib/benchmark/CHANGES.txt

@@ -4,6 +4,14 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$
01/20/08
LUCENE-1139: various fixes
- add merge.scheduler, merge.policy config properties (see the example fragment below)
- refactor Open/CreateIndexTask to share setting config on IndexWriter
- add doc.reuse.fields=true|false for LineDocMaker
- OptimizeTask now takes int param to call optimize(int maxNumSegments)
- CloseIndexTask now takes bool param to call close(false) (abort running merges)
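
For example, a hypothetical .alg fragment exercising the new knobs together
(class choices and repeat counts are illustrative, not defaults):

  merge.scheduler=org.apache.lucene.index.SerialMergeScheduler
  merge.policy=org.apache.lucene.index.LogDocMergePolicy
  doc.reuse.fields=false
  # ----- alg
  CreateIndex
  { "AddDocs" AddDoc > : 200
  Optimize(5)
  CloseIndex(false)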
01/03/08
LUCENE-1116: quality package improvements:
- add MRR computation;

contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java

@@ -37,6 +37,7 @@ import java.io.InputStreamReader;
*
* Config properties:
* docs.file=&lt;path to the file&gt;
* doc.reuse.fields=true|false (default true)
*/
public class LineDocMaker extends BasicDocMaker {
@@ -46,6 +47,9 @@ public class LineDocMaker extends BasicDocMaker {
private String fileName;
private static int READER_BUFFER_BYTES = 64*1024;
private final DocState localDocState = new DocState();
private boolean doReuseFields = true;
class DocState {
Document doc;
@@ -84,20 +88,47 @@ public class LineDocMaker extends BasicDocMaker {
public Document setFields(String line) {
  // title <TAB> date <TAB> body <NEWLINE>
  final String title, date, body;

  int spot = line.indexOf(SEP);
  if (spot != -1) {
    title = line.substring(0, spot);
    int spot2 = line.indexOf(SEP, 1+spot);
    if (spot2 != -1) {
      date = line.substring(1+spot, spot2);
      body = line.substring(1+spot2, line.length());
    } else
      date = body = "";
  } else
    title = date = body = "";

  if (doReuseFields) {
    // Reuse the shared Document/Field instances: just swap in the new values.
    titleField.setValue(title);
    dateField.setValue(date);
    bodyField.setValue(body);
    return doc;
  } else {
    // Build fresh Field and Document instances for every line.
    Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD,
                                      title,
                                      storeVal,
                                      Field.Index.TOKENIZED,
                                      termVecVal);
    Field localBodyField = new Field(BasicDocMaker.BODY_FIELD,
                                     body,
                                     storeVal,
                                     Field.Index.TOKENIZED,
                                     termVecVal);
    // The date goes into its own DATE_FIELD, not BODY_FIELD.
    Field localDateField = new Field(BasicDocMaker.DATE_FIELD,
                                     date,
                                     storeVal,
                                     Field.Index.TOKENIZED,
                                     termVecVal);
    Document localDoc = new Document();
    localDoc.add(localBodyField);
    localDoc.add(localTitleField);
    localDoc.add(localDateField);
    return localDoc;
  }
}
}
@@ -131,7 +162,10 @@ public class LineDocMaker extends BasicDocMaker {
}
}
if (doReuseFields)
  return getDocState().setFields(line);
else
  return localDocState.setFields(line);
}
public Document makeDocument(int size) throws Exception {
@@ -146,6 +180,11 @@ public class LineDocMaker extends BasicDocMaker {
openFile();
}
public void setConfig(Config config) {
super.setConfig(config);
doReuseFields = config.get("doc.reuse.fields", true);
}
synchronized void openFile() {
try {
if (fileIn != null)
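
The line file consumed above is plain text, one document per line, with fields
separated by tabs. A minimal hypothetical helper (not part of this commit;
class and file names are invented) that writes a file in the expected format:

  import java.io.FileWriter;
  import java.io.PrintWriter;

  public class MakeLineFile {
    public static void main(String[] args) throws Exception {
      PrintWriter out = new PrintWriter(new FileWriter("docs.txt"));
      // One document per line: title <TAB> date <TAB> body <NEWLINE>
      out.println("Apache Lucene\t01/20/08\tLucene is a Java search library.");
      out.println("Benchmark contrib\t01/20/08\tMeasures indexing and search performance.");
      out.close();
    }
  }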

contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CloseIndexTask.java

@@ -25,6 +25,7 @@ import org.apache.lucene.index.IndexWriter;
/**
* Close index writer.
* <br>Other side effects: index writer object in perfRunData is nullified.
* <br>Takes optional param "doWait": if false, then close(false) is called.
*/
public class CloseIndexTask extends PerfTask {
@@ -32,13 +33,23 @@ public class CloseIndexTask extends PerfTask {
super(runData);
}
boolean doWait = true;
public int doLogic() throws IOException {
IndexWriter iw = getRunData().getIndexWriter();
if (iw!=null) {
iw.close(doWait);
}
getRunData().setIndexWriter(null);
return 1;
}
public void setParams(String params) {
super.setParams(params);
doWait = Boolean.valueOf(params).booleanValue();
}
public boolean supportsParams() {
return true;
}
}
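
In an .alg file the parameter is passed inline, as testCloseIndexFalse below
does; a hypothetical fragment:

  { "AddDocs" AddDoc > : *
  CloseIndex(false)

Passing false closes the writer without waiting for running background merges
(they are aborted); omitting the parameter keeps the default doWait=true.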

contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java

@@ -17,11 +17,11 @@ package org.apache.lucene.benchmark.byTask.tasks;
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.index.MergePolicy;
import java.io.IOException;
@@ -39,28 +39,67 @@ public class CreateIndexTask extends PerfTask {
super(runData);
}
public static void setIndexWriterConfig(IndexWriter writer, Config config) throws IOException {

  writer.setUseCompoundFile(config.get("compound",true));
  writer.setMergeFactor(config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR));
  writer.setMaxFieldLength(config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH));

  final double ramBuffer = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
  final int maxBuffered = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
  // IndexWriter refuses to have both flush triggers disabled at once, so the
  // order of these two calls matters: always enable one trigger before
  // disabling the other.
  if (maxBuffered == IndexWriter.DISABLE_AUTO_FLUSH) {
    writer.setRAMBufferSizeMB(ramBuffer);
    writer.setMaxBufferedDocs(maxBuffered);
  } else {
    writer.setMaxBufferedDocs(maxBuffered);
    writer.setRAMBufferSizeMB(ramBuffer);
  }

  final String mergeScheduler = config.get("merge.scheduler",
                                           "org.apache.lucene.index.ConcurrentMergeScheduler");
  RuntimeException err = null;
  try {
    writer.setMergeScheduler((MergeScheduler) Class.forName(mergeScheduler).newInstance());
  } catch (IllegalAccessException iae) {
    err = new RuntimeException("unable to instantiate class '" + mergeScheduler + "' as merge scheduler");
    err.initCause(iae);
  } catch (InstantiationException ie) {
    err = new RuntimeException("unable to instantiate class '" + mergeScheduler + "' as merge scheduler");
    err.initCause(ie);
  } catch (ClassNotFoundException cnfe) {
    err = new RuntimeException("unable to load class '" + mergeScheduler + "' as merge scheduler");
    err.initCause(cnfe);
  }
  if (err != null)
    throw err;

  final String mergePolicy = config.get("merge.policy",
                                        "org.apache.lucene.index.LogByteSizeMergePolicy");
  err = null;
  try {
    writer.setMergePolicy((MergePolicy) Class.forName(mergePolicy).newInstance());
  } catch (IllegalAccessException iae) {
    err = new RuntimeException("unable to instantiate class '" + mergePolicy + "' as merge policy");
    err.initCause(iae);
  } catch (InstantiationException ie) {
    err = new RuntimeException("unable to instantiate class '" + mergePolicy + "' as merge policy");
    err.initCause(ie);
  } catch (ClassNotFoundException cnfe) {
    err = new RuntimeException("unable to load class '" + mergePolicy + "' as merge policy");
    err.initCause(cnfe);
  }
  if (err != null)
    throw err;
}
public int doLogic() throws IOException {
  PerfRunData runData = getRunData();
  Config config = runData.getConfig();
  // The final "true" tells IndexWriter to create a new index.
  IndexWriter writer = new IndexWriter(runData.getDirectory(),
                                       config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT),
                                       runData.getAnalyzer(),
                                       true);
  CreateIndexTask.setIndexWriterConfig(writer, config);
  runData.setIndexWriter(writer);
  return 1;
}
}
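
The scheduler and policy blocks above differ only in the config key, the
default class name, and the setter called; a possible consolidation (a
hypothetical refactor, not part of this commit) is a single reflection helper:

  // Hypothetical helper: load and instantiate a configured class by name,
  // wrapping any reflection failure the same way the code above does.
  private static Object instantiate(String className, String role) {
    try {
      return Class.forName(className).newInstance();
    } catch (Exception e) {
      RuntimeException err = new RuntimeException(
          "unable to load or instantiate class '" + className + "' as " + role);
      err.initCause(e);
      throw err;
    }
  }

with call sites such as
writer.setMergeScheduler((MergeScheduler) instantiate(mergeScheduler, "merge scheduler")).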

contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java

@@ -17,12 +17,10 @@ package org.apache.lucene.benchmark.byTask.tasks;
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.Directory;
import java.io.IOException;
@@ -47,30 +45,14 @@ public class OpenIndexTask extends PerfTask {
}
public int doLogic() throws IOException {
  PerfRunData runData = getRunData();
  Config config = runData.getConfig();
  // The final "false" tells IndexWriter to open an existing index rather
  // than create a new one.
  IndexWriter writer = new IndexWriter(runData.getDirectory(),
                                       config.get("autocommit", DEFAULT_AUTO_COMMIT),
                                       runData.getAnalyzer(),
                                       false);
  CreateIndexTask.setIndexWriterConfig(writer, config);
  runData.setIndexWriter(writer);
  return 1;
}
}
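
Since both tasks now funnel through the same helper, a writer re-opened
mid-run honors the same properties as a freshly created one. A hypothetical
fragment (repeat counts are illustrative) that builds an index, closes it,
then re-opens it for further adds:

  CreateIndex
  { "AddDocs" AddDoc > : 100
  CloseIndex
  OpenIndex
  { "MoreDocs" AddDoc > : 100
  CloseIndex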

contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OptimizeTask.java

@@ -30,11 +30,21 @@ public class OptimizeTask extends PerfTask {
super(runData);
}
int maxNumSegments = 1;
public int doLogic() throws Exception {
IndexWriter iw = getRunData().getIndexWriter();
iw.optimize(maxNumSegments);
//System.out.println("optimize called");
return 1;
}
public void setParams(String params) {
super.setParams(params);
maxNumSegments = Double.valueOf(params).intValue();
}
public boolean supportsParams() {
return true;
}
}
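
The parameter is parsed via Double.valueOf and truncated to an int, so
Optimize(3) asks optimize() to leave at most three segments; the
testOptimizeMaxNumSegments test below verifies this by counting .cfs files.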

contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java

@@ -186,10 +186,10 @@ public class TaskSequence extends PerfTask {
final PerfTask task = (PerfTask) tasksArray[i].clone();
t[indx++] = new Thread() {
public void run() {
try {
  int n = task.runAndMaybeStats(letChildReport);
  if (anyExhaustibleTasks)
    updateExhausted(task);
  synchronized (count) {
    count[0] += n;
  }
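
Guarding updateExhausted with anyExhaustibleTasks presumably spares the
per-document bookkeeping in this parallel add loop when no task in the
sequence can exhaust.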

contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java

@@ -24,7 +24,6 @@ import java.io.BufferedReader;
import java.util.List;
import java.util.Iterator;
import org.apache.lucene.benchmark.byTask.Benchmark;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
@@ -34,6 +33,8 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.LogDocMergePolicy;
import junit.framework.TestCase;
@@ -205,6 +206,8 @@ public class TestPerfTasksLogic extends TestCase {
"doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker",
"docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
"doc.maker.forever=false",
"doc.maker.forever=false",
"doc.reuse.fields=false",
"autocommit=false",
"ram.flush.mb=4",
"# ----- alg ",
@@ -393,4 +396,175 @@
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
/**
* Test that we can close IndexWriter with argument "false".
*/
public void testCloseIndexFalse() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"doc.maker="+Reuters20DocMaker.class.getName(),
"ram.flush.mb=-1",
"max.buffered=2",
"doc.add.log.step=3",
"doc.term.vector=false",
"doc.maker.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" CloseIndex(false)",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
public static class MyMergeScheduler extends SerialMergeScheduler {
boolean called;
public MyMergeScheduler() {
super();
called = true;
}
}
/**
* Test that we can set the merge scheduler.
*/
public void testMergeScheduler() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"doc.maker="+Reuters20DocMaker.class.getName(),
"doc.add.log.step=3",
"doc.term.vector=false",
"doc.maker.forever=false",
"directory=RAMDirectory",
"merge.scheduler=" + MyMergeScheduler.class.getName(),
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
assertTrue("did not use the specified MergeScheduler", ((MyMergeScheduler) benchmark.getRunData().getIndexWriter().getMergeScheduler()).called);
benchmark.getRunData().getIndexWriter().close();
// 3. test number of docs in the index
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
public static class MyMergePolicy extends LogDocMergePolicy {
boolean called;
public MyMergePolicy() {
super();
called = true;
}
}
/**
* Test that we can set the merge policy.
*/
public void testMergePolicy() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"doc.maker="+Reuters20DocMaker.class.getName(),
"doc.add.log.step=3",
"ram.flush.mb=-1",
"max.buffered=2",
"doc.term.vector=false",
"doc.maker.forever=false",
"directory=RAMDirectory",
"merge.policy=" + MyMergePolicy.class.getName(),
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
assertTrue("did not use the specified MergeScheduler", ((MyMergePolicy) benchmark.getRunData().getIndexWriter().getMergePolicy()).called);
benchmark.getRunData().getIndexWriter().close();
// 3. test number of docs in the index
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
/**
* Test that we can call optimize(maxNumSegments).
*/
public void testOptimizeMaxNumSegments() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"doc.maker="+Reuters20DocMaker.class.getName(),
"doc.add.log.step=3",
"ram.flush.mb=-1",
"max.buffered=3",
"doc.term.vector=false",
"doc.maker.forever=false",
"directory=RAMDirectory",
"merge.policy=org.apache.lucene.index.LogDocMergePolicy",
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" Optimize(3)",
" CloseIndex()",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
// Make sure we ended up with exactly 3 segments: with compound files
// enabled (the default) each segment is one .cfs file, so counting .cfs
// files counts segments. 20 docs flushed every 3 buffered docs gives 7
// segments, which Optimize(3) must merge down to 3.
final String[] files = benchmark.getRunData().getDirectory().list();
int cfsCount = 0;
for(int i=0;i<files.length;i++)
  if (files[i].endsWith(".cfs"))
    cfsCount++;
assertEquals(3, cfsCount);
}
}