diff --git a/contrib/benchmark/CHANGES.txt b/contrib/benchmark/CHANGES.txt index 081cb3f38e4..9f8c29f5977 100644 --- a/contrib/benchmark/CHANGES.txt +++ b/contrib/benchmark/CHANGES.txt @@ -4,6 +4,14 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety $Id:$ +01/20/08 + LUCENE-1139: various fixes + - add merge.scheduler, merge.policy config properties + - refactor Open/CreateIndexTask to share setting config on IndexWriter + - added doc.reuse.fields=true|false for LineDocMaker + - OptimizeTask now takes int param to call optimize(int maxNumSegments) + - CloseIndexTask now takes bool param to call close(false) (abort running merges) + 01/03/08 LUCENE-1116: quality package improvements: - add MRR computation; diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java index 246a228e095..4acc98ccb7c 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java @@ -37,6 +37,7 @@ import java.io.InputStreamReader; * * Config properties: * docs.file=<path to the file%gt; + * doc.reuse.fields=true|false (default true) */ public class LineDocMaker extends BasicDocMaker { @@ -46,6 +47,9 @@ public class LineDocMaker extends BasicDocMaker { private String fileName; private static int READER_BUFFER_BYTES = 64*1024; + private final DocState localDocState = new DocState(); + + private boolean doReuseFields = true; class DocState { Document doc; @@ -84,20 +88,47 @@ public class LineDocMaker extends BasicDocMaker { public Document setFields(String line) { // title date body + final String title, date, body; + int spot = line.indexOf(SEP); if (spot != -1) { - titleField.setValue(line.substring(0, spot)); + title = line.substring(0, spot); int spot2 = line.indexOf(SEP, 1+spot); if (spot2 != -1) { - dateField.setValue(line.substring(1+spot, spot2)); - bodyField.setValue(line.substring(1+spot2, line.length())); - } else { - dateField.setValue(""); - bodyField.setValue(""); - } + date = line.substring(1+spot, spot2); + body = line.substring(1+spot2, line.length()); + } else + date = body = ""; } else - titleField.setValue(""); - return doc; + title = date = body = ""; + + if (doReuseFields) { + titleField.setValue(title); + dateField.setValue(date); + bodyField.setValue(body); + return doc; + } else { + Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD, + title, + storeVal, + Field.Index.TOKENIZED, + termVecVal); + Field localBodyField = new Field(BasicDocMaker.BODY_FIELD, + body, + storeVal, + Field.Index.TOKENIZED, + termVecVal); + Field localDateField = new Field(BasicDocMaker.BODY_FIELD, + date, + storeVal, + Field.Index.TOKENIZED, + termVecVal); + Document localDoc = new Document(); + localDoc.add(localBodyField); + localDoc.add(localTitleField); + localDoc.add(localDateField); + return localDoc; + } } } @@ -131,7 +162,10 @@ public class LineDocMaker extends BasicDocMaker { } } - return getDocState().setFields(line); + if (doReuseFields) + return getDocState().setFields(line); + else + return localDocState.setFields(line); } public Document makeDocument(int size) throws Exception { @@ -146,6 +180,11 @@ public class LineDocMaker extends BasicDocMaker { openFile(); } + public void setConfig(Config config) { + super.setConfig(config); + doReuseFields = config.get("doc.reuse.fields", true); + } + synchronized void openFile() { try { if (fileIn != null) diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CloseIndexTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CloseIndexTask.java index ba378bdb811..0a0a2f6d401 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CloseIndexTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CloseIndexTask.java @@ -25,6 +25,7 @@ import org.apache.lucene.index.IndexWriter; /** * Close index writer. *
Other side effects: index writer object in perfRunData is nullified. + *
Takes optional param "doWait": if false, then close(false) is called. */ public class CloseIndexTask extends PerfTask { @@ -32,13 +33,23 @@ public class CloseIndexTask extends PerfTask { super(runData); } + boolean doWait = true; + public int doLogic() throws IOException { IndexWriter iw = getRunData().getIndexWriter(); if (iw!=null) { - iw.close(); + iw.close(doWait); } getRunData().setIndexWriter(null); return 1; } + public void setParams(String params) { + super.setParams(params); + doWait = Boolean.valueOf(params).booleanValue(); + } + + public boolean supportsParams() { + return true; + } } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java index 2386049678a..9997b399577 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java @@ -17,11 +17,11 @@ package org.apache.lucene.benchmark.byTask.tasks; * limitations under the License. */ -import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.store.Directory; +import org.apache.lucene.index.MergeScheduler; +import org.apache.lucene.index.MergePolicy; import java.io.IOException; @@ -39,28 +39,67 @@ public class CreateIndexTask extends PerfTask { super(runData); } - public int doLogic() throws IOException { - Directory dir = getRunData().getDirectory(); - Analyzer analyzer = getRunData().getAnalyzer(); - - Config config = getRunData().getConfig(); - - boolean cmpnd = config.get("compound",true); - int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR); - int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED); - int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH); - double flushAtRAMUsage = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB); - boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT); + public static void setIndexWriterConfig(IndexWriter writer, Config config) throws IOException { + writer.setUseCompoundFile(config.get("compound",true)); + writer.setMergeFactor(config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR)); + writer.setMaxFieldLength(config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH)); - IndexWriter iw = new IndexWriter(dir, autoCommit, analyzer, true); - - iw.setUseCompoundFile(cmpnd); - iw.setMergeFactor(mrgf); - iw.setMaxFieldLength(mxfl); - iw.setRAMBufferSizeMB(flushAtRAMUsage); - iw.setMaxBufferedDocs(mxbf); - getRunData().setIndexWriter(iw); - return 1; + final double ramBuffer = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB); + final int maxBuffered = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED); + if (maxBuffered == IndexWriter.DISABLE_AUTO_FLUSH) { + writer.setRAMBufferSizeMB(ramBuffer); + writer.setMaxBufferedDocs(maxBuffered); + } else { + writer.setMaxBufferedDocs(maxBuffered); + writer.setRAMBufferSizeMB(ramBuffer); + } + + final String mergeScheduler = config.get("merge.scheduler", + "org.apache.lucene.index.ConcurrentMergeScheduler"); + RuntimeException err = null; + try { + writer.setMergeScheduler((MergeScheduler) Class.forName(mergeScheduler).newInstance()); + } catch (IllegalAccessException iae) { + err = new RuntimeException("unable to instantiate class '" + mergeScheduler + "' as merge scheduler"); + err.initCause(iae); + } catch (InstantiationException ie) { + err = new RuntimeException("unable to instantiate class '" + mergeScheduler + "' as merge scheduler"); + err.initCause(ie); + } catch (ClassNotFoundException cnfe) { + err = new RuntimeException("unable to load class '" + mergeScheduler + "' as merge scheduler"); + err.initCause(cnfe); + } + if (err != null) + throw err; + + final String mergePolicy = config.get("merge.policy", + "org.apache.lucene.index.LogByteSizeMergePolicy"); + err = null; + try { + writer.setMergePolicy((MergePolicy) Class.forName(mergePolicy).newInstance()); + } catch (IllegalAccessException iae) { + err = new RuntimeException("unable to instantiate class '" + mergePolicy + "' as merge policy"); + err.initCause(iae); + } catch (InstantiationException ie) { + err = new RuntimeException("unable to instantiate class '" + mergePolicy + "' as merge policy"); + err.initCause(ie); + } catch (ClassNotFoundException cnfe) { + err = new RuntimeException("unable to load class '" + mergePolicy + "' as merge policy"); + err.initCause(cnfe); + } + if (err != null) + throw err; } + public int doLogic() throws IOException { + PerfRunData runData = getRunData(); + Config config = runData.getConfig(); + IndexWriter writer = new IndexWriter(runData.getDirectory(), + runData.getConfig().get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT), + runData.getAnalyzer(), + true); + CreateIndexTask.setIndexWriterConfig(writer, config); + runData.setIndexWriter(writer); + return 1; + } } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java index e650fa3f93a..84fd99c40e2 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java @@ -17,12 +17,10 @@ package org.apache.lucene.benchmark.byTask.tasks; * limitations under the License. */ -import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.LogMergePolicy; -import org.apache.lucene.store.Directory; import java.io.IOException; @@ -47,30 +45,14 @@ public class OpenIndexTask extends PerfTask { } public int doLogic() throws IOException { - Directory dir = getRunData().getDirectory(); - Analyzer analyzer = getRunData().getAnalyzer(); - - Config config = getRunData().getConfig(); - - boolean cmpnd = config.get("compound",true); - int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR); - int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED); - int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH); - double flushAtRAMUsage = config.get("ram.flush.mb", DEFAULT_RAM_FLUSH_MB); - boolean autoCommit = config.get("autocommit", DEFAULT_AUTO_COMMIT); - IndexWriter writer = new IndexWriter(dir, autoCommit, analyzer, false); - - // must update params for newly opened writer - writer.setRAMBufferSizeMB(flushAtRAMUsage); - writer.setMaxBufferedDocs(mxbf); - writer.setMaxFieldLength(mxfl); - writer.setMergeFactor(mrgf); - writer.setUseCompoundFile(cmpnd); // this one redundant? - if (flushAtRAMUsage > 0) - writer.setRAMBufferSizeMB(flushAtRAMUsage); - - getRunData().setIndexWriter(writer); + PerfRunData runData = getRunData(); + Config config = runData.getConfig(); + IndexWriter writer = new IndexWriter(runData.getDirectory(), + config.get("autocommit", DEFAULT_AUTO_COMMIT), + runData.getAnalyzer(), + false); + CreateIndexTask.setIndexWriterConfig(writer, config); + runData.setIndexWriter(writer); return 1; } - } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OptimizeTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OptimizeTask.java index c6a459d23bb..94ca7d5c48c 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OptimizeTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OptimizeTask.java @@ -30,11 +30,21 @@ public class OptimizeTask extends PerfTask { super(runData); } + int maxNumSegments = 1; + public int doLogic() throws Exception { IndexWriter iw = getRunData().getIndexWriter(); - iw.optimize(); + iw.optimize(maxNumSegments); //System.out.println("optimize called"); return 1; } + public void setParams(String params) { + super.setParams(params); + maxNumSegments = (int) Double.valueOf(params).intValue(); + } + + public boolean supportsParams() { + return true; + } } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java index 21e22a34533..829f4768991 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java @@ -186,10 +186,10 @@ public class TaskSequence extends PerfTask { final PerfTask task = (PerfTask) tasksArray[i].clone(); t[indx++] = new Thread() { public void run() { - int n; try { - n = task.runAndMaybeStats(letChildReport); - updateExhausted(task); + int n = task.runAndMaybeStats(letChildReport); + if (anyExhaustibleTasks) + updateExhausted(task); synchronized (count) { count[0] += n; } diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java index 7d98e316483..bd8d42d1333 100755 --- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java +++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java @@ -24,7 +24,6 @@ import java.io.BufferedReader; import java.util.List; import java.util.Iterator; -import org.apache.lucene.benchmark.byTask.Benchmark; import org.apache.lucene.benchmark.byTask.feeds.DocData; import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker; @@ -34,6 +33,8 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.SerialMergeScheduler; +import org.apache.lucene.index.LogDocMergePolicy; import junit.framework.TestCase; @@ -205,6 +206,8 @@ public class TestPerfTasksLogic extends TestCase { "doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker", "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'), "doc.maker.forever=false", + "doc.maker.forever=false", + "doc.reuse.fields=false", "autocommit=false", "ram.flush.mb=4", "# ----- alg ", @@ -393,4 +396,175 @@ public class TestPerfTasksLogic extends TestCase { assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); ir.close(); } + + /** + * Test that we can close IndexWriter with argument "false". + */ + public void testCloseIndexFalse() throws Exception { + // 1. alg definition (required in every "logic" test) + String algLines[] = { + "# ----- properties ", + "doc.maker="+Reuters20DocMaker.class.getName(), + "ram.flush.mb=-1", + "max.buffered=2", + "doc.add.log.step=3", + "doc.term.vector=false", + "doc.maker.forever=false", + "directory=RAMDirectory", + "doc.stored=false", + "doc.tokenized=false", + "debug.level=1", + "# ----- alg ", + "{ \"Rounds\"", + " ResetSystemErase", + " CreateIndex", + " { \"AddDocs\" AddDoc > : * ", + " CloseIndex(false)", + "} : 2", + }; + + // 2. execute the algorithm (required in every "logic" test) + Benchmark benchmark = execBenchmark(algLines); + + // 3. test number of docs in the index + IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); + int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs. + assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); + ir.close(); + } + + public static class MyMergeScheduler extends SerialMergeScheduler { + boolean called; + public MyMergeScheduler() { + super(); + called = true; + } + } + + /** + * Test that we can set merge scheduler". + */ + public void testMergeScheduler() throws Exception { + // 1. alg definition (required in every "logic" test) + String algLines[] = { + "# ----- properties ", + "doc.maker="+Reuters20DocMaker.class.getName(), + "doc.add.log.step=3", + "doc.term.vector=false", + "doc.maker.forever=false", + "directory=RAMDirectory", + "merge.scheduler=" + MyMergeScheduler.class.getName(), + "doc.stored=false", + "doc.tokenized=false", + "debug.level=1", + "# ----- alg ", + "{ \"Rounds\"", + " ResetSystemErase", + " CreateIndex", + " { \"AddDocs\" AddDoc > : * ", + "} : 2", + }; + // 2. execute the algorithm (required in every "logic" test) + Benchmark benchmark = execBenchmark(algLines); + + assertTrue("did not use the specified MergeScheduler", ((MyMergeScheduler) benchmark.getRunData().getIndexWriter().getMergeScheduler()).called); + benchmark.getRunData().getIndexWriter().close(); + + // 3. test number of docs in the index + IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); + int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs. + assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); + ir.close(); + } + + public static class MyMergePolicy extends LogDocMergePolicy { + boolean called; + public MyMergePolicy() { + super(); + called = true; + } + } + /** + * Test that we can set merge policy". + */ + public void testMergePolicy() throws Exception { + // 1. alg definition (required in every "logic" test) + String algLines[] = { + "# ----- properties ", + "doc.maker="+Reuters20DocMaker.class.getName(), + "doc.add.log.step=3", + "ram.flush.mb=-1", + "max.buffered=2", + "doc.term.vector=false", + "doc.maker.forever=false", + "directory=RAMDirectory", + "merge.policy=" + MyMergePolicy.class.getName(), + "doc.stored=false", + "doc.tokenized=false", + "debug.level=1", + "# ----- alg ", + "{ \"Rounds\"", + " ResetSystemErase", + " CreateIndex", + " { \"AddDocs\" AddDoc > : * ", + "} : 2", + }; + + // 2. execute the algorithm (required in every "logic" test) + Benchmark benchmark = execBenchmark(algLines); + assertTrue("did not use the specified MergeScheduler", ((MyMergePolicy) benchmark.getRunData().getIndexWriter().getMergePolicy()).called); + benchmark.getRunData().getIndexWriter().close(); + + // 3. test number of docs in the index + IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); + int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs. + assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); + ir.close(); + } + + /** + * Test that we can call optimize(maxNumSegments). + */ + public void testOptimizeMaxNumSegments() throws Exception { + // 1. alg definition (required in every "logic" test) + String algLines[] = { + "# ----- properties ", + "doc.maker="+Reuters20DocMaker.class.getName(), + "doc.add.log.step=3", + "ram.flush.mb=-1", + "max.buffered=3", + "doc.term.vector=false", + "doc.maker.forever=false", + "directory=RAMDirectory", + "merge.policy=org.apache.lucene.index.LogDocMergePolicy", + "doc.stored=false", + "doc.tokenized=false", + "debug.level=1", + "# ----- alg ", + "{ \"Rounds\"", + " ResetSystemErase", + " CreateIndex", + " { \"AddDocs\" AddDoc > : * ", + " Optimize(3)", + " CloseIndex()", + "} : 2", + }; + + // 2. execute the algorithm (required in every "logic" test) + Benchmark benchmark = execBenchmark(algLines); + + // 3. test number of docs in the index + IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); + int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs. + assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); + ir.close(); + + // Make sure we have 3 segments: + final String[] files = benchmark.getRunData().getDirectory().list(); + int cfsCount = 0; + for(int i=0;i