LUCENE-994: change defaults in IndexWriter to maximize 'out of the box' indexing speed

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@579360 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2007-09-25 20:02:07 +00:00
parent 511406ecbe
commit a28eb4d978
13 changed files with 135 additions and 48 deletions

View File

@ -6,6 +6,25 @@ $Id$
Changes in runtime behavior
1. LUCENE-994: Defaults for IndexWriter have been changed to maximize
out-of-the-box indexing speed. First, IndexWriter now flushes by
RAM usage (16 MB by default) instead of a fixed doc count (call
IndexWriter.setMaxBufferedDocs to get backwards compatible
behavior). Second, ConcurrentMergeScheduler is used to run merges
using background threads (call IndexWriter.setMergeScheduler(new
SerialMergeScheduler()) to get backwards compatible behavior).
Third, merges are chosen based on size in bytes of each segment
rather than document count of each segment (call
IndexWriter.setMergePolicy(new LogDocMergePolicy()) to get
backwards compatible behavior).
NOTE: users of ParallelReader must change back all of these
defaults in order to ensure the docIDs "align" across all parallel
indices.
(Mike McCandless)
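For reference, a minimal sketch (not part of this commit) of restoring
all three 2.2 defaults per the entry above, assuming an existing
Directory dir and Analyzer analyzer:
IndexWriter writer = new IndexWriter(dir, analyzer, true);
writer.setMaxBufferedDocs(10);                        // flush by doc count again
writer.setMergeScheduler(new SerialMergeScheduler()); // merges run in the calling thread
writer.setMergePolicy(new LogDocMergePolicy());       // select merges by doc count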
API Changes
1. LUCENE-843: Added IndexWriter.setRAMBufferSizeMB(...) to have

View File

@ -49,18 +49,20 @@ public class CreateIndexTask extends PerfTask {
int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR);
int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH);
double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
double flushAtRAMUsage = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT);
IndexWriter iw = new IndexWriter(dir, autoCommit, analyzer, true);
iw.setUseCompoundFile(cmpnd);
iw.setMergeFactor(mrgf);
iw.setMaxBufferedDocs(mxbf);
iw.setMaxFieldLength(mxfl);
if (flushAtRAMUsage > 0)
iw.setRAMBufferSizeMB(flushAtRAMUsage);
else if (mxbf != 0)
iw.setMaxBufferedDocs(mxbf);
else
throw new RuntimeException("either max.buffered or ram.flush.mb must be non-zero");
getRunData().setIndexWriter(iw);
return 1;
}
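Per the logic above, ram.flush.mb takes precedence: when it is non-zero the writer flushes by RAM usage and max.buffered is ignored; doc-count flushing applies only when ram.flush.mb is zero, and both being zero raises a RuntimeException. A hypothetical benchmark properties excerpt (values illustrative):
ram.flush.mb=32
max.buffered=0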

View File

@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.Directory;
import java.io.IOException;
@ -35,10 +36,10 @@ import java.io.IOException;
*/
public class OpenIndexTask extends PerfTask {
public static final int DEFAULT_MAX_BUFFERED = 10;
public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
public static final int DEFAULT_MERGE_PFACTOR = 10;
public static final int DEFAULT_RAM_FLUSH_MB = 0;
public static final int DEFAULT_MAX_BUFFERED = IndexWriter.DEFAULT_MAX_BUFFERED_DOCS;
public static final int DEFAULT_MAX_FIELD_LENGTH = IndexWriter.DEFAULT_MAX_FIELD_LENGTH;
public static final int DEFAULT_MERGE_PFACTOR = LogMergePolicy.DEFAULT_MERGE_FACTOR;
public static final double DEFAULT_RAM_FLUSH_MB = IndexWriter.DEFAULT_RAM_BUFFER_SIZE_MB;
public static final boolean DEFAULT_AUTO_COMMIT = true;
public OpenIndexTask(PerfRunData runData) {
@ -55,12 +56,17 @@ public class OpenIndexTask extends PerfTask {
int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR);
int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED);
int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH);
double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT);
double flushAtRAMUsage = config.get("ram.flush.mb", DEFAULT_RAM_FLUSH_MB);
boolean autoCommit = config.get("autocommit", DEFAULT_AUTO_COMMIT);
IndexWriter writer = new IndexWriter(dir, autoCommit, analyzer, false);
// must update params for newly opened writer
writer.setMaxBufferedDocs(mxbf);
if (flushAtRAMUsage > 0)
writer.setRAMBufferSizeMB(flushAtRAMUsage);
else if (mxbf != 0)
writer.setMaxBufferedDocs(mxbf);
else
throw new RuntimeException("either max.buffered or ram.flush.mb must be non-zero");
writer.setMaxFieldLength(mxfl);
writer.setMergeFactor(mrgf);
writer.setUseCompoundFile(cmpnd); // this one redundant?

View File

@ -175,6 +175,36 @@ public class Config {
return vals[roundNumber % vals.length];
}
/**
* Return a double property.
* If the property contains ":", e.g. "10:100:5", it is interpreted
* as an array of doubles. It is parsed once, on the first call
* to get(), and the value for the current round is returned.
* @param name name of property
* @param dflt default value
* @return a double property.
*/
public double get (String name, double dflt) {
// use value by round if already parsed
double vals[] = (double[]) valByRound.get(name);
if (vals != null) {
return vals[roundNumber % vals.length];
}
// done if not by round
String sval = props.getProperty(name,""+dflt);
if (sval.indexOf(":")<0) {
return Double.parseDouble(sval);
}
// first time this prop is extracted by round
int k = sval.indexOf(":");
String colName = sval.substring(0,k);
sval = sval.substring(k+1);
colForValByRound.put(name,colName);
vals = propToDoubleArray(sval);
valByRound.put(name,vals);
return vals[roundNumber % vals.length];
}
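A short usage sketch for the by-round syntax handled here (the property name and values are illustrative, and Config's Properties-based constructor is assumed):
Properties props = new Properties();
props.setProperty("ram.flush.mb", "16:32:64"); // one value per benchmark round
Config config = new Config(props);
double mb = config.get("ram.flush.mb", 0.0);   // round 0 -> 16.0; later rounds see 32.0, 64.0, then wrap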
/**
* Return a boolean property.
* If the property contains ":", e.g. "true:true:false", it is interpreted
@ -241,7 +271,7 @@ public class Config {
return roundNumber;
}
// extract properties to array, e.g. for "10.100.5" return int[]{10,100,5}.
// extract properties to array, e.g. for "10:100:5" return int[]{10,100,5}.
private int[] propToIntArray (String s) {
if (s.indexOf(":")<0) {
return new int [] { Integer.parseInt(s) };
@ -260,7 +290,26 @@ public class Config {
return res;
}
// extract properties to array, e.g. for "true.true.false" return booleab[]{true,false,false}.
// extract properties to array, e.g. for "10.7:100.4:-2.3" return int[]{10.7,100.4,-2.3}.
private double[] propToDoubleArray (String s) {
if (s.indexOf(":")<0) {
return new double [] { Double.parseDouble(s) };
}
ArrayList a = new ArrayList();
StringTokenizer st = new StringTokenizer(s,":");
while (st.hasMoreTokens()) {
String t = st.nextToken();
a.add(new Double(t));
}
double res[] = new double[a.size()];
for (int i=0; i<a.size(); i++) {
res[i] = ((Double) a.get(i)).doubleValue();
}
return res;
}
// extract properties to array, e.g. for "true:true:false" return boolean[]{true,false,false}.
private boolean[] propToBooleanArray (String s) {
if (s.indexOf(":")<0) {
return new boolean [] { Boolean.valueOf(s).booleanValue() };

View File

@ -24,6 +24,7 @@ import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.gdata.search.config.IndexSchema;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.store.Directory;
/**
@ -42,7 +43,7 @@ public class GDataIndexWriter extends IndexWriter {
setUseCompoundFile(config.isUseCompoundFile());
if (config.getMaxBufferedDocs() != IndexSchema.NOT_SET_VALUE)
setMaxBufferedDocs(config.getMaxBufferedDocs());
if (config.getMaxMergeDocs() != IndexSchema.NOT_SET_VALUE)
if (config.getMaxMergeDocs() != IndexSchema.NOT_SET_VALUE && getMergePolicy() instanceof LogDocMergePolicy)
setMaxMergeDocs(config.getMaxMergeDocs());
if (config.getMergeFactor() != IndexSchema.NOT_SET_VALUE)
setMergeFactor(config.getMergeFactor());

View File

@ -26,6 +26,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.gdata.search.config.IndexSchema;
import org.apache.lucene.gdata.search.config.IndexSchemaField;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.LogDocMergePolicy;
/**
*
@ -72,7 +73,8 @@ public class TestGdataIndexWriter extends TestCase {
// assertEquals(VALUE_GT_DEFAULT_LONG,writer.getCommitLockTimeout());
assertEquals(VALUE_GT_DEFAULT_LONG,writer.getWriteLockTimeout());
assertEquals(VALUE_GT_DEFAULT_INT,writer.getMaxBufferedDocs());
assertEquals(VALUE_GT_DEFAULT_INT,writer.getMaxMergeDocs());
if (writer.getMergePolicy() instanceof LogDocMergePolicy)
assertEquals(VALUE_GT_DEFAULT_INT,writer.getMaxMergeDocs());
assertEquals(VALUE_GT_DEFAULT_INT,writer.getMaxFieldLength());
assertEquals(VALUE_GT_DEFAULT_INT,writer.getMergeFactor());
assertTrue(writer.getUseCompoundFile());

View File

@ -71,16 +71,16 @@ import java.util.Map.Entry;
or enough added documents since the last flush, whichever
is sooner. For the added documents, flushing is triggered
either by RAM usage of the documents (see {@link
#setRAMBufferSizeMB}) or the number of added documents
(this is the default; see {@link #setMaxBufferedDocs}).
For best indexing speed you should flush by RAM usage with
a large RAM buffer. You can also force a flush by calling
#setRAMBufferSizeMB}) or the number of added documents.
The default is to flush when RAM usage hits 16 MB. For
best indexing speed you should flush by RAM usage with a
large RAM buffer. You can also force a flush by calling
{@link #flush}. When a flush occurs, both pending deletes
and added documents are flushed to the index. A flush may
also trigger one or more segment merges which by default
run (blocking) with the current thread (see <a
href="#mergePolicy">below</a> for changing the {@link
MergeScheduler}).</p>
run with a background thread so as not to block the
addDocument calls (see <a href="#mergePolicy">below</a>
for changing the {@link MergeScheduler}).</p>
<a name="autoCommit"></a>
<p>The optional <code>autoCommit</code> argument to the
@ -153,10 +153,10 @@ import java.util.Map.Entry;
select which merges to do, if any, and return a {@link
MergePolicy.MergeSpecification} describing the merges. It
also selects merges to do for optimize(). (The default is
{@link LogDocMergePolicy}. Then, the {@link
{@link LogByteSizeMergePolicy}. Then, the {@link
MergeScheduler} is invoked with the requested merges and
it decides when and how to run the merges. The default is
{@link SerialMergeScheduler}. </p>
{@link ConcurrentMergeScheduler}. </p>
*/
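A hedged sketch of the flush and merge behavior described above (assumes an existing Directory dir and Analyzer analyzer; 48 MB is an illustrative threshold):
IndexWriter writer = new IndexWriter(dir, true, analyzer, true);
writer.setRAMBufferSizeMB(48.0); // flush when buffered docs use ~48 MB rather than the 16 MB default
// ... addDocument calls; merges run on ConcurrentMergeScheduler's background threads ...
writer.flush();                  // force pending added docs and deletes into the index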
/*
@ -205,22 +205,16 @@ public class IndexWriter {
public final static int DEFAULT_MERGE_FACTOR = LogMergePolicy.DEFAULT_MERGE_FACTOR;
/**
* Default value is 10. Change using {@link #setMaxBufferedDocs(int)}.
* Default value is 0 (because IndexWriter flushes by RAM
* usage by default). Change using {@link #setMaxBufferedDocs(int)}.
*/
public final static int DEFAULT_MAX_BUFFERED_DOCS = 10;
/* new merge policy
public final static int DEFAULT_MAX_BUFFERED_DOCS = 0;
*/
/**
* Default value is 0 MB (which means flush only by doc
* count). Change using {@link #setRAMBufferSizeMB}.
* Default value is 16 MB (which means flush when buffered
* docs consume 16 MB RAM). Change using {@link #setRAMBufferSizeMB}.
*/
public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 0.0;
/* new merge policy
public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 16.0;
*/
/**
* Default value is 1000. Change using {@link #setMaxBufferedDeleteTerms(int)}.
@ -281,8 +275,8 @@ public class IndexWriter {
// merges
private HashSet mergingSegments = new HashSet();
private MergePolicy mergePolicy = new LogDocMergePolicy();
private MergeScheduler mergeScheduler = new SerialMergeScheduler();
private MergePolicy mergePolicy = new LogByteSizeMergePolicy();
private MergeScheduler mergeScheduler = new ConcurrentMergeScheduler();
private LinkedList pendingMerges = new LinkedList();
private Set runningMerges = new HashSet();
private List mergeExceptions = new ArrayList();
@ -1136,6 +1130,9 @@ public class IndexWriter {
rollbackSegmentInfos = null;
}
if (infoStream != null)
message("at close: " + segString());
if (writeLock != null) {
writeLock.release(); // release write lock
writeLock = null;
@ -2252,7 +2249,7 @@ public class IndexWriter {
// apply to more than just the last flushed segment
boolean flushDeletes = docWriter.hasDeletes();
if (infoStream != null)
if (infoStream != null) {
message(" flush: segment=" + docWriter.getSegment() +
" docStoreSegment=" + docWriter.getDocStoreSegment() +
" docStoreOffset=" + docWriter.getDocStoreOffset() +
@ -2261,6 +2258,8 @@ public class IndexWriter {
" flushDocStores=" + flushDocStores +
" numDocs=" + numDocs +
" numBufDelTerms=" + docWriter.getNumBufferedDeleteTerms());
message(" index before flush " + segString());
}
int docStoreOffset = docWriter.getDocStoreOffset();
boolean docStoreIsCompoundFile = false;

View File

@ -22,7 +22,7 @@ import java.io.IOException;
/** Expert: {@link IndexWriter} uses an instance
* implementing this interface to execute the merges
* selected by a {@link MergePolicy}. The default
* MergeScheduler is {@link SerialMergeScheduler}. */
* MergeScheduler is {@link ConcurrentMergeScheduler}. */
public interface MergeScheduler {
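For a single-threaded run (e.g. deterministic tests or debugging), the previous default can be swapped back in, per the javadoc above:
writer.setMergeScheduler(new SerialMergeScheduler()); // merges run inline, blocking the thread that requested them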

View File

@ -326,7 +326,9 @@ public class TestAddIndexesNoOptimize extends TestCase {
private IndexWriter newWriter(Directory dir, boolean create)
throws IOException {
return new IndexWriter(dir, new WhitespaceAnalyzer(), create);
final IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), create);
writer.setMergePolicy(new LogDocMergePolicy());
return writer;
}
private void addDocs(IndexWriter writer, int numDocs) throws IOException {

View File

@ -127,6 +127,7 @@ public class TestAtomicUpdate extends TestCase {
d.add(new Field("contents", English.intToEnglish(i), Field.Store.NO, Field.Index.TOKENIZED));
writer.addDocument(d);
}
writer.flush();
IndexerThread indexerThread = new IndexerThread(writer, threads);
threads[0] = indexerThread;

View File

@ -113,10 +113,13 @@ public class TestConcurrentMergeScheduler extends TestCase {
ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
writer.setMergeScheduler(cms);
LogDocMergePolicy mp = new LogDocMergePolicy();
writer.setMergePolicy(mp);
// Force degenerate merging so we can get a mix of
// merging of segments with and without deletes at the
// start:
((LogDocMergePolicy) writer.getMergePolicy()).setMinMergeDocs(1000);
mp.setMinMergeDocs(1000);
Document doc = new Document();
Field idField = new Field("id", "", Field.Store.YES, Field.Index.UN_TOKENIZED);

View File

@ -75,10 +75,7 @@ public class TestIndexModifier extends TestCase {
// Lucene defaults:
assertNull(i.getInfoStream());
assertTrue(i.getUseCompoundFile());
/* new merge policy
assertEquals(0, i.getMaxBufferedDocs());
*/
assertEquals(10, i.getMaxBufferedDocs());
assertEquals(10000, i.getMaxFieldLength());
assertEquals(10, i.getMergeFactor());
// test setting properties:

View File

@ -37,6 +37,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(10);
writer.setMergePolicy(new LogDocMergePolicy());
for (int i = 0; i < 100; i++) {
addDoc(writer);
@ -53,6 +54,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(10);
writer.setMergePolicy(new LogDocMergePolicy());
boolean noOverMerge = false;
for (int i = 0; i < 100; i++) {
@ -74,19 +76,18 @@ public class TestIndexWriterMergePolicy extends TestCase {
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(10);
MergePolicy mp = writer.getMergePolicy();
if (mp instanceof LogDocMergePolicy)
((LogDocMergePolicy) mp).setMinMergeDocs(100);
LogDocMergePolicy mp = new LogDocMergePolicy();
mp.setMinMergeDocs(100);
writer.setMergePolicy(mp);
for (int i = 0; i < 100; i++) {
addDoc(writer);
}
writer.close();
writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false);
mp = writer.getMergePolicy();
writer.setMaxBufferedDocs(10);
if (mp instanceof LogDocMergePolicy)
((LogDocMergePolicy) mp).setMinMergeDocs(100);
writer.setMergePolicy(mp);
mp.setMinMergeDocs(100);
writer.setMergeFactor(10);
checkInvariants(writer);
}
@ -101,6 +102,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(100);
writer.setMergePolicy(new LogDocMergePolicy());
for (int i = 0; i < 250; i++) {
addDoc(writer);
@ -126,6 +128,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(101);
writer.setMergeFactor(101);
writer.setMergePolicy(new LogDocMergePolicy());
// leftmost* segment has 1 doc
// rightmost* segment has 100 docs
@ -139,6 +142,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false);
writer.setMaxBufferedDocs(101);
writer.setMergeFactor(101);
writer.setMergePolicy(new LogDocMergePolicy());
}
writer.setMaxBufferedDocs(10);
@ -164,6 +168,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
Directory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMergePolicy(new LogDocMergePolicy());
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(100);
@ -178,6 +183,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
reader.close();
writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false);
writer.setMergePolicy(new LogDocMergePolicy());
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(5);