diff --git a/CHANGES.txt b/CHANGES.txt
index 64cb170b345..82e5ed5c559 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -6,6 +6,25 @@ $Id$
Changes in runtime behavior
+ 1. LUCENE-994: Defaults for IndexWriter have been changed to maximize
+ out-of-the-box indexing speed. First, IndexWriter now flushes by
+ RAM usage (16 MB by default) instead of a fixed doc count (call
+ IndexWriter.setMaxBufferedDocs to get backwards compatible
+ behavior). Second, ConcurrentMergeScheduler is used to run merges
+ using background threads (call IndexWriter.setMergeScheduler(new
+ SerialMergeScheduler()) to get backwards compatible behavior).
+ Third, merges are chosen based on size in bytes of each segment
+ rather than document count of each segment (call
+ IndexWriter.setMergePolicy(new LogDocMergePolicy()) to get
+ backwards compatible behavior).
+
+ NOTE: users of ParallelReader must change back all of these
+ defaults in order to ensure the docIDs "align" across all parallel
+ indices.
+
+ (Mike McCandless)
+
+
API Changes
1. LUCENE-843: Added IndexWriter.setRAMBufferSizeMB(...) to have
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
index 4c3cd48fccd..eb72ff4984b 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
@@ -49,18 +49,20 @@ public class CreateIndexTask extends PerfTask {
int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR);
int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH);
- double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
+ double flushAtRAMUsage = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT);
IndexWriter iw = new IndexWriter(dir, autoCommit, analyzer, true);
iw.setUseCompoundFile(cmpnd);
iw.setMergeFactor(mrgf);
- iw.setMaxBufferedDocs(mxbf);
iw.setMaxFieldLength(mxfl);
if (flushAtRAMUsage > 0)
iw.setRAMBufferSizeMB(flushAtRAMUsage);
-
+ else if (mxbf != 0)
+ iw.setMaxBufferedDocs(mxbf);
+ else
+ throw new RuntimeException("either max.buffered or ram.flush.mb must be non-zero");
getRunData().setIndexWriter(iw);
return 1;
}
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
index 7e1b1b2ea1b..4d4910a7d04 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.Directory;
import java.io.IOException;
@@ -35,10 +36,10 @@ import java.io.IOException;
*/
public class OpenIndexTask extends PerfTask {
- public static final int DEFAULT_MAX_BUFFERED = 10;
- public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
- public static final int DEFAULT_MERGE_PFACTOR = 10;
- public static final int DEFAULT_RAM_FLUSH_MB = 0;
+ public static final int DEFAULT_MAX_BUFFERED = IndexWriter.DEFAULT_MAX_BUFFERED_DOCS;
+ public static final int DEFAULT_MAX_FIELD_LENGTH = IndexWriter.DEFAULT_MAX_FIELD_LENGTH;
+ public static final int DEFAULT_MERGE_PFACTOR = LogMergePolicy.DEFAULT_MERGE_FACTOR;
+ public static final double DEFAULT_RAM_FLUSH_MB = (int) IndexWriter.DEFAULT_RAM_BUFFER_SIZE_MB;
public static final boolean DEFAULT_AUTO_COMMIT = true;
public OpenIndexTask(PerfRunData runData) {
@@ -55,12 +56,17 @@ public class OpenIndexTask extends PerfTask {
int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR);
int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED);
int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH);
- double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
- boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT);
+ double flushAtRAMUsage = config.get("ram.flush.mb", DEFAULT_RAM_FLUSH_MB);
+ boolean autoCommit = config.get("autocommit", DEFAULT_AUTO_COMMIT);
IndexWriter writer = new IndexWriter(dir, autoCommit, analyzer, false);
// must update params for newly opened writer
- writer.setMaxBufferedDocs(mxbf);
+ if (flushAtRAMUsage > 0)
+ writer.setRAMBufferSizeMB(flushAtRAMUsage);
+ else if (mxbf != 0)
+ writer.setMaxBufferedDocs(mxbf);
+ else
+ throw new RuntimeException("either max.buffered or ram.flush.mb must be non-zero");
writer.setMaxFieldLength(mxfl);
writer.setMergeFactor(mrgf);
writer.setUseCompoundFile(cmpnd); // this one redundant?
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
index 798786b53b9..22ab53b91a4 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
@@ -175,6 +175,36 @@ public class Config {
return vals[roundNumber % vals.length];
}
+ /**
+ * Return a double property.
+ * If the property contain ":", e.g. "10:100:5", it is interpreted
+ * as array of doubles. It is extracted once, on first call
+ * to get() it, and a by-round-value is returned.
+ * @param name name of property
+ * @param dflt default value
+ * @return a double property.
+ */
+ public double get (String name, double dflt) {
+ // use value by round if already parsed
+ double vals[] = (double[]) valByRound.get(name);
+ if (vals != null) {
+ return vals[roundNumber % vals.length];
+ }
+ // done if not by round
+ String sval = props.getProperty(name,""+dflt);
+ if (sval.indexOf(":")<0) {
+ return Double.parseDouble(sval);
+ }
+ // first time this prop is extracted by round
+ int k = sval.indexOf(":");
+ String colName = sval.substring(0,k);
+ sval = sval.substring(k+1);
+ colForValByRound.put(name,colName);
+ vals = propToDoubleArray(sval);
+ valByRound.put(name,vals);
+ return vals[roundNumber % vals.length];
+ }
+
/**
* Return a boolean property.
* If the property contain ":", e.g. "true.true.false", it is interpreted
@@ -241,7 +271,7 @@ public class Config {
return roundNumber;
}
- // extract properties to array, e.g. for "10.100.5" return int[]{10,100,5}.
+ // extract properties to array, e.g. for "10:100:5" return int[]{10,100,5}.
private int[] propToIntArray (String s) {
if (s.indexOf(":")<0) {
return new int [] { Integer.parseInt(s) };
@@ -260,7 +290,26 @@ public class Config {
return res;
}
- // extract properties to array, e.g. for "true.true.false" return booleab[]{true,false,false}.
+ // extract properties to array, e.g. for "10.7:100.4:-2.3" return int[]{10.7,100.4,-2.3}.
+ private double[] propToDoubleArray (String s) {
+ if (s.indexOf(":")<0) {
+ return new double [] { Double.parseDouble(s) };
+ }
+
+ ArrayList a = new ArrayList();
+ StringTokenizer st = new StringTokenizer(s,":");
+ while (st.hasMoreTokens()) {
+ String t = st.nextToken();
+ a.add(new Double(t));
+ }
+ double res[] = new double[a.size()];
+ for (int i=0; ibelow for changing the {@link
- MergeScheduler}).
+ run with a background thread so as not to block the
+ addDocument calls (see below
+ for changing the {@link MergeScheduler}).
The optional autoCommit
argument to the
@@ -153,10 +153,10 @@ import java.util.Map.Entry;
select which merges to do, if any, and return a {@link
MergePolicy.MergeSpecification} describing the merges. It
also selects merges to do for optimize(). (The default is
- {@link LogDocMergePolicy}. Then, the {@link
+ {@link LogByteMergePolicy}. Then, the {@link
MergeScheduler} is invoked with the requested merges and
it decides when and how to run the merges. The default is
- {@link SerialMergeScheduler}.
+ {@link ConcurrentMergeScheduler}.
*/
/*
@@ -205,22 +205,16 @@ public class IndexWriter {
public final static int DEFAULT_MERGE_FACTOR = LogMergePolicy.DEFAULT_MERGE_FACTOR;
/**
- * Default value is 10. Change using {@link #setMaxBufferedDocs(int)}.
+ * Default value is 0 (because IndexWriter flushes by RAM
+ * usage by default). Change using {@link #setMaxBufferedDocs(int)}.
*/
-
- public final static int DEFAULT_MAX_BUFFERED_DOCS = 10;
- /* new merge policy
public final static int DEFAULT_MAX_BUFFERED_DOCS = 0;
- */
/**
- * Default value is 0 MB (which means flush only by doc
- * count). Change using {@link #setRAMBufferSizeMB}.
+ * Default value is 16 MB (which means flush when buffered
+ * docs consume 16 MB RAM). Change using {@link #setRAMBufferSizeMB}.
*/
- public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 0.0;
- /* new merge policy
public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 16.0;
- */
/**
* Default value is 1000. Change using {@link #setMaxBufferedDeleteTerms(int)}.
@@ -281,8 +275,8 @@ public class IndexWriter {
// merges
private HashSet mergingSegments = new HashSet();
- private MergePolicy mergePolicy = new LogDocMergePolicy();
- private MergeScheduler mergeScheduler = new SerialMergeScheduler();
+ private MergePolicy mergePolicy = new LogByteSizeMergePolicy();
+ private MergeScheduler mergeScheduler = new ConcurrentMergeScheduler();
private LinkedList pendingMerges = new LinkedList();
private Set runningMerges = new HashSet();
private List mergeExceptions = new ArrayList();
@@ -1136,6 +1130,9 @@ public class IndexWriter {
rollbackSegmentInfos = null;
}
+ if (infoStream != null)
+ message("at close: " + segString());
+
if (writeLock != null) {
writeLock.release(); // release write lock
writeLock = null;
@@ -2252,7 +2249,7 @@ public class IndexWriter {
// apply to more than just the last flushed segment
boolean flushDeletes = docWriter.hasDeletes();
- if (infoStream != null)
+ if (infoStream != null) {
message(" flush: segment=" + docWriter.getSegment() +
" docStoreSegment=" + docWriter.getDocStoreSegment() +
" docStoreOffset=" + docWriter.getDocStoreOffset() +
@@ -2261,6 +2258,8 @@ public class IndexWriter {
" flushDocStores=" + flushDocStores +
" numDocs=" + numDocs +
" numBufDelTerms=" + docWriter.getNumBufferedDeleteTerms());
+ message(" index before flush " + segString());
+ }
int docStoreOffset = docWriter.getDocStoreOffset();
boolean docStoreIsCompoundFile = false;
diff --git a/src/java/org/apache/lucene/index/MergeScheduler.java b/src/java/org/apache/lucene/index/MergeScheduler.java
index 244af432d15..336f9a95092 100644
--- a/src/java/org/apache/lucene/index/MergeScheduler.java
+++ b/src/java/org/apache/lucene/index/MergeScheduler.java
@@ -22,7 +22,7 @@ import java.io.IOException;
/** Expert: {@link IndexWriter} uses an instance
* implementing this interface to execute the merges
* selected by a {@link MergePolicy}. The default
- * MergeScheduler is {@link SerialMergeScheduler}. */
+ * MergeScheduler is {@link ConcurrentMergeScheduler}. */
public interface MergeScheduler {
diff --git a/src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java b/src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java
index 53d94da52b2..38a031865c2 100755
--- a/src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java
+++ b/src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java
@@ -326,7 +326,9 @@ public class TestAddIndexesNoOptimize extends TestCase {
private IndexWriter newWriter(Directory dir, boolean create)
throws IOException {
- return new IndexWriter(dir, new WhitespaceAnalyzer(), create);
+ final IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), create);
+ writer.setMergePolicy(new LogDocMergePolicy());
+ return writer;
}
private void addDocs(IndexWriter writer, int numDocs) throws IOException {
diff --git a/src/test/org/apache/lucene/index/TestAtomicUpdate.java b/src/test/org/apache/lucene/index/TestAtomicUpdate.java
index a1de0febd2a..c2a1d9b6107 100644
--- a/src/test/org/apache/lucene/index/TestAtomicUpdate.java
+++ b/src/test/org/apache/lucene/index/TestAtomicUpdate.java
@@ -127,6 +127,7 @@ public class TestAtomicUpdate extends TestCase {
d.add(new Field("contents", English.intToEnglish(i), Field.Store.NO, Field.Index.TOKENIZED));
writer.addDocument(d);
}
+ writer.flush();
IndexerThread indexerThread = new IndexerThread(writer, threads);
threads[0] = indexerThread;
diff --git a/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java b/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java
index 352c88d467d..1c8c0721ee4 100644
--- a/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java
+++ b/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java
@@ -113,10 +113,13 @@ public class TestConcurrentMergeScheduler extends TestCase {
ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
writer.setMergeScheduler(cms);
+ LogDocMergePolicy mp = new LogDocMergePolicy();
+ writer.setMergePolicy(mp);
+
// Force degenerate merging so we can get a mix of
// merging of segments with and without deletes at the
// start:
- ((LogDocMergePolicy) writer.getMergePolicy()).setMinMergeDocs(1000);
+ mp.setMinMergeDocs(1000);
Document doc = new Document();
Field idField = new Field("id", "", Field.Store.YES, Field.Index.UN_TOKENIZED);
diff --git a/src/test/org/apache/lucene/index/TestIndexModifier.java b/src/test/org/apache/lucene/index/TestIndexModifier.java
index ed87bde1036..cc61c9e0dd0 100644
--- a/src/test/org/apache/lucene/index/TestIndexModifier.java
+++ b/src/test/org/apache/lucene/index/TestIndexModifier.java
@@ -75,10 +75,7 @@ public class TestIndexModifier extends TestCase {
// Lucene defaults:
assertNull(i.getInfoStream());
assertTrue(i.getUseCompoundFile());
- /* new merge policy
assertEquals(0, i.getMaxBufferedDocs());
- */
- assertEquals(10, i.getMaxBufferedDocs());
assertEquals(10000, i.getMaxFieldLength());
assertEquals(10, i.getMergeFactor());
// test setting properties:
diff --git a/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java b/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java
index 74e9099a2be..10c35ac2192 100755
--- a/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java
+++ b/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java
@@ -37,6 +37,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(10);
+ writer.setMergePolicy(new LogDocMergePolicy());
for (int i = 0; i < 100; i++) {
addDoc(writer);
@@ -53,6 +54,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(10);
+ writer.setMergePolicy(new LogDocMergePolicy());
boolean noOverMerge = false;
for (int i = 0; i < 100; i++) {
@@ -74,19 +76,18 @@ public class TestIndexWriterMergePolicy extends TestCase {
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(10);
- MergePolicy mp = writer.getMergePolicy();
- if (mp instanceof LogDocMergePolicy)
- ((LogDocMergePolicy) mp).setMinMergeDocs(100);
+ LogDocMergePolicy mp = new LogDocMergePolicy();
+ mp.setMinMergeDocs(100);
+ writer.setMergePolicy(mp);
for (int i = 0; i < 100; i++) {
addDoc(writer);
writer.close();
writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false);
- mp = writer.getMergePolicy();
writer.setMaxBufferedDocs(10);
- if (mp instanceof LogDocMergePolicy)
- ((LogDocMergePolicy) mp).setMinMergeDocs(100);
+ writer.setMergePolicy(mp);
+ mp.setMinMergeDocs(100);
writer.setMergeFactor(10);
checkInvariants(writer);
}
@@ -101,6 +102,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(100);
+ writer.setMergePolicy(new LogDocMergePolicy());
for (int i = 0; i < 250; i++) {
addDoc(writer);
@@ -126,6 +128,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(101);
writer.setMergeFactor(101);
+ writer.setMergePolicy(new LogDocMergePolicy());
// leftmost* segment has 1 doc
// rightmost* segment has 100 docs
@@ -139,6 +142,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false);
writer.setMaxBufferedDocs(101);
writer.setMergeFactor(101);
+ writer.setMergePolicy(new LogDocMergePolicy());
}
writer.setMaxBufferedDocs(10);
@@ -164,6 +168,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
Directory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
+ writer.setMergePolicy(new LogDocMergePolicy());
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(100);
@@ -178,6 +183,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
reader.close();
writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false);
+ writer.setMergePolicy(new LogDocMergePolicy());
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(5);