diff --git a/CHANGES.txt b/CHANGES.txt index 64cb170b345..82e5ed5c559 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -6,6 +6,25 @@ $Id$ Changes in runtime behavior + 1. LUCENE-994: Defaults for IndexWriter have been changed to maximize + out-of-the-box indexing speed. First, IndexWriter now flushes by + RAM usage (16 MB by default) instead of a fixed doc count (call + IndexWriter.setMaxBufferedDocs to get backwards compatible + behavior). Second, ConcurrentMergeScheduler is used to run merges + using background threads (call IndexWriter.setMergeScheduler(new + SerialMergeScheduler()) to get backwards compatible behavior). + Third, merges are chosen based on size in bytes of each segment + rather than document count of each segment (call + IndexWriter.setMergePolicy(new LogDocMergePolicy()) to get + backwards compatible behavior). + + NOTE: users of ParallelReader must change back all of these + defaults in order to ensure the docIDs "align" across all parallel + indices. + + (Mike McCandless) + + API Changes 1. LUCENE-843: Added IndexWriter.setRAMBufferSizeMB(...) 
to have diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java index 4c3cd48fccd..eb72ff4984b 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java @@ -49,18 +49,20 @@ public class CreateIndexTask extends PerfTask { int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR); int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED); int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH); - double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB); + double flushAtRAMUsage = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB); boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT); IndexWriter iw = new IndexWriter(dir, autoCommit, analyzer, true); iw.setUseCompoundFile(cmpnd); iw.setMergeFactor(mrgf); - iw.setMaxBufferedDocs(mxbf); iw.setMaxFieldLength(mxfl); if (flushAtRAMUsage > 0) iw.setRAMBufferSizeMB(flushAtRAMUsage); - + else if (mxbf != 0) + iw.setMaxBufferedDocs(mxbf); + else + throw new RuntimeException("either max.buffered or ram.flush.mb must be non-zero"); getRunData().setIndexWriter(iw); return 1; } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java index 7e1b1b2ea1b..4d4910a7d04 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.benchmark.byTask.PerfRunData; import 
org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.LogMergePolicy; import org.apache.lucene.store.Directory; import java.io.IOException; @@ -35,10 +36,10 @@ import java.io.IOException; */ public class OpenIndexTask extends PerfTask { - public static final int DEFAULT_MAX_BUFFERED = 10; - public static final int DEFAULT_MAX_FIELD_LENGTH = 10000; - public static final int DEFAULT_MERGE_PFACTOR = 10; - public static final int DEFAULT_RAM_FLUSH_MB = 0; + public static final int DEFAULT_MAX_BUFFERED = IndexWriter.DEFAULT_MAX_BUFFERED_DOCS; + public static final int DEFAULT_MAX_FIELD_LENGTH = IndexWriter.DEFAULT_MAX_FIELD_LENGTH; + public static final int DEFAULT_MERGE_PFACTOR = LogMergePolicy.DEFAULT_MERGE_FACTOR; + public static final double DEFAULT_RAM_FLUSH_MB = IndexWriter.DEFAULT_RAM_BUFFER_SIZE_MB; public static final boolean DEFAULT_AUTO_COMMIT = true; public OpenIndexTask(PerfRunData runData) { @@ -55,12 +56,17 @@ public class OpenIndexTask extends PerfTask { int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR); int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED); int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH); - double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB); - boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT); + double flushAtRAMUsage = config.get("ram.flush.mb", DEFAULT_RAM_FLUSH_MB); + boolean autoCommit = config.get("autocommit", DEFAULT_AUTO_COMMIT); IndexWriter writer = new IndexWriter(dir, autoCommit, analyzer, false); // must update params for newly opened writer - writer.setMaxBufferedDocs(mxbf); + if (flushAtRAMUsage > 0) + writer.setRAMBufferSizeMB(flushAtRAMUsage); + else if (mxbf != 0) + writer.setMaxBufferedDocs(mxbf); + else + throw new RuntimeException("either max.buffered or ram.flush.mb must be non-zero"); writer.setMaxFieldLength(mxfl);
writer.setMergeFactor(mrgf); writer.setUseCompoundFile(cmpnd); // this one redundant? diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java index 798786b53b9..22ab53b91a4 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java @@ -175,6 +175,36 @@ public class Config { return vals[roundNumber % vals.length]; } + /** + * Return a double property. + * If the property contain ":", e.g. "10:100:5", it is interpreted + * as array of doubles. It is extracted once, on first call + * to get() it, and a by-round-value is returned. + * @param name name of property + * @param dflt default value + * @return a double property. + */ + public double get (String name, double dflt) { + // use value by round if already parsed + double vals[] = (double[]) valByRound.get(name); + if (vals != null) { + return vals[roundNumber % vals.length]; + } + // done if not by round + String sval = props.getProperty(name,""+dflt); + if (sval.indexOf(":")<0) { + return Double.parseDouble(sval); + } + // first time this prop is extracted by round + int k = sval.indexOf(":"); + String colName = sval.substring(0,k); + sval = sval.substring(k+1); + colForValByRound.put(name,colName); + vals = propToDoubleArray(sval); + valByRound.put(name,vals); + return vals[roundNumber % vals.length]; + } + /** * Return a boolean property. * If the property contain ":", e.g. "true.true.false", it is interpreted @@ -241,7 +271,7 @@ public class Config { return roundNumber; } - // extract properties to array, e.g. for "10.100.5" return int[]{10,100,5}. + // extract properties to array, e.g. for "10:100:5" return int[]{10,100,5}. 
private int[] propToIntArray (String s) { if (s.indexOf(":")<0) { return new int [] { Integer.parseInt(s) }; @@ -260,7 +290,26 @@ return res; } - // extract properties to array, e.g. for "true.true.false" return booleab[]{true,false,false}. + // extract properties to array, e.g. for "10.7:100.4:-2.3" return double[]{10.7,100.4,-2.3}. + private double[] propToDoubleArray (String s) { + if (s.indexOf(":")<0) { + return new double [] { Double.parseDouble(s) }; + } + + ArrayList a = new ArrayList(); + StringTokenizer st = new StringTokenizer(s,":"); + while (st.hasMoreTokens()) { + String t = st.nextToken(); + a.add(new Double(t)); + } + double res[] = new double[a.size()]; + for (int i=0; ibelow for changing the {@link - MergeScheduler}).

+ run with a background thread so as not to block the + addDocument calls (see below + for changing the {@link MergeScheduler}).

The optional autoCommit argument to the @@ -153,10 +153,10 @@ import java.util.Map.Entry; select which merges to do, if any, and return a {@link MergePolicy.MergeSpecification} describing the merges. It also selects merges to do for optimize(). (The default is - {@link LogDocMergePolicy}. Then, the {@link + {@link LogByteSizeMergePolicy}. Then, the {@link MergeScheduler} is invoked with the requested merges and it decides when and how to run the merges. The default is - {@link SerialMergeScheduler}.

+ {@link ConcurrentMergeScheduler}.

*/ /* @@ -205,22 +205,16 @@ public class IndexWriter { public final static int DEFAULT_MERGE_FACTOR = LogMergePolicy.DEFAULT_MERGE_FACTOR; /** - * Default value is 10. Change using {@link #setMaxBufferedDocs(int)}. + * Default value is 0 (because IndexWriter flushes by RAM + * usage by default). Change using {@link #setMaxBufferedDocs(int)}. */ - - public final static int DEFAULT_MAX_BUFFERED_DOCS = 10; - /* new merge policy public final static int DEFAULT_MAX_BUFFERED_DOCS = 0; - */ /** - * Default value is 0 MB (which means flush only by doc - * count). Change using {@link #setRAMBufferSizeMB}. + * Default value is 16 MB (which means flush when buffered + * docs consume 16 MB RAM). Change using {@link #setRAMBufferSizeMB}. */ - public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 0.0; - /* new merge policy public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 16.0; - */ /** * Default value is 1000. Change using {@link #setMaxBufferedDeleteTerms(int)}. @@ -281,8 +275,8 @@ public class IndexWriter { // merges private HashSet mergingSegments = new HashSet(); - private MergePolicy mergePolicy = new LogDocMergePolicy(); - private MergeScheduler mergeScheduler = new SerialMergeScheduler(); + private MergePolicy mergePolicy = new LogByteSizeMergePolicy(); + private MergeScheduler mergeScheduler = new ConcurrentMergeScheduler(); private LinkedList pendingMerges = new LinkedList(); private Set runningMerges = new HashSet(); private List mergeExceptions = new ArrayList(); @@ -1136,6 +1130,9 @@ public class IndexWriter { rollbackSegmentInfos = null; } + if (infoStream != null) + message("at close: " + segString()); + if (writeLock != null) { writeLock.release(); // release write lock writeLock = null; @@ -2252,7 +2249,7 @@ public class IndexWriter { // apply to more than just the last flushed segment boolean flushDeletes = docWriter.hasDeletes(); - if (infoStream != null) + if (infoStream != null) { message(" flush: segment=" + docWriter.getSegment() + " 
docStoreSegment=" + docWriter.getDocStoreSegment() + " docStoreOffset=" + docWriter.getDocStoreOffset() + @@ -2261,6 +2258,8 @@ public class IndexWriter { " flushDocStores=" + flushDocStores + " numDocs=" + numDocs + " numBufDelTerms=" + docWriter.getNumBufferedDeleteTerms()); + message(" index before flush " + segString()); + } int docStoreOffset = docWriter.getDocStoreOffset(); boolean docStoreIsCompoundFile = false; diff --git a/src/java/org/apache/lucene/index/MergeScheduler.java b/src/java/org/apache/lucene/index/MergeScheduler.java index 244af432d15..336f9a95092 100644 --- a/src/java/org/apache/lucene/index/MergeScheduler.java +++ b/src/java/org/apache/lucene/index/MergeScheduler.java @@ -22,7 +22,7 @@ import java.io.IOException; /** Expert: {@link IndexWriter} uses an instance * implementing this interface to execute the merges * selected by a {@link MergePolicy}. The default - * MergeScheduler is {@link SerialMergeScheduler}. */ + * MergeScheduler is {@link ConcurrentMergeScheduler}. 
*/ public interface MergeScheduler { diff --git a/src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java b/src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java index 53d94da52b2..38a031865c2 100755 --- a/src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java +++ b/src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java @@ -326,7 +326,9 @@ public class TestAddIndexesNoOptimize extends TestCase { private IndexWriter newWriter(Directory dir, boolean create) throws IOException { - return new IndexWriter(dir, new WhitespaceAnalyzer(), create); + final IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), create); + writer.setMergePolicy(new LogDocMergePolicy()); + return writer; } private void addDocs(IndexWriter writer, int numDocs) throws IOException { diff --git a/src/test/org/apache/lucene/index/TestAtomicUpdate.java b/src/test/org/apache/lucene/index/TestAtomicUpdate.java index a1de0febd2a..c2a1d9b6107 100644 --- a/src/test/org/apache/lucene/index/TestAtomicUpdate.java +++ b/src/test/org/apache/lucene/index/TestAtomicUpdate.java @@ -127,6 +127,7 @@ public class TestAtomicUpdate extends TestCase { d.add(new Field("contents", English.intToEnglish(i), Field.Store.NO, Field.Index.TOKENIZED)); writer.addDocument(d); } + writer.flush(); IndexerThread indexerThread = new IndexerThread(writer, threads); threads[0] = indexerThread; diff --git a/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java b/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java index 352c88d467d..1c8c0721ee4 100644 --- a/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java +++ b/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java @@ -113,10 +113,13 @@ public class TestConcurrentMergeScheduler extends TestCase { ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler(); writer.setMergeScheduler(cms); + LogDocMergePolicy mp = new LogDocMergePolicy(); + writer.setMergePolicy(mp); + // Force 
degenerate merging so we can get a mix of // merging of segments with and without deletes at the // start: - ((LogDocMergePolicy) writer.getMergePolicy()).setMinMergeDocs(1000); + mp.setMinMergeDocs(1000); Document doc = new Document(); Field idField = new Field("id", "", Field.Store.YES, Field.Index.UN_TOKENIZED); diff --git a/src/test/org/apache/lucene/index/TestIndexModifier.java b/src/test/org/apache/lucene/index/TestIndexModifier.java index ed87bde1036..cc61c9e0dd0 100644 --- a/src/test/org/apache/lucene/index/TestIndexModifier.java +++ b/src/test/org/apache/lucene/index/TestIndexModifier.java @@ -75,10 +75,7 @@ public class TestIndexModifier extends TestCase { // Lucene defaults: assertNull(i.getInfoStream()); assertTrue(i.getUseCompoundFile()); - /* new merge policy assertEquals(0, i.getMaxBufferedDocs()); - */ - assertEquals(10, i.getMaxBufferedDocs()); assertEquals(10000, i.getMaxFieldLength()); assertEquals(10, i.getMergeFactor()); // test setting properties: diff --git a/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java b/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java index 74e9099a2be..10c35ac2192 100755 --- a/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java +++ b/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java @@ -37,6 +37,7 @@ public class TestIndexWriterMergePolicy extends TestCase { IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true); writer.setMaxBufferedDocs(10); writer.setMergeFactor(10); + writer.setMergePolicy(new LogDocMergePolicy()); for (int i = 0; i < 100; i++) { addDoc(writer); @@ -53,6 +54,7 @@ public class TestIndexWriterMergePolicy extends TestCase { IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true); writer.setMaxBufferedDocs(10); writer.setMergeFactor(10); + writer.setMergePolicy(new LogDocMergePolicy()); boolean noOverMerge = false; for (int i = 0; i < 100; i++) { @@ -74,19 +76,18 @@ public class TestIndexWriterMergePolicy 
extends TestCase { IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true); writer.setMaxBufferedDocs(10); writer.setMergeFactor(10); - MergePolicy mp = writer.getMergePolicy(); - if (mp instanceof LogDocMergePolicy) - ((LogDocMergePolicy) mp).setMinMergeDocs(100); + LogDocMergePolicy mp = new LogDocMergePolicy(); + mp.setMinMergeDocs(100); + writer.setMergePolicy(mp); for (int i = 0; i < 100; i++) { addDoc(writer); writer.close(); writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false); - mp = writer.getMergePolicy(); writer.setMaxBufferedDocs(10); - if (mp instanceof LogDocMergePolicy) - ((LogDocMergePolicy) mp).setMinMergeDocs(100); + writer.setMergePolicy(mp); + mp.setMinMergeDocs(100); writer.setMergeFactor(10); checkInvariants(writer); } @@ -101,6 +102,7 @@ public class TestIndexWriterMergePolicy extends TestCase { IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true); writer.setMaxBufferedDocs(10); writer.setMergeFactor(100); + writer.setMergePolicy(new LogDocMergePolicy()); for (int i = 0; i < 250; i++) { addDoc(writer); @@ -126,6 +128,7 @@ public class TestIndexWriterMergePolicy extends TestCase { IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true); writer.setMaxBufferedDocs(101); writer.setMergeFactor(101); + writer.setMergePolicy(new LogDocMergePolicy()); // leftmost* segment has 1 doc // rightmost* segment has 100 docs @@ -139,6 +142,7 @@ public class TestIndexWriterMergePolicy extends TestCase { writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false); writer.setMaxBufferedDocs(101); writer.setMergeFactor(101); + writer.setMergePolicy(new LogDocMergePolicy()); } writer.setMaxBufferedDocs(10); @@ -164,6 +168,7 @@ public class TestIndexWriterMergePolicy extends TestCase { Directory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true); + writer.setMergePolicy(new LogDocMergePolicy()); writer.setMaxBufferedDocs(10); 
writer.setMergeFactor(100); @@ -178,6 +183,7 @@ public class TestIndexWriterMergePolicy extends TestCase { reader.close(); writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false); + writer.setMergePolicy(new LogDocMergePolicy()); writer.setMaxBufferedDocs(10); writer.setMergeFactor(5);