SOLR-342: Added support for Lucene's new index writer options

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@634016 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2008-03-05 20:23:36 +00:00
parent 9c3841c013
commit 51f1319500
10 changed files with 218 additions and 61 deletions

View File

@@ -228,6 +228,17 @@ Optimizations
5. SOLR-377: Speed up response writers. (yonik)
6. SOLR-342: Added support in SolrIndexWriter for several new features of Lucene's
IndexWriter, including setRAMBufferSizeMB(), setMergePolicy(), and setMergeScheduler(). Also added support
for specifying Lucene's autoCommit functionality (not to be confused with Solr's similarly named autoCommit functionality) via
the <luceneAutoCommit> config item. See the test and the example solrconfig.xml <indexDefaults> section for usage. Indexing
performance should increase significantly when moving up to Lucene 2.3, thanks to its new indexing capabilities. Furthermore,
setRAMBufferSizeMB() makes indexing-related tuning decisions more intuitive. For best performance, leave
mergePolicy and mergeScheduler at their defaults and set ramBufferSizeMB instead of maxBufferedDocs. The best value
depends on the types of documents in use; 32 is a good starting point, but reports show that up to 48 MB provides
good results. Note that it is acceptable to set both ramBufferSizeMB and maxBufferedDocs; Lucene will flush based on whichever
limit is reached first. (gsingers)
Bug Fixes
1. Make TextField respect sortMissingFirst and sortMissingLast fields.
(J.J. Larrea via yonik)
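
(An aside on the SOLR-342 entry above: the new options correspond roughly to the following Lucene 2.3 IndexWriter calls. This is a minimal, hypothetical sketch; the index path and analyzer are placeholders, not part of this commit.)

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.store.FSDirectory;

public class IndexWriterTuningSketch {
  public static void main(String[] args) throws Exception {
    // autoCommit=false mirrors <luceneAutoCommit>false</luceneAutoCommit>
    IndexWriter writer = new IndexWriter(
        FSDirectory.getDirectory("/tmp/sketch-index"), // placeholder path
        false, new StandardAnalyzer(), true);
    writer.setRAMBufferSizeMB(32.0);                          // <ramBufferSizeMB>
    writer.setMergePolicy(new LogByteSizeMergePolicy());      // <mergePolicy>
    writer.setMergeScheduler(new ConcurrentMergeScheduler()); // <mergeScheduler>
    writer.close();
  }
}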

View File

@@ -34,8 +34,8 @@
<indexDefaults>
<!-- Values here affect all index writers and act as a default unless overridden. -->
<useCompoundFile>false</useCompoundFile>
<ramBufferSizeMB>32</ramBufferSizeMB>
<mergeFactor>10</mergeFactor>
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>
<writeLockTimeout>1000</writeLockTimeout>
@@ -45,8 +45,8 @@
<mainIndex>
<!-- options specific to the main on-disk lucene index -->
<useCompoundFile>false</useCompoundFile>
<ramBufferSizeMB>32</ramBufferSizeMB>
<mergeFactor>10</mergeFactor>
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>

View File

@@ -36,8 +36,8 @@
<indexDefaults>
<!-- Values here affect all index writers and act as a default unless overridden. -->
<useCompoundFile>false</useCompoundFile>
<ramBufferSizeMB>32</ramBufferSizeMB>
<mergeFactor>10</mergeFactor>
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>
<writeLockTimeout>1000</writeLockTimeout>
@@ -47,8 +47,8 @@
<mainIndex>
<!-- options specific to the main on-disk lucene index -->
<useCompoundFile>false</useCompoundFile>
<ramBufferSizeMB>32</ramBufferSizeMB>
<mergeFactor>10</mergeFactor>
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>

View File

@@ -35,12 +35,53 @@
<indexDefaults>
<!-- Values here affect all index writers and act as a default unless overridden. -->
<useCompoundFile>false</useCompoundFile>
<mergeFactor>10</mergeFactor>
<!--
If both ramBufferSizeMB and maxBufferedDocs are set, then Lucene will flush based on whichever limit is hit first.
-->
<!--<maxBufferedDocs>1000</maxBufferedDocs>-->
<!-- Tell Lucene when to flush documents to disk.
Giving Lucene more memory for indexing means faster indexing at the cost of more RAM.
If both ramBufferSizeMB and maxBufferedDocs are set, then Lucene will flush based on whichever limit is hit first.
-->
<ramBufferSizeMB>32</ramBufferSizeMB>
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>
<writeLockTimeout>1000</writeLockTimeout>
<commitLockTimeout>10000</commitLockTimeout>
<!--
Expert: Turn on Lucene's auto commit capability.
TODO: Add recommendations on why you would want to do this.
NOTE: Despite the name, this value does not have any relation to Solr's autoCommit functionality
-->
<!--<luceneAutoCommit>false</luceneAutoCommit>-->
<!--
Expert:
The Merge Policy in Lucene controls how merging is handled by Lucene. The default in 2.3 is LogByteSizeMergePolicy; previous
versions used LogDocMergePolicy.
LogByteSizeMergePolicy chooses segments to merge based on their size, while the Lucene 2.2 default, LogDocMergePolicy, chose
merges based on the number of documents.
Other implementations of MergePolicy must have a no-argument constructor.
-->
<!--<mergePolicy>org.apache.lucene.index.LogByteSizeMergePolicy</mergePolicy>-->
<!--
Expert:
The Merge Scheduler in Lucene controls how merges are performed. The ConcurrentMergeScheduler (Lucene 2.3 default)
can perform merges in the background using separate threads. The SerialMergeScheduler (Lucene 2.2 default) does not.
-->
<!--<mergeScheduler>org.apache.lucene.index.ConcurrentMergeScheduler</mergeScheduler>-->
<!--
As long as Solr is the only process modifying your index, it is
safe to use Lucene's in process locking mechanism. But you may
@@ -60,8 +101,10 @@
<mainIndex>
<!-- options specific to the main on-disk lucene index -->
<useCompoundFile>false</useCompoundFile>
<ramBufferSizeMB>32</ramBufferSizeMB>
<mergeFactor>10</mergeFactor>
<!-- Deprecated -->
<!--<maxBufferedDocs>1000</maxBufferedDocs>-->
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>
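
(The no-argument-constructor requirement mentioned in the <mergePolicy> comment above exists because Solr instantiates the configured class reflectively. A rough sketch of the idea, as a hypothetical standalone class with the class name hard-coded for illustration:)

import org.apache.lucene.index.MergePolicy;

public class MergePolicyLoadingSketch {
  public static void main(String[] args) throws Exception {
    // The configured class name is loaded and instantiated via its
    // no-argument constructor, then cast to MergePolicy.
    String className = "org.apache.lucene.index.LogByteSizeMergePolicy";
    MergePolicy policy = (MergePolicy) Class.forName(className).newInstance();
    System.out.println("Loaded merge policy: " + policy.getClass().getName());
  }
}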

View File

@@ -241,6 +241,16 @@ public class Config {
return val!=null ? Float.parseFloat(val) : def;
}
public double getDouble(String path){
return Double.parseDouble(getVal(path, true));
}
public double getDouble(String path, double def) {
String val = getVal(path, false);
return val!=null ? Double.parseDouble(val) : def;
}
// The following functions were moved to ResourceLoader
//-----------------------------------------------------------------------------
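
(A brief usage sketch of the new getDouble() accessors; this is a hypothetical call site, and the path and default value are illustrative:)

// assuming a Config instance, e.g. a SolrConfig named config:
double ram = config.getDouble("indexDefaults/ramBufferSizeMB");            // errors if the value is missing
double ramOr16 = config.getDouble("indexDefaults/ramBufferSizeMB", 16.0);  // falls back to 16.0 when unset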

View File

@@ -18,6 +18,8 @@
package org.apache.solr.update;
import org.apache.solr.core.SolrConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.ConcurrentMergeScheduler;
//
// For performance reasons, we don't want to re-read
@@ -30,26 +32,38 @@ import org.apache.solr.core.SolrConfig;
public class SolrIndexConfig {
public static final String defaultsName = "indexDefaults";
static final SolrIndexConfig defaultDefaults = new SolrIndexConfig();
public static final String DEFAULT_MERGE_POLICY_CLASSNAME = LogByteSizeMergePolicy.class.getName();
public static final String DEFAULT_MERGE_SCHEDULER_CLASSNAME = ConcurrentMergeScheduler.class.getName();
private SolrIndexConfig() {
useCompoundFile = true;
maxBufferedDocs = -1;
maxMergeDocs = -1;
mergeFactor = -1;
ramBufferSizeMB = 16;
maxFieldLength = -1;
writeLockTimeout = -1;
commitLockTimeout = -1;
lockType = null;
mergePolicyClassName = DEFAULT_MERGE_POLICY_CLASSNAME;
mergeSchedulerClassname = DEFAULT_MERGE_SCHEDULER_CLASSNAME;
luceneAutoCommit = false;
}
public final boolean useCompoundFile;
public final int maxBufferedDocs;
public final int maxMergeDocs;
public final int mergeFactor;
public final double ramBufferSizeMB;
public final int maxFieldLength;
public final int writeLockTimeout;
public final int commitLockTimeout;
public final String lockType;
public final String mergePolicyClassName;
public final String mergeSchedulerClassname;
public final boolean luceneAutoCommit;
public SolrIndexConfig(SolrConfig solrConfig, String prefix, SolrIndexConfig def) {
if (prefix == null)
@@ -60,9 +74,14 @@ public class SolrIndexConfig {
maxBufferedDocs=solrConfig.getInt(prefix+"/maxBufferedDocs",def.maxBufferedDocs);
maxMergeDocs=solrConfig.getInt(prefix+"/maxMergeDocs",def.maxMergeDocs);
mergeFactor=solrConfig.getInt(prefix+"/mergeFactor",def.mergeFactor);
ramBufferSizeMB = solrConfig.getDouble(prefix+"/ramBufferSizeMB", def.ramBufferSizeMB);
maxFieldLength=solrConfig.getInt(prefix+"/maxFieldLength",def.maxFieldLength);
writeLockTimeout=solrConfig.getInt(prefix+"/writeLockTimeout", def.writeLockTimeout);
commitLockTimeout=solrConfig.getInt(prefix+"/commitLockTimeout", def.commitLockTimeout);
lockType=solrConfig.get(prefix+"/lockType", def.lockType);
mergePolicyClassName = solrConfig.get(prefix + "/mergePolicy", def.mergePolicyClassName);
mergeSchedulerClassname = solrConfig.get(prefix + "/mergeScheduler", def.mergeSchedulerClassname);
luceneAutoCommit = solrConfig.getBool(prefix + "/luceneAutoCommit", def.luceneAutoCommit);
}
}
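
(As a usage sketch, the two-level defaulting is roughly wired like this; the argument values are assumptions inferred from the prefix handling above, not verbatim from this commit:)

// Sketch: read <indexDefaults> first (a null prefix falls back to "indexDefaults"),
// then let <mainIndex> values override those defaults.
SolrIndexConfig defaults = new SolrIndexConfig(solrConfig, null, null);
SolrIndexConfig main = new SolrIndexConfig(solrConfig, "mainIndex", defaults);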

View File

@@ -18,6 +18,9 @@
package org.apache.solr.update;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NativeFSLockFactory;
@@ -44,7 +47,7 @@ public class SolrIndexWriter extends IndexWriter {
String name;
IndexSchema schema;
private void init(String name, IndexSchema schema, SolrIndexConfig config) throws IOException {
log.fine("Opened Writer " + name);
this.name = name;
this.schema = schema;
@@ -53,79 +56,98 @@ public class SolrIndexWriter extends IndexWriter {
if (config != null) {
setUseCompoundFile(config.useCompoundFile);
//only set maxBufferedDocs
if (config.maxBufferedDocs != -1) {
setMaxBufferedDocs(config.maxBufferedDocs);
}
if (config.ramBufferSizeMB != -1) {
setRAMBufferSizeMB(config.ramBufferSizeMB);
}
if (config.maxMergeDocs != -1) setMaxMergeDocs(config.maxMergeDocs);
if (config.mergeFactor != -1) setMergeFactor(config.mergeFactor);
if (config.maxFieldLength != -1) setMaxFieldLength(config.maxFieldLength);
if (config.mergePolicyClassName != null && !SolrIndexConfig.DEFAULT_MERGE_POLICY_CLASSNAME.equals(config.mergePolicyClassName)) {
MergePolicy policy = (MergePolicy) schema.getSolrConfig().getResourceLoader().newInstance(config.mergePolicyClassName);
setMergePolicy(policy);///hmm, is this really the best way to get a newInstance?
}
if (config.mergeFactor != -1 && getMergePolicy() instanceof LogMergePolicy) {
setMergeFactor(config.mergeFactor);
}
if (config.mergeSchedulerClassname != null && !SolrIndexConfig.DEFAULT_MERGE_SCHEDULER_CLASSNAME.equals(config.mergeSchedulerClassname)) {
MergeScheduler scheduler = (MergeScheduler) schema.getSolrConfig().getResourceLoader().newInstance(config.mergeSchedulerClassname);
setMergeScheduler(scheduler);
}
//if (config.commitLockTimeout != -1) setWriteLockTimeout(config.commitLockTimeout);
}
}
private static Directory getDirectory(String path, SolrIndexConfig config) throws IOException {
Directory d = FSDirectory.getDirectory(path);
String rawLockType = (null == config) ? null : config.lockType;
if (null == rawLockType) {
// we default to "simple" for backwards compatibility
log.warning("No lockType configured for " + path + " assuming 'simple'");
rawLockType = "simple";
}
final String lockType = rawLockType.toLowerCase().trim();
if ("simple".equals(lockType)) {
d.setLockFactory(new SimpleFSLockFactory(path));
} else if ("native".equals(lockType)) {
d.setLockFactory(new NativeFSLockFactory(path));
} else if ("single".equals(lockType)) {
d.setLockFactory(new SingleInstanceLockFactory());
} else if ("none".equals(lockType)) {
d.setLockFactory(new NoLockFactory());
} else {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Unrecognized lockType: " + rawLockType);
}
return d;
}
public SolrIndexWriter(String name, String path, boolean create, IndexSchema schema) throws IOException {
super(getDirectory(path, null), false, schema.getAnalyzer(), create);
init(name, schema, null);
}
public SolrIndexWriter(String name, String path, boolean create, IndexSchema schema, SolrIndexConfig config) throws IOException {
super(getDirectory(path, config), config.luceneAutoCommit, schema.getAnalyzer(), create);
init(name, schema, config);
}
/**
* use DocumentBuilder now...
* private final void addField(Document doc, String name, String val) {
* SchemaField ftype = schema.getField(name);
* <p/>
* // we don't check for a null val ourselves because a solr.FieldType
* // might actually want to map it to something. If createField()
* // returns null, then we don't store the field.
* <p/>
* Field field = ftype.createField(val, boost);
* if (field != null) doc.add(field);
* }
* <p/>
* <p/>
* public void addRecord(String[] fieldNames, String[] fieldValues) throws IOException {
* Document doc = new Document();
* for (int i=0; i<fieldNames.length; i++) {
* String name = fieldNames[i];
* String val = fieldNames[i];
* <p/>
* // first null is end of list. client can reuse arrays if they want
* // and just write a single null if there is unused space.
* if (name==null) break;
* <p/>
* addField(doc,name,val);
* }
* addDocument(doc);
* }
* ****
*/
public void close() throws IOException {
log.fine("Closing Writer " + name);
@@ -134,7 +156,10 @@ public class SolrIndexWriter extends IndexWriter {
@Override
protected void finalize() {
try {
super.close();
} catch (IOException e) {
}
}
}
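
(One detail worth calling out from init() above: the instanceof guard before setMergeFactor(). A short sketch of why it is needed; the helper class is hypothetical, and the behavior described is Lucene 2.3's:)

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogMergePolicy;

public class MergeFactorSketch {
  // mergeFactor is a LogMergePolicy concept; in Lucene 2.3,
  // IndexWriter.setMergeFactor() throws IllegalArgumentException when the
  // current merge policy is not a LogMergePolicy, hence the guard.
  public static void applyMergeFactor(IndexWriter writer, int mergeFactor) {
    if (mergeFactor != -1 && writer.getMergePolicy() instanceof LogMergePolicy) {
      writer.setMergeFactor(mergeFactor);
    }
  }
}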

View File

@@ -118,8 +118,7 @@ public abstract class UpdateHandler implements SolrInfoMBean {
}
protected SolrIndexWriter createMainIndexWriter(String name, boolean removeAllExisting) throws IOException {
return new SolrIndexWriter(name,core.getIndexDir(), removeAllExisting, schema, core.getSolrConfig().mainIndexConfig);
}
protected final Term idTerm(String readableId) {

View File

@@ -18,6 +18,7 @@
package org.apache.solr.core;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.update.SolrIndexConfig;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
@@ -50,4 +51,15 @@ public class TestConfig extends AbstractSolrTestCase {
Node node = solrConfig.getNode("propTest", true);
assertEquals("prefix-proptwo-suffix", node.getTextContent());
}
public void testLucene23Upgrades() throws Exception {
double bufferSize = solrConfig.getDouble("indexDefaults/ramBufferSizeMB");
assertTrue(bufferSize + " does not equal: " + 32, bufferSize == 32);
String mergePolicy = solrConfig.get("indexDefaults/mergePolicy");
assertTrue(mergePolicy + " is not equal to " + SolrIndexConfig.DEFAULT_MERGE_POLICY_CLASSNAME, mergePolicy.equals(SolrIndexConfig.DEFAULT_MERGE_POLICY_CLASSNAME));
String mergeSched = solrConfig.get("indexDefaults/mergeScheduler");
assertTrue(mergeSched + " is not equal to " + SolrIndexConfig.DEFAULT_MERGE_SCHEDULER_CLASSNAME, mergeSched.equals(SolrIndexConfig.DEFAULT_MERGE_SCHEDULER_CLASSNAME));
boolean luceneAutoCommit = solrConfig.getBool("indexDefaults/luceneAutoCommit");
assertFalse(luceneAutoCommit + " does not equal: " + false, luceneAutoCommit);
}
}

View File

@@ -32,12 +32,50 @@
<indexDefaults>
<!-- Values here affect all index writers and act as a default unless overridden. -->
<useCompoundFile>false</useCompoundFile>
<mergeFactor>10</mergeFactor>
<!-- If both ramBufferSizeMB and maxBufferedDocs are set, then Lucene will flush based on whichever limit is hit first.
-->
<!--<maxBufferedDocs>1000</maxBufferedDocs>-->
<!-- Tell Lucene when to flush documents to disk.
Giving Lucene more memory for indexing means faster indexing at the cost of more RAM.
If both ramBufferSizeMB and maxBufferedDocs are set, then Lucene will flush based on whichever limit is hit first.
-->
<ramBufferSizeMB>32</ramBufferSizeMB>
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>
<!--
Expert: Turn on Lucene's auto commit capability.
NOTE: Despite the name, this value does not have any relation to Solr's autoCommit functionality
-->
<luceneAutoCommit>false</luceneAutoCommit>
<!--
Expert:
The Merge Policy in Lucene controls how merging is handled by Lucene. The default in 2.3 is LogByteSizeMergePolicy; previous
versions used LogDocMergePolicy.
LogByteSizeMergePolicy chooses segments to merge based on their size, while the Lucene 2.2 default, LogDocMergePolicy, chose
merges based on the number of documents.
Other implementations of MergePolicy must have a no-argument constructor.
-->
<mergePolicy>org.apache.lucene.index.LogByteSizeMergePolicy</mergePolicy>
<!--
Expert:
The Merge Scheduler in Lucene controls how merges are performed. The ConcurrentMergeScheduler (Lucene 2.3 default)
can perform merges in the background using separate threads. The SerialMergeScheduler (Lucene 2.2 default) does not.
-->
<mergeScheduler>org.apache.lucene.index.ConcurrentMergeScheduler</mergeScheduler>
<!-- these are global... can't currently override per index -->
<writeLockTimeout>1000</writeLockTimeout>
<commitLockTimeout>10000</commitLockTimeout>
@@ -49,7 +87,7 @@
<!-- lucene options specific to the main on-disk lucene index -->
<useCompoundFile>false</useCompoundFile>
<mergeFactor>10</mergeFactor>
<ramBufferSizeMB>32</ramBufferSizeMB>
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>
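
(For context on what <luceneAutoCommit> toggles: with autoCommit=false, a Lucene 2.3 writer makes its changes visible only when it is closed. A minimal sketch with a placeholder path, not part of this commit:)

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;

public class AutoCommitSketch {
  public static void main(String[] args) throws Exception {
    // autoCommit=false: flushed segments are not visible to new readers
    // until close(); autoCommit=true would commit after every flush.
    IndexWriter writer = new IndexWriter(
        FSDirectory.getDirectory("/tmp/autocommit-sketch"), // placeholder path
        false, new StandardAnalyzer(), true);
    writer.close(); // the changes are committed here
  }
}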