SOLR-342: Added support for Lucene's new index writer options

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@634016 13f79535-47bb-0310-9956-ffa450edef68
2008-03-05 20:23:36 +00:00 · 2008-03-05 20:23:36 +00:00 · 51f1319500
parent 9c3841c013
commit 51f1319500
10 changed files with 218 additions and 61 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -228,6 +228,17 @@ Optimizations
 5. SOLR-377: Speed up response writers. (yonik)
 6. SOLR-342:  Added support into the SolrIndexWriter for using several new
 features of the new LuceneIndexWriter, including: setRAMBufferSizeMB(), setMergePolicy(), setMergeScheduler.  Also, added support
 to specify Lucene's autoCommit functionality (not to be confused with Solr's similarily named autoCommit functionality) via
  the <luceneAutoCommit> config. item.  See the test and example solrconfig.xml <indexDefaults> section for usage.  Performance during
  indexing should be significantly increased by moving up to 2.3 due to Lucene's new indexing capabilities.  Furthermore,
  the setRAMBufferSizeMB makes it more logical to decide on tuning factors related to indexing.  For best performance, leave the
  mergePolicy and mergeScheduler as the defaults and set ramBufferSizeMB instead of maxBufferedDocs.  The best value for this
    depends on the types of documents in use.  32 should be a good starting point, but reports have shown up to 48 MB provides
    good results. Note, it is acceptable to set both ramBufferSizeMB and maxBufferedDocs, and Lucene will flush based on whichever
    limit is reached first. (gsingers)
 Bug Fixes
 1. Make TextField respect sortMissingFirst and sortMissingLast fields.
    (J.J. Larrea via yonik)
--- a/client/ruby/solr-ruby/solr/conf/solrconfig.xml
+++ b/client/ruby/solr-ruby/solr/conf/solrconfig.xml
@ -34,8 +34,8 @@
  <indexDefaults>
   <!-- Values here affect all index writers and act as a default unless overridden. -->
    <useCompoundFile>false</useCompoundFile>
    <ramBufferSizeMB>32</ramBufferSizeMB>
    <mergeFactor>10</mergeFactor>
    <maxBufferedDocs>1000</maxBufferedDocs>
    <maxMergeDocs>2147483647</maxMergeDocs>
    <maxFieldLength>10000</maxFieldLength>
    <writeLockTimeout>1000</writeLockTimeout>
@ -45,8 +45,8 @@
  <mainIndex>
    <!-- options specific to the main on-disk lucene index -->
    <useCompoundFile>false</useCompoundFile>
    <ramBufferSizeMB>32</ramBufferSizeMB>
    <mergeFactor>10</mergeFactor>
    <maxBufferedDocs>1000</maxBufferedDocs>
    <maxMergeDocs>2147483647</maxMergeDocs>
    <maxFieldLength>10000</maxFieldLength>
--- a/client/ruby/solr-ruby/test/conf/solrconfig.xml
+++ b/client/ruby/solr-ruby/test/conf/solrconfig.xml
@ -36,8 +36,8 @@
  <indexDefaults>
   <!-- Values here affect all index writers and act as a default unless overridden. -->
    <useCompoundFile>false</useCompoundFile>
    <ramBufferSizeMB>32</ramBufferSizeMB>
    <mergeFactor>10</mergeFactor>
    <maxBufferedDocs>1000</maxBufferedDocs>
    <maxMergeDocs>2147483647</maxMergeDocs>
    <maxFieldLength>10000</maxFieldLength>
    <writeLockTimeout>1000</writeLockTimeout>
@ -47,8 +47,8 @@
  <mainIndex>
    <!-- options specific to the main on-disk lucene index -->
    <useCompoundFile>false</useCompoundFile>
    <ramBufferSizeMB>32</ramBufferSizeMB>
    <mergeFactor>10</mergeFactor>
    <maxBufferedDocs>1000</maxBufferedDocs>
    <maxMergeDocs>2147483647</maxMergeDocs>
    <maxFieldLength>10000</maxFieldLength>
--- a/example/solr/conf/solrconfig.xml
+++ b/example/solr/conf/solrconfig.xml
@ -35,12 +35,53 @@
  <indexDefaults>
   <!-- Values here affect all index writers and act as a default unless overridden. -->
    <useCompoundFile>false</useCompoundFile>
    <mergeFactor>10</mergeFactor>
-    <maxBufferedDocs>1000</maxBufferedDocs>
+    <!--
     If both ramBufferSizeMB and maxBufferedDocs is set, then Lucene will flush based on whichever limit is hit first.
     -->
    <!--<maxBufferedDocs>1000</maxBufferedDocs>-->
    <!-- Tell Lucene when to flush documents to disk.
    Giving Lucene more memory for indexing means faster indexing at the cost of more RAM
    If both ramBufferSizeMB and maxBufferedDocs is set, then Lucene will flush based on whichever limit is hit first.
    -->
    <ramBufferSizeMB>32</ramBufferSizeMB>
    <maxMergeDocs>2147483647</maxMergeDocs>
    <maxFieldLength>10000</maxFieldLength>
    <writeLockTimeout>1000</writeLockTimeout>
    <commitLockTimeout>10000</commitLockTimeout>
    <!--
     Expert: Turn on Lucene's auto commit capability.
     TODO: Add recommendations on why you would want to do this.
     NOTE: Despite the name, this value does not have any relation to Solr's autoCommit functionality
     -->
    <!--<luceneAutoCommit>false</luceneAutoCommit>-->
    <!--
     Expert:
     The Merge Policy in Lucene controls how merging is handled by Lucene.  The default in 2.3 is the LogByteSizeMergePolicy, previous
     versions used LogDocMergePolicy.
     LogByteSizeMergePolicy chooses segments to merge based on their size.  The Lucene 2.2 default, LogDocMergePolicy chose when
     to merge based on number of documents
     Other implementations of MergePolicy must have a no-argument constructor
     -->
    <!--<mergePolicy>org.apache.lucene.index.LogByteSizeMergePolicy</mergePolicy>-->
    <!--
     Expert:
     The Merge Scheduler in Lucene controls how merges are performed.  The ConcurrentMergeScheduler (Lucene 2.3 default)
      can perform merges in the background using separate threads.  The SerialMergeScheduler (Lucene 2.2 default) does not.
     -->
    <!--<mergeScheduler>org.apache.lucene.index.ConcurrentMergeScheduler</mergeScheduler>-->
    <!--
      As long as Solr is the only process modifying your index, it is
      safe to use Lucene's in process locking mechanism.  But you may
@ -60,8 +101,10 @@
  <mainIndex>
    <!-- options specific to the main on-disk lucene index -->
    <useCompoundFile>false</useCompoundFile>
    <ramBufferSizeMB>32</ramBufferSizeMB>
    <mergeFactor>10</mergeFactor>
-    <maxBufferedDocs>1000</maxBufferedDocs>
+    <!-- Deprecated -->
    <!--<maxBufferedDocs>1000</maxBufferedDocs>-->
    <maxMergeDocs>2147483647</maxMergeDocs>
    <maxFieldLength>10000</maxFieldLength>
--- a/src/java/org/apache/solr/core/Config.java
+++ b/src/java/org/apache/solr/core/Config.java
@ -241,6 +241,16 @@ public class Config {
    return val!=null ? Float.parseFloat(val) : def;
  }
  public double getDouble(String path){
     return Double.parseDouble(getVal(path, true));
   }
   public double getDouble(String path, double def) {
     String val = getVal(path, false);
     return val!=null ? Double.parseDouble(val) : def;
   }
  // The following functions were moved to ResourceLoader
  //-----------------------------------------------------------------------------
--- a/src/java/org/apache/solr/update/SolrIndexConfig.java
+++ b/src/java/org/apache/solr/update/SolrIndexConfig.java
@ -18,6 +18,8 @@
 package org.apache.solr.update;
 import org.apache.solr.core.SolrConfig;
 import org.apache.lucene.index.LogByteSizeMergePolicy;
 import org.apache.lucene.index.ConcurrentMergeScheduler;
 //
 // For performance reasons, we don't want to re-read
@ -30,26 +32,38 @@ import org.apache.solr.core.SolrConfig;
 public class SolrIndexConfig {
  public static final String defaultsName ="indexDefaults";
  static final SolrIndexConfig defaultDefaults = new SolrIndexConfig();
  public static final String DEFAULT_MERGE_POLICY_CLASSNAME = LogByteSizeMergePolicy.class.getName();
  public static final String DEFAULT_MERGE_SCHEDULER_CLASSNAME = ConcurrentMergeScheduler.class.getName();
  private SolrIndexConfig() {
    useCompoundFile = true;
    maxBufferedDocs = -1;
    maxMergeDocs = -1;
    mergeFactor = -1;
    ramBufferSizeMB = 16;
    maxFieldLength = -1;
    writeLockTimeout = -1;
    commitLockTimeout = -1;
    lockType = null;
    mergePolicyClassName = DEFAULT_MERGE_POLICY_CLASSNAME;
    mergeSchedulerClassname = DEFAULT_MERGE_SCHEDULER_CLASSNAME;
    luceneAutoCommit = false;
  }
  public final boolean useCompoundFile;
  public final int maxBufferedDocs;
  public final int maxMergeDocs;
  public final int mergeFactor;
  public final double ramBufferSizeMB;
  public final int maxFieldLength;
  public final int writeLockTimeout;
  public final int commitLockTimeout;
  public final String lockType;
  public final String mergePolicyClassName;
  public final String mergeSchedulerClassname;
  public final boolean luceneAutoCommit;
  public SolrIndexConfig(SolrConfig solrConfig, String prefix, SolrIndexConfig def)  {
    if (prefix == null)
@ -60,9 +74,14 @@ public class SolrIndexConfig {
    maxBufferedDocs=solrConfig.getInt(prefix+"/maxBufferedDocs",def.maxBufferedDocs);
    maxMergeDocs=solrConfig.getInt(prefix+"/maxMergeDocs",def.maxMergeDocs);
    mergeFactor=solrConfig.getInt(prefix+"/mergeFactor",def.mergeFactor);
    ramBufferSizeMB = solrConfig.getDouble(prefix+"ramBufferSizeMB", def.ramBufferSizeMB);
    maxFieldLength=solrConfig.getInt(prefix+"/maxFieldLength",def.maxFieldLength);
    writeLockTimeout=solrConfig.getInt(prefix+"/writeLockTimeout", def.writeLockTimeout);
    commitLockTimeout=solrConfig.getInt(prefix+"/commitLockTimeout", def.commitLockTimeout);
    lockType=solrConfig.get(prefix+"/lockType", def.lockType);
    mergePolicyClassName = solrConfig.get(prefix + "/mergePolicy", def.mergePolicyClassName);
    mergeSchedulerClassname = solrConfig.get(prefix + "/mergeScheduler", def.mergeSchedulerClassname);
    luceneAutoCommit = solrConfig.getBool(prefix + "/luceneAutoCommit", def.luceneAutoCommit);
  }
 }
--- a/src/java/org/apache/solr/update/SolrIndexWriter.java
+++ b/src/java/org/apache/solr/update/SolrIndexWriter.java
@ -18,6 +18,9 @@
 package org.apache.solr.update;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.MergePolicy;
 import org.apache.lucene.index.MergeScheduler;
 import org.apache.lucene.index.LogMergePolicy;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.NativeFSLockFactory;
@ -44,7 +47,7 @@ public class SolrIndexWriter extends IndexWriter {
  String name;
  IndexSchema schema;
-  private void init(String name, IndexSchema schema, SolrIndexConfig config) {
+  private void init(String name, IndexSchema schema, SolrIndexConfig config) throws IOException {
    log.fine("Opened Writer " + name);
    this.name = name;
    this.schema = schema;
@ -53,79 +56,98 @@ public class SolrIndexWriter extends IndexWriter {
    if (config != null) {
      setUseCompoundFile(config.useCompoundFile);
-      if (config.maxBufferedDocs != -1) setMaxBufferedDocs(config.maxBufferedDocs);
+      //only set maxBufferedDocs
      if (config.maxBufferedDocs != -1) {
        setMaxBufferedDocs(config.maxBufferedDocs);
      }
      if (config.ramBufferSizeMB != -1) {
        setRAMBufferSizeMB(config.ramBufferSizeMB);
      }
      if (config.maxMergeDocs != -1) setMaxMergeDocs(config.maxMergeDocs);
      if (config.mergeFactor != -1)  setMergeFactor(config.mergeFactor);
      if (config.maxFieldLength != -1) setMaxFieldLength(config.maxFieldLength);
      if (config.mergePolicyClassName != null && SolrIndexConfig.DEFAULT_MERGE_POLICY_CLASSNAME.equals(config.mergePolicyClassName) == false) {
        MergePolicy policy = (MergePolicy) schema.getSolrConfig().getResourceLoader().newInstance(config.mergePolicyClassName);
        setMergePolicy(policy);///hmm, is this really the best way to get a newInstance?
      }
      if (config.mergeFactor != -1 && getMergePolicy() instanceof LogMergePolicy) {
        setMergeFactor(config.mergeFactor);
      }
      if (config.mergeSchedulerClassname != null && SolrIndexConfig.DEFAULT_MERGE_SCHEDULER_CLASSNAME.equals(config.mergeSchedulerClassname) == false) {
        MergeScheduler scheduler = (MergeScheduler) schema.getSolrConfig().getResourceLoader().newInstance(config.mergeSchedulerClassname);
        setMergeScheduler(scheduler);
      }
      //if (config.commitLockTimeout != -1) setWriteLockTimeout(config.commitLockTimeout);
    }
  }
-  
+
  private static Directory getDirectory(String path, SolrIndexConfig config) throws IOException {
-	  Directory d = FSDirectory.getDirectory(path);
+    Directory d = FSDirectory.getDirectory(path);
    String rawLockType = (null == config) ? null : config.lockType;
    if (null == rawLockType) {
      // we default to "simple" for backwards compatiblitiy
-      log.warning("No lockType configured for "+path+" assuming 'simple'");
+      log.warning("No lockType configured for " + path + " assuming 'simple'");
      rawLockType = "simple";
    }
    final String lockType = rawLockType.toLowerCase().trim();
-    
+
-	  if ("simple".equals(lockType)) {
+    if ("simple".equals(lockType)) {
-		  d.setLockFactory(new SimpleFSLockFactory(path));
+      d.setLockFactory(new SimpleFSLockFactory(path));
-	  } else if("native".equals(lockType)) {
+    } else if ("native".equals(lockType)) {
-		  d.setLockFactory(new NativeFSLockFactory(path));
+      d.setLockFactory(new NativeFSLockFactory(path));
-	  } else if("single".equals(lockType)) {
+    } else if ("single".equals(lockType)) {
-		  d.setLockFactory(new SingleInstanceLockFactory());
+      d.setLockFactory(new SingleInstanceLockFactory());
-	  } else if("none".equals(lockType)) {
+    } else if ("none".equals(lockType)) {
-		  d.setLockFactory(new NoLockFactory());
+      d.setLockFactory(new NoLockFactory());
-	  } else {
+    } else {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
-                              "Unrecognized lockType: " + rawLockType);
+              "Unrecognized lockType: " + rawLockType);
-	  } 
+    }
-	  return d;
+    return d;
  }
  public SolrIndexWriter(String name, String path, boolean create, IndexSchema schema) throws IOException {
-    super(getDirectory(path, null), schema.getAnalyzer(), create);
+    super(getDirectory(path, null), false, schema.getAnalyzer(), create);
    init(name, schema, null);
  }
  public SolrIndexWriter(String name, String path, boolean create, IndexSchema schema, SolrIndexConfig config) throws IOException {
-    super(getDirectory(path, config), schema.getAnalyzer(), create);
+    super(getDirectory(path, config), config.luceneAutoCommit, schema.getAnalyzer(), create);
-    init(name, schema,config);
+    init(name, schema, config);
  }
-  /*** use DocumentBuilder now...
+  /**
-  private final void addField(Document doc, String name, String val) {
+   * use DocumentBuilder now...
-      SchemaField ftype = schema.getField(name);
+   * private final void addField(Document doc, String name, String val) {
-
+   * SchemaField ftype = schema.getField(name);
-      // we don't check for a null val ourselves because a solr.FieldType
+   * <p/>
-      // might actually want to map it to something.  If createField()
+   * // we don't check for a null val ourselves because a solr.FieldType
-      // returns null, then we don't store the field.
+   * // might actually want to map it to something.  If createField()
-
+   * // returns null, then we don't store the field.
-      Field field = ftype.createField(val, boost);
+   * <p/>
-      if (field != null) doc.add(field);
+   * Field field = ftype.createField(val, boost);
-  }
+   * if (field != null) doc.add(field);
-
+   * }
-
+   * <p/>
-  public void addRecord(String[] fieldNames, String[] fieldValues) throws IOException {
+   * <p/>
-    Document doc = new Document();
+   * public void addRecord(String[] fieldNames, String[] fieldValues) throws IOException {
-    for (int i=0; i<fieldNames.length; i++) {
+   * Document doc = new Document();
-      String name = fieldNames[i];
+   * for (int i=0; i<fieldNames.length; i++) {
-      String val = fieldNames[i];
+   * String name = fieldNames[i];
-
+   * String val = fieldNames[i];
-      // first null is end of list.  client can reuse arrays if they want
+   * <p/>
-      // and just write a single null if there is unused space.
+   * // first null is end of list.  client can reuse arrays if they want
-      if (name==null) break;
+   * // and just write a single null if there is unused space.
-
+   * if (name==null) break;
-      addField(doc,name,val);
+   * <p/>
-    }
+   * addField(doc,name,val);
-    addDocument(doc);
+   * }
-  }
+   * addDocument(doc);
-  ******/
+   * }
   * ****
   */
  public void close() throws IOException {
    log.fine("Closing Writer " + name);
@ -134,7 +156,10 @@ public class SolrIndexWriter extends IndexWriter {
  @Override
  protected void finalize() {
-    try {super.close();} catch (IOException e) {}
+    try {
      super.close();
    } catch (IOException e) {
    }
  }
 }
--- a/src/java/org/apache/solr/update/UpdateHandler.java
+++ b/src/java/org/apache/solr/update/UpdateHandler.java
@ -118,8 +118,7 @@ public abstract class UpdateHandler implements SolrInfoMBean {
  }
  protected SolrIndexWriter createMainIndexWriter(String name, boolean removeAllExisting) throws IOException {
-    SolrIndexWriter writer = new SolrIndexWriter(name,core.getIndexDir(), removeAllExisting, schema, core.getSolrConfig().mainIndexConfig);
+    return new SolrIndexWriter(name,core.getIndexDir(), removeAllExisting, schema, core.getSolrConfig().mainIndexConfig);
    return writer;
  }
  protected final Term idTerm(String readableId) {
--- a/src/test/org/apache/solr/core/TestConfig.java
+++ b/src/test/org/apache/solr/core/TestConfig.java
@ -18,6 +18,7 @@
 package org.apache.solr.core;
 import org.apache.solr.util.AbstractSolrTestCase;
 import org.apache.solr.update.SolrIndexConfig;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
@ -50,4 +51,15 @@ public class TestConfig extends AbstractSolrTestCase {
    Node node = solrConfig.getNode("propTest", true);
    assertEquals("prefix-proptwo-suffix", node.getTextContent());
  }
  public void testLucene23Upgrades() throws Exception {
    double bufferSize = solrConfig.getDouble("indexDefaults/ramBufferSizeMB");
    assertTrue(bufferSize + " does not equal: " + 32, bufferSize == 32);
    String mergePolicy = solrConfig.get("indexDefaults/mergePolicy");
    assertTrue(mergePolicy + " is not equal to " + SolrIndexConfig.DEFAULT_MERGE_POLICY_CLASSNAME, mergePolicy.equals(SolrIndexConfig.DEFAULT_MERGE_POLICY_CLASSNAME) == true);
    String mergeSched = solrConfig.get("indexDefaults/mergeScheduler");
    assertTrue(mergeSched + " is not equal to " + SolrIndexConfig.DEFAULT_MERGE_SCHEDULER_CLASSNAME, mergeSched.equals(SolrIndexConfig.DEFAULT_MERGE_SCHEDULER_CLASSNAME) == true);
    boolean luceneAutoCommit = solrConfig.getBool("indexDefaults/luceneAutoCommit");
    assertTrue(luceneAutoCommit + " does not equal: " + false, luceneAutoCommit == false);
  }
 }
--- a/src/test/test-files/solr/conf/solrconfig.xml
+++ b/src/test/test-files/solr/conf/solrconfig.xml
@ -32,12 +32,50 @@
  <indexDefaults>
   <!-- Values here affect all index writers and act as a default
   unless overridden. -->
    <!-- Values here affect all index writers and act as a default unless overridden. -->
    <useCompoundFile>false</useCompoundFile>
    <mergeFactor>10</mergeFactor>
-    <maxBufferedDocs>1000</maxBufferedDocs>
+    <!-- If both ramBufferSizeMB and maxBufferedDocs is set, then Lucene will flush based on whichever limit is hit first.
     -->
    <!--<maxBufferedDocs>1000</maxBufferedDocs>-->
    <!-- Tell Lucene when to flush documents to disk.
    Giving Lucene more memory for indexing means faster indexing at the cost of more RAM
    If both ramBufferSizeMB and maxBufferedDocs is set, then Lucene will flush based on whichever limit is hit first.
    -->
    <ramBufferSizeMB>32</ramBufferSizeMB>
    <maxMergeDocs>2147483647</maxMergeDocs>
    <maxFieldLength>10000</maxFieldLength>
    <writeLockTimeout>1000</writeLockTimeout>
    <commitLockTimeout>10000</commitLockTimeout>
    <!-- 
     Expert: Turn on Lucene's auto commit capability.
     NOTE: Despite the name, this value does not have any relation to Solr's autoCommit functionality
     -->
    <luceneAutoCommit>false</luceneAutoCommit>
    <!--
     Expert:
     The Merge Policy in Lucene controls how merging is handled by Lucene.  The default in 2.3 is the LogByteSizeMergePolicy, previous
     versions used LogDocMergePolicy.
     LogByteSizeMergePolicy chooses segments to merge based on their size.  The Lucene 2.2 default, LogDocMergePolicy chose when
     to merge based on number of documents
     Other implementations of MergePolicy must have a no-argument constructor
     -->
    <mergePolicy>org.apache.lucene.index.LogByteSizeMergePolicy</mergePolicy>
    <!--
     Expert:
     The Merge Scheduler in Lucene controls how merges are performed.  The ConcurrentMergeScheduler (Lucene 2.3 default)
      can perform merges in the background using separate threads.  The SerialMergeScheduler (Lucene 2.2 default) does not.
     -->
    <mergeScheduler>org.apache.lucene.index.ConcurrentMergeScheduler</mergeScheduler>
    <!-- these are global... can't currently override per index -->
    <writeLockTimeout>1000</writeLockTimeout>
    <commitLockTimeout>10000</commitLockTimeout>
@ -49,7 +87,7 @@
    <!-- lucene options specific to the main on-disk lucene index -->
    <useCompoundFile>false</useCompoundFile>
    <mergeFactor>10</mergeFactor>
-    <maxBufferedDocs>1000</maxBufferedDocs>
+    <ramBufferSizeMB>32</ramBufferSizeMB>
    <maxMergeDocs>2147483647</maxMergeDocs>
    <maxFieldLength>10000</maxFieldLength>