LUCENE-2588: trim unnecessary suffixes from terms in the terms dict index

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@983081 13f79535-47bb-0310-9956-ffa450edef68
2010-08-06 18:13:17 +00:00 · 2010-08-06 18:13:17 +00:00 · a0232fe6b7
parent 935ec57dba
commit a0232fe6b7
4 changed files with 33 additions and 9 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -209,6 +209,11 @@ Optimizations
  efficient copying by sub-classes. Optimized copy is implemented for RAM and FS
  streams. (Shai Erera)

+* LUCENE-2588: Don't store unecessary suffixes when writing the terms
+  index, saving RAM in IndexReader; change default terms index
+  interval from 128 to 32, because the terms index now requires much
+  less RAM.  (Robert Muir, Mike McCandless)
+
 Documentation

 * LUCENE-2579: Fix oal.search's package.html description of abstract
--- a/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
+++ b/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
@ -55,7 +55,7 @@ public final class IndexWriterConfig implements Cloneable {
  public static enum OpenMode { CREATE, APPEND, CREATE_OR_APPEND }
  
  /** Default value is 128. Change using {@link #setTermIndexInterval(int)}. */
-  public static final int DEFAULT_TERM_INDEX_INTERVAL = 128;
+  public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;

  /** Denotes a flush trigger is disabled. */
  public final static int DISABLE_AUTO_FLUSH = -1;
--- a/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java
@ -80,7 +80,7 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
    final long termsStart;
    long packedIndexStart;
    long packedOffsetsStart;
-    private int numTerms;
+    private long numTerms;

    // TODO: we could conceivably make a PackedInts wrapper
    // that auto-grows... then we wouldn't force 6 bytes RAM
@ -90,6 +90,8 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
    private long lastTermsPointer;
    private long totTermLength;

+    private final BytesRef lastTerm = new BytesRef();
+
    SimpleFieldWriter(FieldInfo fieldInfo) {
      this.fieldInfo = fieldInfo;
      indexStart = out.getFilePointer();
@ -103,8 +105,20 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
      // First term is first indexed term:
      if (0 == (numTerms++ % termIndexInterval)) {

-        // write full bytes
-        out.writeBytes(text.bytes, text.offset, text.length);
+        // we can safely strip off the non-distinguishing
+        // suffix to save RAM in the loaded terms index.
+        final int limit = Math.min(lastTerm.length, text.length);
+        int minPrefixDiff = 1+lastTerm.length;
+        for(int byteIdx=0;byteIdx<limit;byteIdx++) {
+          if (lastTerm.bytes[lastTerm.offset+byteIdx] != text.bytes[text.offset+byteIdx]) {
+            minPrefixDiff = byteIdx+1;
+            break;
+          }
+        }
+
+        // write only the min prefix that shows the diff
+        // against prior term
+        out.writeBytes(text.bytes, text.offset, minPrefixDiff);

        if (termLengths.length == numIndexTerms) {
          termLengths = ArrayUtil.grow(termLengths);
@ -119,14 +133,19 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
        lastTermsPointer = fp;

        // save term length (in bytes)
-        assert text.length <= Short.MAX_VALUE;
-        termLengths[numIndexTerms] = (short) text.length;
-
-        totTermLength += text.length;
+        assert minPrefixDiff <= Short.MAX_VALUE;
+        termLengths[numIndexTerms] = (short) minPrefixDiff;
+        totTermLength += minPrefixDiff;

+        lastTerm.copy(text);
        numIndexTerms++;
        return true;
      } else {
+        if (0 == numTerms % termIndexInterval) {
+          // save last term just before next index term so we
+          // can compute wasted suffix
+          lastTerm.copy(text);
+        }
        return false;
      }
    }
--- a/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java
+++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java
@ -134,7 +134,7 @@ public class TestIndexWriterConfig extends LuceneTestCaseJ4 {
  public void testConstants() throws Exception {
    // Tests that the values of the constants does not change
    assertEquals(1000, IndexWriterConfig.WRITE_LOCK_TIMEOUT);
-    assertEquals(128, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL);
+    assertEquals(32, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL);
    assertEquals(Integer.MAX_VALUE, IndexWriterConfig.UNLIMITED_FIELD_LENGTH);
    assertEquals(-1, IndexWriterConfig.DISABLE_AUTO_FLUSH);
    assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, IndexWriterConfig.DEFAULT_MAX_BUFFERED_DELETE_TERMS);