diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 712e0275031..10ff754b55b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -209,6 +209,11 @@ Optimizations efficient copying by sub-classes. Optimized copy is implemented for RAM and FS streams. (Shai Erera) +* LUCENE-2588: Don't store unecessary suffixes when writing the terms + index, saving RAM in IndexReader; change default terms index + interval from 128 to 32, because the terms index now requires much + less RAM. (Robert Muir, Mike McCandless) + Documentation * LUCENE-2579: Fix oal.search's package.html description of abstract diff --git a/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java index bc12835ca94..b805b7a9930 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java +++ b/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java @@ -55,7 +55,7 @@ public final class IndexWriterConfig implements Cloneable { public static enum OpenMode { CREATE, APPEND, CREATE_OR_APPEND } /** Default value is 128. Change using {@link #setTermIndexInterval(int)}. */ - public static final int DEFAULT_TERM_INDEX_INTERVAL = 128; + public static final int DEFAULT_TERM_INDEX_INTERVAL = 32; /** Denotes a flush trigger is disabled. */ public final static int DISABLE_AUTO_FLUSH = -1; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java index df7fd85c5c2..58f5d4b29ba 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java @@ -80,7 +80,7 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter { final long termsStart; long packedIndexStart; long packedOffsetsStart; - private int numTerms; + private long numTerms; // TODO: we could conceivably make a PackedInts wrapper // that auto-grows... then we wouldn't force 6 bytes RAM @@ -90,6 +90,8 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter { private long lastTermsPointer; private long totTermLength; + private final BytesRef lastTerm = new BytesRef(); + SimpleFieldWriter(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; indexStart = out.getFilePointer(); @@ -103,8 +105,20 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter { // First term is first indexed term: if (0 == (numTerms++ % termIndexInterval)) { - // write full bytes - out.writeBytes(text.bytes, text.offset, text.length); + // we can safely strip off the non-distinguishing + // suffix to save RAM in the loaded terms index. + final int limit = Math.min(lastTerm.length, text.length); + int minPrefixDiff = 1+lastTerm.length; + for(int byteIdx=0;byteIdx