LUCENE-2588: trim unnecessary suffixes from terms in the terms dict index

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@983081 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2010-08-06 18:13:17 +00:00
parent 935ec57dba
commit a0232fe6b7
4 changed files with 33 additions and 9 deletions

View File

@ -209,6 +209,11 @@ Optimizations
efficient copying by sub-classes. Optimized copy is implemented for RAM and FS
streams. (Shai Erera)
* LUCENE-2588: Don't store unecessary suffixes when writing the terms
index, saving RAM in IndexReader; change default terms index
interval from 128 to 32, because the terms index now requires much
less RAM. (Robert Muir, Mike McCandless)
Documentation
* LUCENE-2579: Fix oal.search's package.html description of abstract

View File

@ -55,7 +55,7 @@ public final class IndexWriterConfig implements Cloneable {
public static enum OpenMode { CREATE, APPEND, CREATE_OR_APPEND }
/** Default value is 128. Change using {@link #setTermIndexInterval(int)}. */
public static final int DEFAULT_TERM_INDEX_INTERVAL = 128;
public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;
/** Denotes a flush trigger is disabled. */
public final static int DISABLE_AUTO_FLUSH = -1;

View File

@ -80,7 +80,7 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
final long termsStart;
long packedIndexStart;
long packedOffsetsStart;
private int numTerms;
private long numTerms;
// TODO: we could conceivably make a PackedInts wrapper
// that auto-grows... then we wouldn't force 6 bytes RAM
@ -90,6 +90,8 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
private long lastTermsPointer;
private long totTermLength;
private final BytesRef lastTerm = new BytesRef();
SimpleFieldWriter(FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
indexStart = out.getFilePointer();
@ -103,8 +105,20 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
// First term is first indexed term:
if (0 == (numTerms++ % termIndexInterval)) {
// write full bytes
out.writeBytes(text.bytes, text.offset, text.length);
// we can safely strip off the non-distinguishing
// suffix to save RAM in the loaded terms index.
final int limit = Math.min(lastTerm.length, text.length);
int minPrefixDiff = 1+lastTerm.length;
for(int byteIdx=0;byteIdx<limit;byteIdx++) {
if (lastTerm.bytes[lastTerm.offset+byteIdx] != text.bytes[text.offset+byteIdx]) {
minPrefixDiff = byteIdx+1;
break;
}
}
// write only the min prefix that shows the diff
// against prior term
out.writeBytes(text.bytes, text.offset, minPrefixDiff);
if (termLengths.length == numIndexTerms) {
termLengths = ArrayUtil.grow(termLengths);
@ -119,14 +133,19 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
lastTermsPointer = fp;
// save term length (in bytes)
assert text.length <= Short.MAX_VALUE;
termLengths[numIndexTerms] = (short) text.length;
totTermLength += text.length;
assert minPrefixDiff <= Short.MAX_VALUE;
termLengths[numIndexTerms] = (short) minPrefixDiff;
totTermLength += minPrefixDiff;
lastTerm.copy(text);
numIndexTerms++;
return true;
} else {
if (0 == numTerms % termIndexInterval) {
// save last term just before next index term so we
// can compute wasted suffix
lastTerm.copy(text);
}
return false;
}
}

View File

@ -134,7 +134,7 @@ public class TestIndexWriterConfig extends LuceneTestCaseJ4 {
public void testConstants() throws Exception {
// Tests that the values of the constants does not change
assertEquals(1000, IndexWriterConfig.WRITE_LOCK_TIMEOUT);
assertEquals(128, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL);
assertEquals(32, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL);
assertEquals(Integer.MAX_VALUE, IndexWriterConfig.UNLIMITED_FIELD_LENGTH);
assertEquals(-1, IndexWriterConfig.DISABLE_AUTO_FLUSH);
assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, IndexWriterConfig.DEFAULT_MAX_BUFFERED_DELETE_TERMS);