mirror of https://github.com/apache/lucene.git
LUCENE-2588: trim unnecessary suffixes from terms in the terms dict index
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@983081 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
935ec57dba
commit
a0232fe6b7
|
@ -209,6 +209,11 @@ Optimizations
|
|||
efficient copying by sub-classes. Optimized copy is implemented for RAM and FS
|
||||
streams. (Shai Erera)
|
||||
|
||||
* LUCENE-2588: Don't store unecessary suffixes when writing the terms
|
||||
index, saving RAM in IndexReader; change default terms index
|
||||
interval from 128 to 32, because the terms index now requires much
|
||||
less RAM. (Robert Muir, Mike McCandless)
|
||||
|
||||
Documentation
|
||||
|
||||
* LUCENE-2579: Fix oal.search's package.html description of abstract
|
||||
|
|
|
@ -55,7 +55,7 @@ public final class IndexWriterConfig implements Cloneable {
|
|||
public static enum OpenMode { CREATE, APPEND, CREATE_OR_APPEND }
|
||||
|
||||
/** Default value is 128. Change using {@link #setTermIndexInterval(int)}. */
|
||||
public static final int DEFAULT_TERM_INDEX_INTERVAL = 128;
|
||||
public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;
|
||||
|
||||
/** Denotes a flush trigger is disabled. */
|
||||
public final static int DISABLE_AUTO_FLUSH = -1;
|
||||
|
|
|
@ -80,7 +80,7 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
|
|||
final long termsStart;
|
||||
long packedIndexStart;
|
||||
long packedOffsetsStart;
|
||||
private int numTerms;
|
||||
private long numTerms;
|
||||
|
||||
// TODO: we could conceivably make a PackedInts wrapper
|
||||
// that auto-grows... then we wouldn't force 6 bytes RAM
|
||||
|
@ -90,6 +90,8 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
|
|||
private long lastTermsPointer;
|
||||
private long totTermLength;
|
||||
|
||||
private final BytesRef lastTerm = new BytesRef();
|
||||
|
||||
SimpleFieldWriter(FieldInfo fieldInfo) {
|
||||
this.fieldInfo = fieldInfo;
|
||||
indexStart = out.getFilePointer();
|
||||
|
@ -103,8 +105,20 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
|
|||
// First term is first indexed term:
|
||||
if (0 == (numTerms++ % termIndexInterval)) {
|
||||
|
||||
// write full bytes
|
||||
out.writeBytes(text.bytes, text.offset, text.length);
|
||||
// we can safely strip off the non-distinguishing
|
||||
// suffix to save RAM in the loaded terms index.
|
||||
final int limit = Math.min(lastTerm.length, text.length);
|
||||
int minPrefixDiff = 1+lastTerm.length;
|
||||
for(int byteIdx=0;byteIdx<limit;byteIdx++) {
|
||||
if (lastTerm.bytes[lastTerm.offset+byteIdx] != text.bytes[text.offset+byteIdx]) {
|
||||
minPrefixDiff = byteIdx+1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// write only the min prefix that shows the diff
|
||||
// against prior term
|
||||
out.writeBytes(text.bytes, text.offset, minPrefixDiff);
|
||||
|
||||
if (termLengths.length == numIndexTerms) {
|
||||
termLengths = ArrayUtil.grow(termLengths);
|
||||
|
@ -119,14 +133,19 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
|
|||
lastTermsPointer = fp;
|
||||
|
||||
// save term length (in bytes)
|
||||
assert text.length <= Short.MAX_VALUE;
|
||||
termLengths[numIndexTerms] = (short) text.length;
|
||||
|
||||
totTermLength += text.length;
|
||||
assert minPrefixDiff <= Short.MAX_VALUE;
|
||||
termLengths[numIndexTerms] = (short) minPrefixDiff;
|
||||
totTermLength += minPrefixDiff;
|
||||
|
||||
lastTerm.copy(text);
|
||||
numIndexTerms++;
|
||||
return true;
|
||||
} else {
|
||||
if (0 == numTerms % termIndexInterval) {
|
||||
// save last term just before next index term so we
|
||||
// can compute wasted suffix
|
||||
lastTerm.copy(text);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -134,7 +134,7 @@ public class TestIndexWriterConfig extends LuceneTestCaseJ4 {
|
|||
public void testConstants() throws Exception {
|
||||
// Tests that the values of the constants does not change
|
||||
assertEquals(1000, IndexWriterConfig.WRITE_LOCK_TIMEOUT);
|
||||
assertEquals(128, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL);
|
||||
assertEquals(32, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL);
|
||||
assertEquals(Integer.MAX_VALUE, IndexWriterConfig.UNLIMITED_FIELD_LENGTH);
|
||||
assertEquals(-1, IndexWriterConfig.DISABLE_AUTO_FLUSH);
|
||||
assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, IndexWriterConfig.DEFAULT_MAX_BUFFERED_DELETE_TERMS);
|
||||
|
|
Loading…
Reference in New Issue