LUCENE-2588: Exposed indexed term prefix length to enable non-unicode sort order term indexes

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@998675 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2010-09-19 14:35:16 +00:00
parent 7fe5f4bef7
commit 68776ee5d6
2 changed files with 32 additions and 15 deletions

View File

@ -76,6 +76,24 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
return writer; return writer;
} }
/**
 * Returns the number of leading bytes of {@code indexedTerm} to store in the
 * terms index.
 *
 * <p>The default implementation keeps only the shortest prefix of
 * {@code indexedTerm} that differs from {@code priorTerm}: the shared leading
 * bytes plus one more byte, capped at the length of {@code indexedTerm}.
 * Dropping the non-distinguishing suffix saves RAM in the loaded terms index.
 *
 * <p>NOTE: this is only safe as long as the codec sorts terms in unicode code
 * point order. If your codec does not, you must override this method to simply
 * return {@code indexedTerm.length}.
 *
 * @param priorTerm the term that precedes {@code indexedTerm}
 * @param indexedTerm the term about to be added to the terms index
 * @return the length, in bytes, of the prefix of {@code indexedTerm} to write
 */
protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) {
  final int sharedLimit = Math.min(priorTerm.length, indexedTerm.length);

  // Count how many leading bytes the two terms have in common.
  int common = 0;
  while (common < sharedLimit
      && priorTerm.bytes[priorTerm.offset + common] == indexedTerm.bytes[indexedTerm.offset + common]) {
    common++;
  }

  if (common < sharedLimit) {
    // Found a differing byte: keep everything up to and including it.
    return common + 1;
  }

  // One term is a prefix of the other: keep one byte beyond the prior term,
  // but never more bytes than the indexed term actually has.
  return Math.min(1 + priorTerm.length, indexedTerm.length);
}
private class SimpleFieldWriter extends FieldWriter { private class SimpleFieldWriter extends FieldWriter {
final FieldInfo fieldInfo; final FieldInfo fieldInfo;
int numIndexTerms; int numIndexTerms;
@ -108,20 +126,11 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
// First term is first indexed term: // First term is first indexed term:
if (0 == (numTerms++ % termIndexInterval)) { if (0 == (numTerms++ % termIndexInterval)) {
// we can safely strip off the non-distinguishing final int indexedTermLength = indexedTermPrefixLength(lastTerm, text);
// suffix to save RAM in the loaded terms index.
final int limit = Math.min(lastTerm.length, text.length);
int minPrefixDiff = Math.min(1+lastTerm.length, text.length);
for(int byteIdx=0;byteIdx<limit;byteIdx++) {
if (lastTerm.bytes[lastTerm.offset+byteIdx] != text.bytes[text.offset+byteIdx]) {
minPrefixDiff = byteIdx+1;
break;
}
}
// write only the min prefix that shows the diff // write only the min prefix that shows the diff
// against prior term // against prior term
out.writeBytes(text.bytes, text.offset, minPrefixDiff); out.writeBytes(text.bytes, text.offset, indexedTermLength);
if (termLengths.length == numIndexTerms) { if (termLengths.length == numIndexTerms) {
termLengths = ArrayUtil.grow(termLengths); termLengths = ArrayUtil.grow(termLengths);
@ -136,9 +145,9 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
lastTermsPointer = fp; lastTermsPointer = fp;
// save term length (in bytes) // save term length (in bytes)
assert minPrefixDiff <= Short.MAX_VALUE; assert indexedTermLength <= Short.MAX_VALUE;
termLengths[numIndexTerms] = (short) minPrefixDiff; termLengths[numIndexTerms] = (short) indexedTermLength;
totTermLength += minPrefixDiff; totTermLength += indexedTermLength;
lastTerm.copy(text); lastTerm.copy(text);
numIndexTerms++; numIndexTerms++;

View File

@ -508,7 +508,15 @@ public class TestExternalCodecs extends LuceneTestCase {
TermsIndexWriterBase indexWriter; TermsIndexWriterBase indexWriter;
boolean success = false; boolean success = false;
try { try {
indexWriter = new FixedGapTermsIndexWriter(state); indexWriter = new FixedGapTermsIndexWriter(state) {
// We sort in reverse unicode order, so we must disable the
// suffix-stripping optimization that FixedGapTermsIndexWriter
// applies by default: returning the full term length makes the
// writer keep every byte of each indexed term.
@Override
protected int indexedTermPrefixLength(BytesRef priorTerm, BytesRef indexedTerm) {
return indexedTerm.length;
}
};
success = true; success = true;
} finally { } finally {
if (!success) { if (!success) {