mirror of https://github.com/apache/lucene.git
LUCENE-2588: Exposed indexed term prefix length to enable none-unicode sort order term indexes
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@998675 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7fe5f4bef7
commit
68776ee5d6
|
@ -76,6 +76,24 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
return writer;
|
||||
}
|
||||
|
||||
/** NOTE: if your codec does not sort in unicode code
|
||||
* point order, you must override this method, to simply
|
||||
* return indexedTerm.length. */
|
||||
protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) {
|
||||
// As long as codec sorts terms in unicode codepoint
|
||||
// order, we can safely strip off the non-distinguishing
|
||||
// suffix to save RAM in the loaded terms index.
|
||||
final int idxTermOffset = indexedTerm.offset;
|
||||
final int priorTermOffset = priorTerm.offset;
|
||||
final int limit = Math.min(priorTerm.length, indexedTerm.length);
|
||||
for(int byteIdx=0;byteIdx<limit;byteIdx++) {
|
||||
if (priorTerm.bytes[priorTermOffset+byteIdx] != indexedTerm.bytes[idxTermOffset+byteIdx]) {
|
||||
return byteIdx+1;
|
||||
}
|
||||
}
|
||||
return Math.min(1+priorTerm.length, indexedTerm.length);
|
||||
}
|
||||
|
||||
private class SimpleFieldWriter extends FieldWriter {
|
||||
final FieldInfo fieldInfo;
|
||||
int numIndexTerms;
|
||||
|
@ -108,20 +126,11 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
// First term is first indexed term:
|
||||
if (0 == (numTerms++ % termIndexInterval)) {
|
||||
|
||||
// we can safely strip off the non-distinguishing
|
||||
// suffix to save RAM in the loaded terms index.
|
||||
final int limit = Math.min(lastTerm.length, text.length);
|
||||
int minPrefixDiff = Math.min(1+lastTerm.length, text.length);
|
||||
for(int byteIdx=0;byteIdx<limit;byteIdx++) {
|
||||
if (lastTerm.bytes[lastTerm.offset+byteIdx] != text.bytes[text.offset+byteIdx]) {
|
||||
minPrefixDiff = byteIdx+1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
final int indexedTermLength = indexedTermPrefixLength(lastTerm, text);
|
||||
|
||||
// write only the min prefix that shows the diff
|
||||
// against prior term
|
||||
out.writeBytes(text.bytes, text.offset, minPrefixDiff);
|
||||
out.writeBytes(text.bytes, text.offset, indexedTermLength);
|
||||
|
||||
if (termLengths.length == numIndexTerms) {
|
||||
termLengths = ArrayUtil.grow(termLengths);
|
||||
|
@ -136,9 +145,9 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
lastTermsPointer = fp;
|
||||
|
||||
// save term length (in bytes)
|
||||
assert minPrefixDiff <= Short.MAX_VALUE;
|
||||
termLengths[numIndexTerms] = (short) minPrefixDiff;
|
||||
totTermLength += minPrefixDiff;
|
||||
assert indexedTermLength <= Short.MAX_VALUE;
|
||||
termLengths[numIndexTerms] = (short) indexedTermLength;
|
||||
totTermLength += indexedTermLength;
|
||||
|
||||
lastTerm.copy(text);
|
||||
numIndexTerms++;
|
||||
|
|
|
@ -508,7 +508,15 @@ public class TestExternalCodecs extends LuceneTestCase {
|
|||
TermsIndexWriterBase indexWriter;
|
||||
boolean success = false;
|
||||
try {
|
||||
indexWriter = new FixedGapTermsIndexWriter(state);
|
||||
indexWriter = new FixedGapTermsIndexWriter(state) {
|
||||
// We sort in reverse unicode order, so, we must
|
||||
// disable the suffix-stripping opto that
|
||||
// FixedGapTermsIndexWriter does by default!
|
||||
@Override
|
||||
protected int indexedTermPrefixLength(BytesRef priorTerm, BytesRef indexedTerm) {
|
||||
return indexedTerm.length;
|
||||
}
|
||||
};
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
|
|
Loading…
Reference in New Issue