Use Arrays.compareUnsigned instead of a hand-written byte loop to compare term suffixes.

This commit is contained in:
zhouhui 2024-09-18 14:36:49 +08:00
parent 4d3d219146
commit 2cbf5ff761
4 changed files with 209 additions and 310 deletions

View File

@ -184,7 +184,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
// " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + // " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" +
// f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + // f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" +
// term.length + " vs prefix=" + f.prefix); // term.length + " vs prefix=" + f.prefix);
if (f.prefix > targetBeforeCurrentLength) { if (f.prefixLength > targetBeforeCurrentLength) {
// System.out.println(" do rewind!"); // System.out.println(" do rewind!");
f.rewind(); f.rewind();
} else { } else {
@ -192,11 +192,11 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
// System.out.println(" skip rewind!"); // System.out.println(" skip rewind!");
// } // }
} }
assert length == f.prefix; assert length == f.prefixLength;
assert termOrd == f.termOrdOrig; assert termOrd == f.termOrdOrig;
} else { } else {
f.nextEnt = -1; f.nextEnt = -1;
f.prefix = length; f.prefixLength = length;
f.state.termBlockOrd = 0; f.state.termBlockOrd = 0;
f.termOrdOrig = termOrd; f.termOrdOrig = termOrd;
// System.out.println("set termOrdOrig=" + termOrd); // System.out.println("set termOrdOrig=" + termOrd);
@ -412,7 +412,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
// toHex(targetLabel)); // toHex(targetLabel));
// } // }
validIndexPrefix = currentFrame.prefix; validIndexPrefix = currentFrame.prefixLength;
// validIndexPrefix = targetUpto; // validIndexPrefix = targetUpto;
currentFrame.scanToFloorFrame(target); currentFrame.scanToFloorFrame(target);
@ -472,7 +472,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
} }
// validIndexPrefix = targetUpto; // validIndexPrefix = targetUpto;
validIndexPrefix = currentFrame.prefix; validIndexPrefix = currentFrame.prefixLength;
currentFrame.scanToFloorFrame(target); currentFrame.scanToFloorFrame(target);
@ -686,7 +686,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
// toHex(targetLabel)); // toHex(targetLabel));
// } // }
validIndexPrefix = currentFrame.prefix; validIndexPrefix = currentFrame.prefixLength;
// validIndexPrefix = targetUpto; // validIndexPrefix = targetUpto;
currentFrame.scanToFloorFrame(target); currentFrame.scanToFloorFrame(target);
@ -747,7 +747,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
} }
// validIndexPrefix = targetUpto; // validIndexPrefix = targetUpto;
validIndexPrefix = currentFrame.prefix; validIndexPrefix = currentFrame.prefixLength;
currentFrame.scanToFloorFrame(target); currentFrame.scanToFloorFrame(target);
@ -785,7 +785,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
while (true) { while (true) {
OrdsSegmentTermsEnumFrame f = getFrame(ord); OrdsSegmentTermsEnumFrame f = getFrame(ord);
assert f != null; assert f != null;
final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix); final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefixLength);
if (f.nextEnt == -1) { if (f.nextEnt == -1) {
out.println( out.println(
" frame " " frame "
@ -796,7 +796,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
+ f.fp + f.fp
+ (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "")
+ " prefixLen=" + " prefixLen="
+ f.prefix + f.prefixLength
+ " prefix=" + " prefix="
+ ToStringUtils.bytesRefToString(prefix) + ToStringUtils.bytesRefToString(prefix)
+ (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")"))
@ -826,7 +826,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
+ f.fp + f.fp
+ (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "")
+ " prefixLen=" + " prefixLen="
+ f.prefix + f.prefixLength
+ " prefix=" + " prefix="
+ ToStringUtils.bytesRefToString(prefix) + ToStringUtils.bytesRefToString(prefix)
+ " nextEnt=" + " nextEnt="
@ -853,12 +853,14 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
} }
if (fr.index != null) { if (fr.index != null) {
assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc; assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
if (f.prefix > 0 && isSeekFrame && f.arc.label() != (term.byteAt(f.prefix - 1) & 0xFF)) { if (f.prefixLength > 0
&& isSeekFrame
&& f.arc.label() != (term.byteAt(f.prefixLength - 1) & 0xFF)) {
out.println( out.println(
" broken seek state: arc.label=" " broken seek state: arc.label="
+ (char) f.arc.label() + (char) f.arc.label()
+ " vs term byte=" + " vs term byte="
+ (char) (term.byteAt(f.prefix - 1) & 0xFF)); + (char) (term.byteAt(f.prefixLength - 1) & 0xFF));
throw new RuntimeException("seek state is broken"); throw new RuntimeException("seek state is broken");
} }
Output output = Util.get(fr.index, prefix); Output output = Util.get(fr.index, prefix);
@ -887,7 +889,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
if (f == currentFrame) { if (f == currentFrame) {
break; break;
} }
if (f.prefix == validIndexPrefix) { if (f.prefixLength == validIndexPrefix) {
isSeekFrame = false; isSeekFrame = false;
} }
ord++; ord++;
@ -969,7 +971,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
// Note that the seek state (last seek) has been // Note that the seek state (last seek) has been
// invalidated beyond this depth // invalidated beyond this depth
validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix); validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefixLength);
// if (DEBUG) { // if (DEBUG) {
// System.out.println(" reset validIndexPrefix=" + validIndexPrefix); // System.out.println(" reset validIndexPrefix=" + validIndexPrefix);
// } // }

View File

@ -17,6 +17,7 @@
package org.apache.lucene.codecs.blocktreeords; package org.apache.lucene.codecs.blocktreeords;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output; import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexOptions;
@ -54,7 +55,7 @@ final class OrdsSegmentTermsEnumFrame {
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
// Length of prefix shared by all terms in this block // Length of prefix shared by all terms in this block
int prefix; int prefixLength;
// Number of entries (term or sub-block) in this block // Number of entries (term or sub-block) in this block
int entCount; int entCount;
@ -295,11 +296,11 @@ final class OrdsSegmentTermsEnumFrame {
: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp + " termOrd=" + termOrd; : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp + " termOrd=" + termOrd;
nextEnt++; nextEnt++;
termOrd++; termOrd++;
suffix = suffixesReader.readVInt(); suffixLength = suffixesReader.readVInt();
startBytePos = suffixesReader.getPosition(); startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix); ste.term.setLength(prefixLength + suffixLength);
ste.term.grow(ste.term.length()); ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); suffixesReader.readBytes(ste.term.bytes(), prefixLength, suffixLength);
// A normal term // A normal term
ste.termExists = true; ste.termExists = true;
return false; return false;
@ -312,11 +313,11 @@ final class OrdsSegmentTermsEnumFrame {
: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++; nextEnt++;
final int code = suffixesReader.readVInt(); final int code = suffixesReader.readVInt();
suffix = code >>> 1; suffixLength = code >>> 1;
startBytePos = suffixesReader.getPosition(); startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix); ste.term.setLength(prefixLength + suffixLength);
ste.term.grow(ste.term.length()); ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); suffixesReader.readBytes(ste.term.bytes(), prefixLength, suffixLength);
if ((code & 1) == 0) { if ((code & 1) == 0) {
// A normal term // A normal term
ste.termExists = true; ste.termExists = true;
@ -342,7 +343,7 @@ final class OrdsSegmentTermsEnumFrame {
// floor blocks we "typically" get // floor blocks we "typically" get
public void scanToFloorFrame(BytesRef target) { public void scanToFloorFrame(BytesRef target) {
if (!isFloor || target.length <= prefix) { if (!isFloor || target.length <= prefixLength) {
// if (DEBUG) { // if (DEBUG) {
// System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + // System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" +
// target.length + " vs prefix=" + prefix); // target.length + " vs prefix=" + prefix);
@ -350,7 +351,7 @@ final class OrdsSegmentTermsEnumFrame {
return; return;
} }
final int targetLabel = target.bytes[target.offset + prefix] & 0xFF; final int targetLabel = target.bytes[target.offset + prefixLength] & 0xFF;
// if (DEBUG) { // if (DEBUG) {
// System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + ((char) // System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + ((char)
@ -532,7 +533,7 @@ final class OrdsSegmentTermsEnumFrame {
// Used only by assert // Used only by assert
private boolean prefixMatches(BytesRef target) { private boolean prefixMatches(BytesRef target) {
for (int bytePos = 0; bytePos < prefix; bytePos++) { for (int bytePos = 0; bytePos < prefixLength; bytePos++) {
if (target.bytes[target.offset + bytePos] != ste.term.byteAt(bytePos)) { if (target.bytes[target.offset + bytePos] != ste.term.byteAt(bytePos)) {
return false; return false;
} }
@ -586,7 +587,7 @@ final class OrdsSegmentTermsEnumFrame {
} }
private int startBytePos; private int startBytePos;
private int suffix; private int suffixLength;
private long subCode; private long subCode;
// Target's prefix matches this block's prefix; we // Target's prefix matches this block's prefix; we
@ -613,13 +614,11 @@ final class OrdsSegmentTermsEnumFrame {
assert prefixMatches(target); assert prefixMatches(target);
// Loop over each entry (term or sub-block) in this block: // Loop over each entry (term or sub-block) in this block:
// nextTerm: while(nextEnt < entCount) { do {
nextTerm:
while (true) {
nextEnt++; nextEnt++;
termOrd++; termOrd++;
suffix = suffixesReader.readVInt(); suffixLength = suffixesReader.readVInt();
// if (DEBUG) { // if (DEBUG) {
// BytesRef suffixBytesRef = new BytesRef(); // BytesRef suffixBytesRef = new BytesRef();
@ -630,63 +629,41 @@ final class OrdsSegmentTermsEnumFrame {
// + ToStringUtils.bytesRefToString(suffixBytesRef)); // + ToStringUtils.bytesRefToString(suffixBytesRef));
// } // }
final int termLen = prefix + suffix;
startBytePos = suffixesReader.getPosition(); startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix); suffixesReader.skipBytes(suffixLength);
final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen); // Compare suffix and target.
int targetPos = target.offset + prefix; final int cmp =
Arrays.compareUnsigned(
// Loop over bytes in the suffix, comparing to suffixBytes,
// the target startBytePos,
int bytePos = startBytePos; startBytePos + suffixLength,
while (true) { target.bytes,
final int cmp; target.offset + prefixLength,
final boolean stop; target.offset + target.length);
if (targetPos < targetLimit) {
cmp = (suffixBytes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
stop = false;
} else {
assert targetPos == targetLimit;
cmp = termLen - target.length;
stop = true;
}
if (cmp < 0) { if (cmp < 0) {
// Current entry is still before the target; // Current entry is still before the target;
// keep scanning // keep scanning
if (nextEnt == entCount) {
if (exactOnly) {
fillTerm();
}
// We are done scanning this block
break nextTerm;
} else {
continue nextTerm;
}
} else if (cmp > 0) { } else if (cmp > 0) {
// Done! Current entry is after target -- // Done! Current entry is after target --
// return NOT_FOUND: // return NOT_FOUND:
fillTerm(); fillTerm();
// if (DEBUG) System.out.println(" not found"); // if (DEBUG) System.out.println(" not found");
return SeekStatus.NOT_FOUND; return SeekStatus.NOT_FOUND;
} else if (stop) { } else {
// Exact match! // Exact match!
// This cannot be a sub-block because we // This cannot be a sub-block because we
// would have followed the index to this // would have followed the index to this
// sub-block from the start: // sub-block from the start:
assert ste.termExists;
fillTerm(); fillTerm();
// if (DEBUG) System.out.println(" found!"); // if (DEBUG) System.out.println(" found!");
return SeekStatus.FOUND; return SeekStatus.FOUND;
} }
} } while (nextEnt < entCount);
}
// It is possible (and OK) that terms index pointed us // It is possible (and OK) that terms index pointed us
// at this block, but, we scanned the entire block and // at this block, but, we scanned the entire block and
@ -730,13 +707,11 @@ final class OrdsSegmentTermsEnumFrame {
assert prefixMatches(target); assert prefixMatches(target);
// Loop over each entry (term or sub-block) in this block: // Loop over each entry (term or sub-block) in this block:
// nextTerm: while(nextEnt < entCount) { while (nextEnt < entCount) {
nextTerm:
while (true) {
nextEnt++; nextEnt++;
final int code = suffixesReader.readVInt(); final int code = suffixesReader.readVInt();
suffix = code >>> 1; suffixLength = code >>> 1;
// if (DEBUG) { // if (DEBUG) {
// BytesRef suffixBytesRef = new BytesRef(); // BytesRef suffixBytesRef = new BytesRef();
// suffixBytesRef.bytes = suffixBytes; // suffixBytesRef.bytes = suffixBytes;
@ -748,9 +723,8 @@ final class OrdsSegmentTermsEnumFrame {
// } // }
ste.termExists = (code & 1) == 0; ste.termExists = (code & 1) == 0;
final int termLen = prefix + suffix;
startBytePos = suffixesReader.getPosition(); startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix); suffixesReader.skipBytes(suffixLength);
// Must save ord before we skip over a sub-block in case we push, below: // Must save ord before we skip over a sub-block in case we push, below:
long prevTermOrd = termOrd; long prevTermOrd = termOrd;
if (ste.termExists) { if (ste.termExists) {
@ -763,40 +737,20 @@ final class OrdsSegmentTermsEnumFrame {
lastSubFP = fp - subCode; lastSubFP = fp - subCode;
} }
final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen); // Compare suffix and target.
int targetPos = target.offset + prefix; final int cmp =
Arrays.compareUnsigned(
// Loop over bytes in the suffix, comparing to suffixBytes,
// the target startBytePos,
int bytePos = startBytePos; startBytePos + suffixLength,
while (true) { target.bytes,
final int cmp; target.offset + prefixLength,
final boolean stop; target.offset + target.length);
if (targetPos < targetLimit) {
cmp = (suffixBytes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
stop = false;
} else {
assert targetPos == targetLimit;
cmp = termLen - target.length;
stop = true;
}
if (cmp < 0) { if (cmp < 0) {
// Current entry is still before the target; // Current entry is still before the target;
// keep scanning // keep scanning
if (nextEnt == entCount) {
if (exactOnly) {
fillTerm();
// termExists = true;
}
// We are done scanning this block
break nextTerm;
} else {
continue nextTerm;
}
} else if (cmp > 0) { } else if (cmp > 0) {
// Done! Current entry is after target -- // Done! Current entry is after target --
// return NOT_FOUND: // return NOT_FOUND:
fillTerm(); fillTerm();
@ -807,7 +761,8 @@ final class OrdsSegmentTermsEnumFrame {
// the target, so we must recurse into the // the target, so we must recurse into the
// sub-frame(s): // sub-frame(s):
ste.currentFrame = ste.currentFrame =
ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen, prevTermOrd); ste.pushFrame(
null, ste.currentFrame.lastSubFP, prefixLength + suffixLength, prevTermOrd);
ste.currentFrame.loadBlock(); ste.currentFrame.loadBlock();
while (ste.currentFrame.next()) { while (ste.currentFrame.next()) {
ste.currentFrame = ste.currentFrame =
@ -818,7 +773,7 @@ final class OrdsSegmentTermsEnumFrame {
// if (DEBUG) System.out.println(" not found"); // if (DEBUG) System.out.println(" not found");
return SeekStatus.NOT_FOUND; return SeekStatus.NOT_FOUND;
} else if (stop) { } else {
// Exact match! // Exact match!
// This cannot be a sub-block because we // This cannot be a sub-block because we
@ -831,7 +786,6 @@ final class OrdsSegmentTermsEnumFrame {
return SeekStatus.FOUND; return SeekStatus.FOUND;
} }
} }
}
// It is possible (and OK) that terms index pointed us // It is possible (and OK) that terms index pointed us
// at this block, but, we scanned the entire block and // at this block, but, we scanned the entire block and
@ -854,9 +808,9 @@ final class OrdsSegmentTermsEnumFrame {
} }
private void fillTerm() { private void fillTerm() {
final int termLength = prefix + suffix; final int termLength = prefixLength + suffixLength;
ste.term.setLength(prefix + suffix); ste.term.setLength(prefixLength + suffixLength);
ste.term.grow(termLength); ste.term.grow(termLength);
System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefix, suffix); System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefixLength, suffixLength);
} }
} }

View File

@ -181,17 +181,17 @@ public final class IDVersionSegmentTermsEnum extends BaseTermsEnum {
// " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + // " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" +
// f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + // f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" +
// term.length + " vs prefix=" + f.prefix); // term.length + " vs prefix=" + f.prefix);
if (f.prefix > targetBeforeCurrentLength) { if (f.prefixLength > targetBeforeCurrentLength) {
f.rewind(); f.rewind();
} else { } else {
// if (DEBUG) { // if (DEBUG) {
// System.out.println(" skip rewind!"); // System.out.println(" skip rewind!");
// } // }
} }
assert length == f.prefix; assert length == f.prefixLength;
} else { } else {
f.nextEnt = -1; f.nextEnt = -1;
f.prefix = length; f.prefixLength = length;
f.state.termBlockOrd = 0; f.state.termBlockOrd = 0;
f.fpOrig = f.fp = fp; f.fpOrig = f.fp = fp;
f.lastSubFP = -1; f.lastSubFP = -1;
@ -459,7 +459,7 @@ public final class IDVersionSegmentTermsEnum extends BaseTermsEnum {
// Integer.toHexString(targetLabel) + " termExists=" + termExists); // Integer.toHexString(targetLabel) + " termExists=" + termExists);
// } // }
validIndexPrefix = currentFrame.prefix; validIndexPrefix = currentFrame.prefixLength;
// validIndexPrefix = targetUpto; // validIndexPrefix = targetUpto;
currentFrame.scanToFloorFrame(target); currentFrame.scanToFloorFrame(target);
@ -573,7 +573,7 @@ public final class IDVersionSegmentTermsEnum extends BaseTermsEnum {
} }
// validIndexPrefix = targetUpto; // validIndexPrefix = targetUpto;
validIndexPrefix = currentFrame.prefix; validIndexPrefix = currentFrame.prefixLength;
currentFrame.scanToFloorFrame(target); currentFrame.scanToFloorFrame(target);
@ -802,7 +802,7 @@ public final class IDVersionSegmentTermsEnum extends BaseTermsEnum {
// toHex(targetLabel)); // toHex(targetLabel));
// } // }
validIndexPrefix = currentFrame.prefix; validIndexPrefix = currentFrame.prefixLength;
// validIndexPrefix = targetUpto; // validIndexPrefix = targetUpto;
currentFrame.scanToFloorFrame(target); currentFrame.scanToFloorFrame(target);
@ -863,7 +863,7 @@ public final class IDVersionSegmentTermsEnum extends BaseTermsEnum {
} }
// validIndexPrefix = targetUpto; // validIndexPrefix = targetUpto;
validIndexPrefix = currentFrame.prefix; validIndexPrefix = currentFrame.prefixLength;
currentFrame.scanToFloorFrame(target); currentFrame.scanToFloorFrame(target);
@ -901,7 +901,7 @@ public final class IDVersionSegmentTermsEnum extends BaseTermsEnum {
while (true) { while (true) {
IDVersionSegmentTermsEnumFrame f = getFrame(ord); IDVersionSegmentTermsEnumFrame f = getFrame(ord);
assert f != null; assert f != null;
final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix); final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefixLength);
if (f.nextEnt == -1) { if (f.nextEnt == -1) {
out.println( out.println(
" frame " " frame "
@ -912,7 +912,7 @@ public final class IDVersionSegmentTermsEnum extends BaseTermsEnum {
+ f.fp + f.fp
+ (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "")
+ " prefixLen=" + " prefixLen="
+ f.prefix + f.prefixLength
+ " prefix=" + " prefix="
+ ToStringUtils.bytesRefToString(prefix) + ToStringUtils.bytesRefToString(prefix)
+ (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")"))
@ -940,7 +940,7 @@ public final class IDVersionSegmentTermsEnum extends BaseTermsEnum {
+ f.fp + f.fp
+ (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "")
+ " prefixLen=" + " prefixLen="
+ f.prefix + f.prefixLength
+ " prefix=" + " prefix="
+ ToStringUtils.bytesRefToString(prefix) + ToStringUtils.bytesRefToString(prefix)
+ " nextEnt=" + " nextEnt="
@ -965,12 +965,14 @@ public final class IDVersionSegmentTermsEnum extends BaseTermsEnum {
} }
if (fr.index != null) { if (fr.index != null) {
assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc; assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
if (f.prefix > 0 && isSeekFrame && f.arc.label() != (term.byteAt(f.prefix - 1) & 0xFF)) { if (f.prefixLength > 0
&& isSeekFrame
&& f.arc.label() != (term.byteAt(f.prefixLength - 1) & 0xFF)) {
out.println( out.println(
" broken seek state: arc.label=" " broken seek state: arc.label="
+ (char) f.arc.label() + (char) f.arc.label()
+ " vs term byte=" + " vs term byte="
+ (char) (term.byteAt(f.prefix - 1) & 0xFF)); + (char) (term.byteAt(f.prefixLength - 1) & 0xFF));
throw new RuntimeException("seek state is broken"); throw new RuntimeException("seek state is broken");
} }
Pair<BytesRef, Long> output = Util.get(fr.index, prefix); Pair<BytesRef, Long> output = Util.get(fr.index, prefix);
@ -999,7 +1001,7 @@ public final class IDVersionSegmentTermsEnum extends BaseTermsEnum {
if (f == currentFrame) { if (f == currentFrame) {
break; break;
} }
if (f.prefix == validIndexPrefix) { if (f.prefixLength == validIndexPrefix) {
isSeekFrame = false; isSeekFrame = false;
} }
ord++; ord++;
@ -1079,7 +1081,7 @@ public final class IDVersionSegmentTermsEnum extends BaseTermsEnum {
// Note that the seek state (last seek) has been // Note that the seek state (last seek) has been
// invalidated beyond this depth // invalidated beyond this depth
validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix); validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefixLength);
// if (DEBUG) { // if (DEBUG) {
// System.out.println(" reset validIndexPrefix=" + validIndexPrefix); // System.out.println(" reset validIndexPrefix=" + validIndexPrefix);
// } // }

View File

@ -17,6 +17,7 @@
package org.apache.lucene.sandbox.codecs.idversion; package org.apache.lucene.sandbox.codecs.idversion;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.index.TermsEnum.SeekStatus; import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataInput;
@ -52,7 +53,7 @@ final class IDVersionSegmentTermsEnumFrame {
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
// Length of prefix shared by all terms in this block // Length of prefix shared by all terms in this block
int prefix; int prefixLength;
// Number of entries (term or sub-block) in this block // Number of entries (term or sub-block) in this block
int entCount; int entCount;
@ -262,11 +263,11 @@ final class IDVersionSegmentTermsEnumFrame {
assert nextEnt != -1 && nextEnt < entCount assert nextEnt != -1 && nextEnt < entCount
: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++; nextEnt++;
suffix = suffixesReader.readVInt(); suffixLength = suffixesReader.readVInt();
startBytePos = suffixesReader.getPosition(); startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix); ste.term.setLength(prefixLength + suffixLength);
ste.term.grow(ste.term.length()); ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); suffixesReader.readBytes(ste.term.bytes(), prefixLength, suffixLength);
// A normal term // A normal term
ste.termExists = true; ste.termExists = true;
return false; return false;
@ -279,11 +280,11 @@ final class IDVersionSegmentTermsEnumFrame {
: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++; nextEnt++;
final int code = suffixesReader.readVInt(); final int code = suffixesReader.readVInt();
suffix = code >>> 1; suffixLength = code >>> 1;
startBytePos = suffixesReader.getPosition(); startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix); ste.term.setLength(prefixLength + suffixLength);
ste.term.grow(ste.term.length()); ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); suffixesReader.readBytes(ste.term.bytes(), prefixLength, suffixLength);
if ((code & 1) == 0) { if ((code & 1) == 0) {
// A normal term // A normal term
ste.termExists = true; ste.termExists = true;
@ -307,7 +308,7 @@ final class IDVersionSegmentTermsEnumFrame {
// floor blocks we "typically" get // floor blocks we "typically" get
public void scanToFloorFrame(BytesRef target) { public void scanToFloorFrame(BytesRef target) {
if (!isFloor || target.length <= prefix) { if (!isFloor || target.length <= prefixLength) {
// if (DEBUG) { // if (DEBUG) {
// System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + // System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" +
// target.length + " vs prefix=" + prefix); // target.length + " vs prefix=" + prefix);
@ -315,7 +316,7 @@ final class IDVersionSegmentTermsEnumFrame {
return; return;
} }
final int targetLabel = target.bytes[target.offset + prefix] & 0xFF; final int targetLabel = target.bytes[target.offset + prefixLength] & 0xFF;
// if (DEBUG) { // if (DEBUG) {
// System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + ((char) // System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + ((char)
@ -415,7 +416,7 @@ final class IDVersionSegmentTermsEnumFrame {
// Used only by assert // Used only by assert
private boolean prefixMatches(BytesRef target) { private boolean prefixMatches(BytesRef target) {
for (int bytePos = 0; bytePos < prefix; bytePos++) { for (int bytePos = 0; bytePos < prefixLength; bytePos++) {
if (target.bytes[target.offset + bytePos] != ste.term.byteAt(bytePos)) { if (target.bytes[target.offset + bytePos] != ste.term.byteAt(bytePos)) {
return false; return false;
} }
@ -466,7 +467,7 @@ final class IDVersionSegmentTermsEnumFrame {
} }
private int startBytePos; private int startBytePos;
private int suffix; private int suffixLength;
private long subCode; private long subCode;
// Target's prefix matches this block's prefix; we // Target's prefix matches this block's prefix; we
@ -493,12 +494,10 @@ final class IDVersionSegmentTermsEnumFrame {
assert prefixMatches(target); assert prefixMatches(target);
// Loop over each entry (term or sub-block) in this block: // Loop over each entry (term or sub-block) in this block:
// nextTerm: while(nextEnt < entCount) { do {
nextTerm:
while (true) {
nextEnt++; nextEnt++;
suffix = suffixesReader.readVInt(); suffixLength = suffixesReader.readVInt();
// if (DEBUG) { // if (DEBUG) {
// BytesRef suffixBytesRef = new BytesRef(); // BytesRef suffixBytesRef = new BytesRef();
@ -509,76 +508,41 @@ final class IDVersionSegmentTermsEnumFrame {
// + ToStringUtils.bytesRefToString(suffixBytesRef)); // + ToStringUtils.bytesRefToString(suffixBytesRef));
// } // }
final int termLen = prefix + suffix;
startBytePos = suffixesReader.getPosition(); startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix); suffixesReader.skipBytes(suffixLength);
final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen); // Compare suffix and target.
int targetPos = target.offset + prefix; final int cmp =
Arrays.compareUnsigned(
// Loop over bytes in the suffix, comparing to suffixBytes,
// the target startBytePos,
int bytePos = startBytePos; startBytePos + suffixLength,
while (true) { target.bytes,
final int cmp; target.offset + prefixLength,
final boolean stop; target.offset + target.length);
if (targetPos < targetLimit) {
cmp = (suffixBytes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
stop = false;
} else {
assert targetPos == targetLimit;
cmp = termLen - target.length;
stop = true;
}
if (cmp < 0) { if (cmp < 0) {
// Current entry is still before the target; // Current entry is still before the target;
// keep scanning // keep scanning
if (nextEnt == entCount) {
if (exactOnly) {
fillTerm();
}
// We are done scanning this block
break nextTerm;
} else {
continue nextTerm;
}
} else if (cmp > 0) { } else if (cmp > 0) {
// Done! Current entry is after target -- // Done! Current entry is after target --
// return NOT_FOUND: // return NOT_FOUND:
fillTerm(); fillTerm();
if (!exactOnly && !ste.termExists) {
// We are on a sub-block, and caller wants
// us to position to the next term after
// the target, so we must recurse into the
// sub-frame(s):
ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen);
ste.currentFrame.loadBlock();
while (ste.currentFrame.next()) {
ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length());
ste.currentFrame.loadBlock();
}
}
// if (DEBUG) System.out.println(" not found"); // if (DEBUG) System.out.println(" not found");
return SeekStatus.NOT_FOUND; return SeekStatus.NOT_FOUND;
} else if (stop) { } else {
// Exact match! // Exact match!
// This cannot be a sub-block because we // This cannot be a sub-block because we
// would have followed the index to this // would have followed the index to this
// sub-block from the start: // sub-block from the start:
assert ste.termExists;
fillTerm(); fillTerm();
// if (DEBUG) System.out.println(" found!"); // if (DEBUG) System.out.println(" found!");
return SeekStatus.FOUND; return SeekStatus.FOUND;
} }
} } while (nextEnt < entCount);
}
// It is possible (and OK) that terms index pointed us // It is possible (and OK) that terms index pointed us
// at this block, but, we scanned the entire block and // at this block, but, we scanned the entire block and
@ -622,13 +586,11 @@ final class IDVersionSegmentTermsEnumFrame {
assert prefixMatches(target); assert prefixMatches(target);
// Loop over each entry (term or sub-block) in this block: // Loop over each entry (term or sub-block) in this block:
// nextTerm: while(nextEnt < entCount) { while (nextEnt < entCount) {
nextTerm:
while (true) {
nextEnt++; nextEnt++;
final int code = suffixesReader.readVInt(); final int code = suffixesReader.readVInt();
suffix = code >>> 1; suffixLength = code >>> 1;
// if (DEBUG) { // if (DEBUG) {
// BytesRef suffixBytesRef = new BytesRef(); // BytesRef suffixBytesRef = new BytesRef();
// suffixBytesRef.bytes = suffixBytes; // suffixBytesRef.bytes = suffixBytes;
@ -640,9 +602,8 @@ final class IDVersionSegmentTermsEnumFrame {
// } // }
ste.termExists = (code & 1) == 0; ste.termExists = (code & 1) == 0;
final int termLen = prefix + suffix;
startBytePos = suffixesReader.getPosition(); startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix); suffixesReader.skipBytes(suffixLength);
if (ste.termExists) { if (ste.termExists) {
state.termBlockOrd++; state.termBlockOrd++;
subCode = 0; subCode = 0;
@ -651,40 +612,20 @@ final class IDVersionSegmentTermsEnumFrame {
lastSubFP = fp - subCode; lastSubFP = fp - subCode;
} }
final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen); // Compare suffix and target.
int targetPos = target.offset + prefix; final int cmp =
Arrays.compareUnsigned(
// Loop over bytes in the suffix, comparing to suffixBytes,
// the target startBytePos,
int bytePos = startBytePos; startBytePos + suffixLength,
while (true) { target.bytes,
final int cmp; target.offset + prefixLength,
final boolean stop; target.offset + target.length);
if (targetPos < targetLimit) {
cmp = (suffixBytes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
stop = false;
} else {
assert targetPos == targetLimit;
cmp = termLen - target.length;
stop = true;
}
if (cmp < 0) { if (cmp < 0) {
// Current entry is still before the target; // Current entry is still before the target;
// keep scanning // keep scanning
if (nextEnt == entCount) {
if (exactOnly) {
fillTerm();
// termExists = true;
}
// We are done scanning this block
break nextTerm;
} else {
continue nextTerm;
}
} else if (cmp > 0) { } else if (cmp > 0) {
// Done! Current entry is after target -- // Done! Current entry is after target --
// return NOT_FOUND: // return NOT_FOUND:
fillTerm(); fillTerm();
@ -694,7 +635,8 @@ final class IDVersionSegmentTermsEnumFrame {
// us to position to the next term after // us to position to the next term after
// the target, so we must recurse into the // the target, so we must recurse into the
// sub-frame(s): // sub-frame(s):
ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen); ste.currentFrame =
ste.pushFrame(null, ste.currentFrame.lastSubFP, prefixLength + suffixLength);
ste.currentFrame.loadBlock(); ste.currentFrame.loadBlock();
while (ste.currentFrame.next()) { while (ste.currentFrame.next()) {
ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length()); ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length());
@ -704,7 +646,7 @@ final class IDVersionSegmentTermsEnumFrame {
// if (DEBUG) System.out.println(" not found"); // if (DEBUG) System.out.println(" not found");
return SeekStatus.NOT_FOUND; return SeekStatus.NOT_FOUND;
} else if (stop) { } else {
// Exact match! // Exact match!
// This cannot be a sub-block because we // This cannot be a sub-block because we
@ -717,7 +659,6 @@ final class IDVersionSegmentTermsEnumFrame {
return SeekStatus.FOUND; return SeekStatus.FOUND;
} }
} }
}
// It is possible (and OK) that terms index pointed us // It is possible (and OK) that terms index pointed us
// at this block, but, we scanned the entire block and // at this block, but, we scanned the entire block and
@ -740,9 +681,9 @@ final class IDVersionSegmentTermsEnumFrame {
} }
private void fillTerm() { private void fillTerm() {
final int termLength = prefix + suffix; final int termLength = prefixLength + suffixLength;
ste.term.setLength(prefix + suffix); ste.term.setLength(prefixLength + suffixLength);
ste.term.grow(termLength); ste.term.grow(termLength);
System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefix, suffix); System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefixLength, suffixLength);
} }
} }