LUCENE-6548: some optimizations to BlockTree's intersect method with very finite automata

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1687124 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2015-06-23 19:59:19 +00:00
parent 5164a48fd0
commit 6624bfc982
6 changed files with 422 additions and 399 deletions

View File

@ -191,6 +191,10 @@ Changes in Runtime Behavior
* LUCENE-2880: Span queries now score more consistently with regular queries.
(Robert Muir, Adrien Grand)
Optimizations
* LUCENE-6548: Some optimizations for BlockTree's intersect with very
finite automata (Mike McCandless)
Build

View File

@ -103,8 +103,12 @@ public final class BlockTreeTermsReader extends FieldsProducer {
/** Auto-prefix terms. */
public static final int VERSION_AUTO_PREFIX_TERMS = 1;
/** Conditional auto-prefix terms: we record at write time, once in the
* terms file header, whether any auto-prefix terms may have been written. */
public static final int VERSION_AUTO_PREFIX_TERMS_COND = 2;
/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_AUTO_PREFIX_TERMS;
public static final int VERSION_CURRENT = VERSION_AUTO_PREFIX_TERMS_COND;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
@ -131,6 +135,8 @@ public final class BlockTreeTermsReader extends FieldsProducer {
final int version;
final boolean anyAutoPrefixTerms;
/** Sole constructor. */
public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state) throws IOException {
boolean success = false;
@ -143,7 +149,26 @@ public final class BlockTreeTermsReader extends FieldsProducer {
try {
termsIn = state.directory.openInput(termsName, state.context);
version = CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
if (version < VERSION_AUTO_PREFIX_TERMS) {
// Old (pre-5.2.0) index, no auto-prefix terms:
this.anyAutoPrefixTerms = false;
} else if (version == VERSION_AUTO_PREFIX_TERMS) {
// 5.2.x index, might have auto-prefix terms:
this.anyAutoPrefixTerms = true;
} else {
// 5.3.x index, we record up front if we may have written any auto-prefix terms:
assert version >= VERSION_AUTO_PREFIX_TERMS_COND;
byte b = termsIn.readByte();
if (b == 0) {
this.anyAutoPrefixTerms = false;
} else if (b == 1) {
this.anyAutoPrefixTerms = true;
} else {
throw new CorruptIndexException("invalid anyAutoPrefixTerms: expected 0 or 1 but got " + b, termsIn);
}
}
String indexName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_INDEX_EXTENSION);
indexIn = state.directory.openInput(indexName, state.context);
CodecUtil.checkIndexHeader(indexIn, TERMS_INDEX_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix);

View File

@ -124,7 +124,7 @@ import org.apache.lucene.util.packed.PackedInts;
* and decoding the Postings Metadata and Term Metadata sections.</p>
*
* <ul>
* <li>TermsDict (.tim) --&gt; Header, <i>PostingsHeader</i>, NodeBlock<sup>NumBlocks</sup>,
* <li>TermsDict (.tim) --&gt; Header, HasAutoPrefixTerms, <i>PostingsHeader</i>, NodeBlock<sup>NumBlocks</sup>,
* FieldSummary, DirOffset, Footer</li>
* <li>NodeBlock --&gt; (OuterNode | InnerNode)</li>
* <li>OuterNode --&gt; EntryCount, SuffixLength, Byte<sup>SuffixLength</sup>, StatsLength, &lt; TermStats &gt;<sup>EntryCount</sup>, MetaLength, &lt;<i>TermMetadata</i>&gt;<sup>EntryCount</sup></li>
@ -145,6 +145,7 @@ import org.apache.lucene.util.packed.PackedInts;
* <ul>
* <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
* for the BlockTree implementation.</li>
* <li>HasAutoPrefixTerms is a single byte; 1 means there may be auto-prefix terms and 0 means there are none.</li>
* <li>DirOffset is a pointer to the FieldSummary section.</li>
* <li>DocFreq is the count of documents which contain the term.</li>
* <li>TotalTermFreq is the total number of occurrences of the term. This is encoded
@ -331,6 +332,13 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
CodecUtil.writeIndexHeader(termsOut, BlockTreeTermsReader.TERMS_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
// So at read time we know, globally, that there will be no auto-prefix terms:
if (minItemsInAutoPrefix == 0) {
termsOut.writeByte((byte) 0);
} else {
termsOut.writeByte((byte) 1);
}
final String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_INDEX_EXTENSION);
indexOut = state.directory.createOutput(indexName, state.context);
CodecUtil.writeIndexHeader(indexOut, BlockTreeTermsReader.TERMS_INDEX_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
@ -891,27 +899,34 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// if entry is term or sub-block, and 1 bit to record if
// it's a prefix term. Terms cannot be larger than ~32 KB
// so we won't run out of bits:
code = suffix<<2;
int floorLeadEnd = -1;
if (term.prefixTerm != null) {
sawAutoPrefixTerm = true;
PrefixTerm prefixTerm = term.prefixTerm;
floorLeadEnd = prefixTerm.floorLeadEnd;
assert floorLeadEnd != -1;
if (prefixTerm.floorLeadStart == -2) {
// Starts with empty string
code |= 2;
} else {
code |= 3;
if (minItemsInAutoPrefix == 0) {
suffixWriter.writeVInt(suffix << 1);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
} else {
code = suffix<<2;
int floorLeadEnd = -1;
if (term.prefixTerm != null) {
assert minItemsInAutoPrefix > 0;
sawAutoPrefixTerm = true;
PrefixTerm prefixTerm = term.prefixTerm;
floorLeadEnd = prefixTerm.floorLeadEnd;
assert floorLeadEnd != -1;
if (prefixTerm.floorLeadStart == -2) {
// Starts with empty string
code |= 2;
} else {
code |= 3;
}
}
suffixWriter.writeVInt(code);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
if (floorLeadEnd != -1) {
suffixWriter.writeByte((byte) floorLeadEnd);
}
assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;
}
suffixWriter.writeVInt(code);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
if (floorLeadEnd != -1) {
suffixWriter.writeByte((byte) floorLeadEnd);
}
assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;
// Write term stats, to separate byte[] blob:
statsWriter.writeVInt(state.docFreq);
@ -948,7 +963,11 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// For non-leaf block we borrow 1 bit to record
// if entry is term or sub-block, and 1 bit (unset here) to
// record if it's a prefix term:
suffixWriter.writeVInt((suffix<<2)|1);
if (minItemsInAutoPrefix == 0) {
suffixWriter.writeVInt((suffix<<1)|1);
} else {
suffixWriter.writeVInt((suffix<<2)|1);
}
suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
//if (DEBUG2) {

View File

@ -63,6 +63,7 @@ final class IntersectTermsEnum extends TermsEnum {
final BytesRef commonSuffix;
private IntersectTermsEnumFrame currentFrame;
private Transition currentTransition;
private final BytesRef term = new BytesRef();
@ -83,14 +84,12 @@ final class IntersectTermsEnum extends TermsEnum {
// TODO: in some cases we can filter by length? eg
// regexp foo*bar must be at least length 6 bytes
public IntersectTermsEnum(FieldReader fr, Automaton automaton, RunAutomaton runAutomaton, BytesRef commonSuffix, BytesRef startTerm, int sinkState) throws IOException {
//if (DEBUG) System.out.println("\nintEnum.init seg=" + fr.parent.segment + " commonSuffix=" + commonSuffix);
this.fr = fr;
this.sinkState = sinkState;
assert automaton != null;
assert runAutomaton != null;
//if (DEBUG) System.out.println("sinkState=" + sinkState + " AUTOMATON:\n" + automaton.toDot());
this.runAutomaton = runAutomaton;
this.allowAutoPrefixTerms = sinkState != -1;
this.automaton = automaton;
@ -138,6 +137,7 @@ final class IntersectTermsEnum extends TermsEnum {
if (startTerm != null) {
seekToStartTerm(startTerm);
}
currentTransition = currentFrame.transition;
}
// only for assert:
@ -179,11 +179,12 @@ final class IntersectTermsEnum extends TermsEnum {
}
private IntersectTermsEnumFrame pushFrame(int state) throws IOException {
assert currentFrame != null;
final IntersectTermsEnumFrame f = getFrame(currentFrame == null ? 0 : 1+currentFrame.ord);
f.fp = f.fpOrig = currentFrame.lastSubFP;
f.prefix = currentFrame.prefix + currentFrame.suffix;
//if (DEBUG) System.out.println(" pushFrame state=" + state + " prefix=" + f.prefix);
f.setState(state);
// Walk the arc through the index -- we only
@ -219,9 +220,7 @@ final class IntersectTermsEnum extends TermsEnum {
/**
 * Returns the document frequency recorded for the term the current frame
 * is positioned on. The frame's term metadata is decoded first so that
 * {@code termState.docFreq} is populated before it is read.
 */
@Override
public int docFreq() throws IOException {
  // Metadata must be decoded before termState is read:
  currentFrame.decodeMetaData();
  final int docFreq = currentFrame.termState.docFreq;
  return docFreq;
}
@ -251,7 +250,6 @@ final class IntersectTermsEnum extends TermsEnum {
// arbitrary seekExact/Ceil. Note that this is a
// seekFloor!
private void seekToStartTerm(BytesRef target) throws IOException {
//if (DEBUG) System.out.println("seek to startTerm=" + target.utf8ToString() + " length=" + target.length);
assert currentFrame.ord == 0;
if (term.length < target.length) {
term.bytes = ArrayUtil.grow(term.bytes, target.length);
@ -260,7 +258,6 @@ final class IntersectTermsEnum extends TermsEnum {
assert arc == currentFrame.arc;
for(int idx=0;idx<=target.length;idx++) {
//if (DEBUG) System.out.println("cycle idx=" + idx);
while (true) {
final int savNextEnt = currentFrame.nextEnt;
@ -271,8 +268,6 @@ final class IntersectTermsEnum extends TermsEnum {
final int saveTermBlockOrd = currentFrame.termState.termBlockOrd;
final boolean saveIsAutoPrefixTerm = currentFrame.isAutoPrefixTerm;
//if (DEBUG) System.out.println(" cycle isAutoPrefix=" + saveIsAutoPrefixTerm + " ent=" + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") prefix=" + currentFrame.prefix + " suffix=" + currentFrame.suffix + " firstLabel=" + (currentFrame.suffix == 0 ? "" : (currentFrame.suffixBytes[currentFrame.startBytePos])&0xff));
final boolean isSubBlock = currentFrame.next();
term.length = currentFrame.prefix + currentFrame.suffix;
@ -281,25 +276,19 @@ final class IntersectTermsEnum extends TermsEnum {
}
System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
//if (DEBUG) System.out.println(" isSubBlock=" + isSubBlock + " term/prefix=" + brToString(term) + " saveIsAutoPrefixTerm=" + saveIsAutoPrefixTerm + " allowAutoPrefixTerms=" + allowAutoPrefixTerms);
if (isSubBlock && StringHelper.startsWith(target, term)) {
// Recurse
//if (DEBUG) System.out.println(" recurse!");
currentFrame = pushFrame(getState());
break;
} else {
final int cmp = term.compareTo(target);
//if (DEBUG) System.out.println(" cmp=" + cmp);
if (cmp < 0) {
if (currentFrame.nextEnt == currentFrame.entCount) {
if (!currentFrame.isLastInFloor) {
// Advance to next floor block
//if (DEBUG) System.out.println(" load floorBlock");
currentFrame.loadNextFloorBlock();
continue;
} else {
//if (DEBUG) System.out.println(" return term=" + brToString(term));
return;
}
}
@ -308,14 +297,12 @@ final class IntersectTermsEnum extends TermsEnum {
if (allowAutoPrefixTerms == false && currentFrame.isAutoPrefixTerm) {
continue;
}
//if (DEBUG) System.out.println(" return term=" + brToString(term));
return;
} else if (allowAutoPrefixTerms || currentFrame.isAutoPrefixTerm == false) {
// Fallback to prior entry: the semantics of
// this method is that the first call to
// next() will return the term after the
// requested term
//if (DEBUG) System.out.println(" fallback prior entry");
currentFrame.nextEnt = savNextEnt;
currentFrame.lastSubFP = saveLastSubFP;
currentFrame.startBytePos = saveStartBytePos;
@ -338,321 +325,348 @@ final class IntersectTermsEnum extends TermsEnum {
assert false;
}
/**
 * Unwinds every exhausted frame (or loads the next floor block of the
 * current frame) and then advances the resulting frame by one entry.
 *
 * @return the value returned by {@code currentFrame.next()} for the entry
 *         we advanced to
 * @throws NoMoreTermsException if the root frame (ord 0) is exhausted and
 *         is the last block in its floor
 */
private boolean popPushNext() throws IOException {
  while (currentFrame.nextEnt == currentFrame.entCount) {
    if (currentFrame.isLastInFloor == false) {
      // More floor blocks remain for this frame: load the next one and stop unwinding.
      currentFrame.loadNextFloorBlock();
      break;
    }

    if (currentFrame.ord == 0) {
      // Nothing left to pop:
      throw NoMoreTermsException.INSTANCE;
    }

    // Pop to the parent frame, keeping the cached transition in sync with it:
    final long finishedBlockFP = currentFrame.fpOrig;
    final int parentOrd = currentFrame.ord - 1;
    currentFrame = stack[parentOrd];
    currentTransition = currentFrame.transition;
    assert currentFrame.lastSubFP == finishedBlockFP;
  }

  return currentFrame.next();
}
/**
 * Called when the previously returned term was an auto-prefix term: clears
 * {@code useAutoPrefixTerm}, marks the frame's term state as a real term
 * again, and then advances past every entry covered by that prefix term.
 *
 * <p>Two cases are handled, selected by {@code currentFrame.floorSuffixLeadEnd}:
 * an ordinary prefix (value -1), where entries are skipped while they still
 * share the prefix term's suffix bytes, and a floor'd prefix, where entries
 * are skipped until the lead suffix byte exceeds {@code floorSuffixLeadEnd}.
 * Whenever a frame is popped, {@code currentTransition} is re-synced to the
 * parent frame's transition.
 *
 * @return the result of the last {@code currentFrame.next()} call (or of
 *         {@link #popPushNext()} when a frame had to be popped)
 * @throws NoMoreTermsException if the root frame (ord 0) is exhausted
 */
private boolean skipPastLastAutoPrefixTerm() throws IOException {
assert currentFrame.isAutoPrefixTerm;
useAutoPrefixTerm = false;
currentFrame.termState.isRealTerm = true;
// If we last returned an auto-prefix term, we must now skip all
// actual terms sharing that prefix. At most, that skipping
// requires popping one frame, but it can also require simply
// scanning ahead within the current frame. This scanning will
// skip sub-blocks that contain many terms, which is why the
// optimization "works":
int floorSuffixLeadEnd = currentFrame.floorSuffixLeadEnd;
boolean isSubBlock;
if (floorSuffixLeadEnd == -1) {
// An ordinary prefix, e.g. foo*
int prefix = currentFrame.prefix;
int suffix = currentFrame.suffix;
if (suffix == 0) {
// Easy case: the prefix term's suffix is the empty string,
// meaning the prefix corresponds to all terms in the
// current block, so we just pop this entire block:
if (currentFrame.ord == 0) {
throw NoMoreTermsException.INSTANCE;
}
currentFrame = stack[currentFrame.ord-1];
currentTransition = currentFrame.transition;
return popPushNext();
} else {
// Just next() until we hit an entry that doesn't share this
// prefix. The first next should be a sub-block sharing the
// same prefix, because if there are enough terms matching a
// given prefix to warrant an auto-prefix term, then there
// must also be enough to make a sub-block (assuming
// minItemsInPrefix > minItemsInBlock):
scanPrefix:
while (true) {
if (currentFrame.nextEnt == currentFrame.entCount) {
if (currentFrame.isLastInFloor == false) {
// Still inside the same floor'd block: keep scanning in its next floor block
currentFrame.loadNextFloorBlock();
} else if (currentFrame.ord == 0) {
throw NoMoreTermsException.INSTANCE;
} else {
// Pop frame, which also means we've moved beyond this
// auto-prefix term:
currentFrame = stack[currentFrame.ord-1];
currentTransition = currentFrame.transition;
return popPushNext();
}
}
isSubBlock = currentFrame.next();
// Compare the new entry's leading suffix bytes against the prefix term's
// suffix (still held in term.bytes); the first mismatch ends the scan:
for(int i=0;i<suffix;i++) {
if (term.bytes[prefix+i] != currentFrame.suffixBytes[currentFrame.startBytePos+i]) {
break scanPrefix;
}
}
}
}
} else {
// Floor'd auto-prefix term; in this case we must skip all
// terms e.g. matching foo[a-m]*. We are currently "on" fooa,
// which the automaton accepted (fooa* through foom*), and
// floorSuffixLeadEnd is m, so we must now scan to foon:
int prefix = currentFrame.prefix;
int suffix = currentFrame.suffix;
if (currentFrame.floorSuffixLeadStart == -1) {
suffix++;
}
if (suffix == 0) {
// This means current frame is fooa*, so we have to first
// pop the current frame, then scan in parent frame:
if (currentFrame.ord == 0) {
throw NoMoreTermsException.INSTANCE;
}
currentFrame = stack[currentFrame.ord-1];
currentTransition = currentFrame.transition;
// Current (parent) frame is now foo*, so now we just scan
// until the lead suffix byte is > floorSuffixLeadEnd
//assert currentFrame.prefix == prefix-1;
//prefix = currentFrame.prefix;
// In case when we pop, and the parent block is not just prefix-1, e.g. in block 417* on
// its first term = floor prefix term 41[7-9], popping to block 4*:
prefix = currentFrame.prefix;
suffix = term.length - currentFrame.prefix;
} else {
// No need to pop; just scan in currentFrame:
}
// Now we scan until the lead suffix byte is > floorSuffixLeadEnd
scanFloor:
while (true) {
if (currentFrame.nextEnt == currentFrame.entCount) {
if (currentFrame.isLastInFloor == false) {
currentFrame.loadNextFloorBlock();
} else if (currentFrame.ord == 0) {
throw NoMoreTermsException.INSTANCE;
} else {
// Pop frame, which also means we've moved beyond this
// auto-prefix term:
currentFrame = stack[currentFrame.ord-1];
currentTransition = currentFrame.transition;
return popPushNext();
}
}
isSubBlock = currentFrame.next();
// Only the first suffix-1 bytes must match here; the final (lead) byte is
// range-checked against floorSuffixLeadEnd below:
for(int i=0;i<suffix-1;i++) {
if (term.bytes[prefix+i] != currentFrame.suffixBytes[currentFrame.startBytePos+i]) {
break scanFloor;
}
}
if (currentFrame.suffix >= suffix && (currentFrame.suffixBytes[currentFrame.startBytePos+suffix-1]&0xff) > floorSuffixLeadEnd) {
// Done scanning: we are now on the first term after all
// terms matched by this auto-prefix term
break;
}
}
}
return isSubBlock;
}
// Only used internally when there are no more terms in next():
/**
 * Internal signal raised when the enum has run out of terms; it is thrown
 * as the shared {@link #INSTANCE} and caught by {@code next()}, which then
 * returns {@code null}.
 */
private static final class NoMoreTermsException extends RuntimeException {

  /** The single shared instance; no other instance can be created. */
  public static final NoMoreTermsException INSTANCE = new NoMoreTermsException();

  private NoMoreTermsException() {
    super();
  }

  /** Overridden to do nothing, so no stack trace is captured for this exception. */
  @Override
  public Throwable fillInStackTrace() {
    return this;
  }
}
/**
 * Advances to the next matching term by delegating to {@code _next()}.
 *
 * @return the next term, or {@code null} once {@code _next()} signals
 *         exhaustion by throwing {@code NoMoreTermsException}
 */
@Override
public BytesRef next() throws IOException {
  BytesRef result;
  try {
    result = _next();
  } catch (NoMoreTermsException exhausted) {
    // Clear the frame so that an (illegal) further call fails fast with an NPE:
    currentFrame = null;
    result = null;
  }
  return result;
}
//if (DEBUG) {
// System.out.println("\nintEnum.next seg=" + fr.parent.segment);
// System.out.println(" frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " outputPrefix=" + currentFrame.outputPrefix + " trans: " + currentFrame.transition + " useAutoPrefix=" + useAutoPrefixTerm);
//}
private BytesRef _next() throws IOException {
boolean isSubBlock;
if (useAutoPrefixTerm) {
// If the current term was an auto-prefix term, we have to skip past it:
isSubBlock = skipPastLastAutoPrefixTerm();
assert useAutoPrefixTerm == false;
} else {
isSubBlock = popPushNext();
}
nextTerm:
while (true) {
assert currentFrame.transition == currentTransition;
boolean isSubBlock;
if (useAutoPrefixTerm) {
assert currentFrame.isAutoPrefixTerm;
useAutoPrefixTerm = false;
currentFrame.termState.isRealTerm = true;
//if (DEBUG) {
// System.out.println(" now scan beyond auto-prefix term=" + brToString(term) + " floorSuffixLeadEnd=" + Integer.toHexString(currentFrame.floorSuffixLeadEnd));
//}
// If we last returned an auto-prefix term, we must now skip all
// actual terms sharing that prefix. At most, that skipping
// requires popping one frame, but it can also require simply
// scanning ahead within the current frame. This scanning will
// skip sub-blocks that contain many terms, which is why the
// optimization "works":
int floorSuffixLeadEnd = currentFrame.floorSuffixLeadEnd;
if (floorSuffixLeadEnd == -1) {
// An ordinary prefix, e.g. foo*
int prefix = currentFrame.prefix;
int suffix = currentFrame.suffix;
//if (DEBUG) System.out.println(" prefix=" + prefix + " suffix=" + suffix);
if (suffix == 0) {
//if (DEBUG) System.out.println(" pop frame & nextTerm");
// Easy case: the prefix term's suffix is the empty string,
// meaning the prefix corresponds to all terms in the
// current block, so we just pop this entire block:
if (currentFrame.ord == 0) {
//if (DEBUG) System.out.println(" return null");
return null;
}
currentFrame = stack[currentFrame.ord-1];
continue nextTerm;
} else {
// Just next() until we hit an entry that doesn't share this
// prefix. The first next should be a sub-block sharing the
// same prefix, because if there are enough terms matching a
// given prefix to warrant an auto-prefix term, then there
// must also be enough to make a sub-block (assuming
// minItemsInPrefix > minItemsInBlock):
scanPrefix:
while (true) {
//if (DEBUG) System.out.println(" scan next");
if (currentFrame.nextEnt == currentFrame.entCount) {
if (currentFrame.isLastInFloor == false) {
currentFrame.loadNextFloorBlock();
} else if (currentFrame.ord == 0) {
//if (DEBUG) System.out.println(" return null0");
return null;
} else {
// Pop frame, which also means we've moved beyond this
// auto-prefix term:
//if (DEBUG) System.out.println(" pop; nextTerm");
currentFrame = stack[currentFrame.ord-1];
continue nextTerm;
}
}
isSubBlock = currentFrame.next();
//if (DEBUG) {
// BytesRef suffixBytes = new BytesRef(currentFrame.suffix);
// System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, suffixBytes.bytes, 0, currentFrame.suffix);
// suffixBytes.length = currentFrame.suffix;
// System.out.println(" currentFrame.suffix=" + brToString(suffixBytes));
//}
for(int i=0;i<suffix;i++) {
if (term.bytes[prefix+i] != currentFrame.suffixBytes[currentFrame.startBytePos+i]) {
//if (DEBUG) System.out.println(" done; now stop scan");
break scanPrefix;
}
}
}
}
} else {
// Floor'd auto-prefix term; in this case we must skip all
// terms e.g. matching foo[a-m]*. We are currently "on" fooa,
// which the automaton accepted (fooa* through foom*), and
// floorSuffixLeadEnd is m, so we must now scan to foon:
int prefix = currentFrame.prefix;
int suffix = currentFrame.suffix;
if (currentFrame.floorSuffixLeadStart == -1) {
suffix++;
}
//if (DEBUG) System.out.println(" prefix=" + prefix + " suffix=" + suffix);
if (suffix == 0) {
//if (DEBUG) System.out.println(" pop frame");
// This means current frame is fooa*, so we have to first
// pop the current frame, then scan in parent frame:
if (currentFrame.ord == 0) {
//if (DEBUG) System.out.println(" return null");
return null;
}
currentFrame = stack[currentFrame.ord-1];
// Current (parent) frame is now foo*, so now we just scan
// until the lead suffix byte is > floorSuffixLeadEnd
//assert currentFrame.prefix == prefix-1;
//prefix = currentFrame.prefix;
// In case when we pop, and the parent block is not just prefix-1, e.g. in block 417* on
// its first term = floor prefix term 41[7-9], popping to block 4*:
prefix = currentFrame.prefix;
suffix = term.length - currentFrame.prefix;
} else {
// No need to pop; just scan in currentFrame:
}
//if (DEBUG) System.out.println(" start scan: prefix=" + prefix + " suffix=" + suffix);
// Now we scan until the lead suffix byte is > floorSuffixLeadEnd
scanFloor:
while (true) {
//if (DEBUG) System.out.println(" scan next");
if (currentFrame.nextEnt == currentFrame.entCount) {
if (currentFrame.isLastInFloor == false) {
//if (DEBUG) System.out.println(" next floor block");
currentFrame.loadNextFloorBlock();
} else if (currentFrame.ord == 0) {
//if (DEBUG) System.out.println(" return null");
return null;
} else {
// Pop frame, which also means we've moved beyond this
// auto-prefix term:
currentFrame = stack[currentFrame.ord-1];
//if (DEBUG) System.out.println(" pop, now curFrame.prefix=" + currentFrame.prefix);
continue nextTerm;
}
}
isSubBlock = currentFrame.next();
//if (DEBUG) {
// BytesRef suffixBytes = new BytesRef(currentFrame.suffix);
// System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, suffixBytes.bytes, 0, currentFrame.suffix);
// suffixBytes.length = currentFrame.suffix;
// System.out.println(" currentFrame.suffix=" + brToString(suffixBytes));
//}
for(int i=0;i<suffix-1;i++) {
if (term.bytes[prefix+i] != currentFrame.suffixBytes[currentFrame.startBytePos+i]) {
//if (DEBUG) System.out.println(" done; now stop scan");
break scanFloor;
}
}
//if (DEBUG) {
// if (currentFrame.suffix >= suffix) {
// System.out.println(" cmp label=" + Integer.toHexString(currentFrame.suffixBytes[currentFrame.startBytePos+suffix-1]) + " vs " + floorSuffixLeadEnd);
// }
//}
if (currentFrame.suffix >= suffix && (currentFrame.suffixBytes[currentFrame.startBytePos+suffix-1]&0xff) > floorSuffixLeadEnd) {
// Done scanning: we are now on the first term after all
// terms matched by this auto-prefix term
//if (DEBUG) System.out.println(" done; now stop scan");
break;
}
}
}
} else {
// Pop finished frames
while (currentFrame.nextEnt == currentFrame.entCount) {
if (!currentFrame.isLastInFloor) {
//if (DEBUG) System.out.println(" next-floor-block: trans: " + currentFrame.transition);
// Advance to next floor block
currentFrame.loadNextFloorBlock();
//if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " outputPrefix=" + currentFrame.outputPrefix);
break;
} else {
//if (DEBUG) System.out.println(" pop frame");
if (currentFrame.ord == 0) {
//if (DEBUG) System.out.println(" return null");
return null;
}
final long lastFP = currentFrame.fpOrig;
currentFrame = stack[currentFrame.ord-1];
assert currentFrame.lastSubFP == lastFP;
//if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " outputPrefix=" + currentFrame.outputPrefix);
}
}
isSubBlock = currentFrame.next();
}
//if (DEBUG) {
// final BytesRef suffixRef = new BytesRef();
// suffixRef.bytes = currentFrame.suffixBytes;
// suffixRef.offset = currentFrame.startBytePos;
// suffixRef.length = currentFrame.suffix;
// System.out.println(" " + (isSubBlock ? "sub-block" : "term") + " " + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") suffix=" + brToString(suffixRef));
//}
int state;
int lastState;
// NOTE: suffix == 0 can only happen on the first term in a block, when
// there is a term exactly matching a prefix in the index. If we
// could somehow re-org the code so we only checked this case immediately
// after pushing a frame...
if (currentFrame.suffix != 0) {
// Advance where we are in the automaton to match what terms
// dict next'd to:
final int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff;
//if (DEBUG) {
// System.out.println(" move automaton to label=" + label + " vs curMax=" + currentFrame.curTransitionMax);
// }
while (label > currentFrame.curTransitionMax) {
if (currentFrame.transitionIndex >= currentFrame.transitionCount-1) {
// Pop this frame: no further matches are possible because
// we've moved beyond what the max transition will allow
//if (DEBUG) System.out.println(" break: trans");
if (currentFrame.ord == 0) {
//if (DEBUG) System.out.println(" return null");
return null;
}
currentFrame = stack[currentFrame.ord-1];
continue nextTerm;
}
currentFrame.transitionIndex++;
automaton.getNextTransition(currentFrame.transition);
currentFrame.curTransitionMax = currentFrame.transition.max;
//if (DEBUG) System.out.println(" next trans");
}
}
// First test the common suffix, if set:
if (commonSuffix != null && !isSubBlock) {
final int termLen = currentFrame.prefix + currentFrame.suffix;
if (termLen < commonSuffix.length) {
// No match
//if (DEBUG) System.out.println(" skip: common suffix length");
continue nextTerm;
}
final byte[] suffixBytes = currentFrame.suffixBytes;
final byte[] commonSuffixBytes = commonSuffix.bytes;
final int lenInPrefix = commonSuffix.length - currentFrame.suffix;
assert commonSuffix.offset == 0;
int suffixBytesPos;
int commonSuffixBytesPos = 0;
// This is the first byte of the suffix of the term we are now on:
final int label = suffixBytes[currentFrame.startBytePos] & 0xff;
if (lenInPrefix > 0) {
// A prefix of the common suffix overlaps with
// the suffix of the block prefix so we first
// test whether the prefix part matches:
final byte[] termBytes = term.bytes;
int termBytesPos = currentFrame.prefix - lenInPrefix;
assert termBytesPos >= 0;
final int termBytesPosEnd = currentFrame.prefix;
while (termBytesPos < termBytesPosEnd) {
if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
//if (DEBUG) System.out.println(" skip: common suffix mismatch (in prefix)");
if (label < currentTransition.min) {
// Common case: we are scanning terms in this block to "catch up" to
// current transition in the automaton:
int minTrans = currentTransition.min;
while (currentFrame.nextEnt < currentFrame.entCount) {
isSubBlock = currentFrame.next();
if ((suffixBytes[currentFrame.startBytePos] & 0xff) >= minTrans) {
continue nextTerm;
}
}
suffixBytesPos = currentFrame.startBytePos;
} else {
suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - commonSuffix.length;
// End of frame:
isSubBlock = popPushNext();
continue nextTerm;
}
// Test overlapping suffix part:
final int commonSuffixBytesPosEnd = commonSuffix.length;
while (commonSuffixBytesPos < commonSuffixBytesPosEnd) {
if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
//if (DEBUG) System.out.println(" skip: common suffix mismatch");
// Advance where we are in the automaton to match this label:
while (label > currentTransition.max) {
if (currentFrame.transitionIndex >= currentFrame.transitionCount-1) {
// Pop this frame: no further matches are possible because
// we've moved beyond what the max transition will allow
if (currentFrame.ord == 0) {
// Provoke NPE if we are (illegally!) called again:
currentFrame = null;
return null;
}
currentFrame = stack[currentFrame.ord-1];
currentTransition = currentFrame.transition;
isSubBlock = popPushNext();
continue nextTerm;
}
currentFrame.transitionIndex++;
automaton.getNextTransition(currentTransition);
if (label < currentTransition.min) {
int minTrans = currentTransition.min;
while (currentFrame.nextEnt < currentFrame.entCount) {
isSubBlock = currentFrame.next();
if ((suffixBytes[currentFrame.startBytePos] & 0xff) >= minTrans) {
continue nextTerm;
}
}
// End of frame:
isSubBlock = popPushNext();
continue nextTerm;
}
}
}
// TODO: maybe we should do the same linear test
// that AutomatonTermsEnum does, so that if we
// reach a part of the automaton where .* is
// "temporarily" accepted, we just blindly .next()
// until the limit
if (commonSuffix != null && !isSubBlock) {
final int termLen = currentFrame.prefix + currentFrame.suffix;
if (termLen < commonSuffix.length) {
// No match
isSubBlock = popPushNext();
continue nextTerm;
}
// TODO: for first iter of this loop can't we just use the current trans? we already advanced it and confirmed it matches lead
// byte of the suffix
final byte[] commonSuffixBytes = commonSuffix.bytes;
// See if the term suffix matches the automaton:
int state = currentFrame.state;
int lastState = currentFrame.lastState;
//if (DEBUG) {
// System.out.println(" a state=" + state + " curFrame.suffix.len=" + currentFrame.suffix + " curFrame.prefix=" + currentFrame.prefix);
// }
for (int idx=0;idx<currentFrame.suffix;idx++) {
lastState = state;
//if (DEBUG) System.out.println(" step label=" + (char) (currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff));
state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
if (state == -1) {
// No match
//System.out.println(" no s=" + state);
continue nextTerm;
} else {
//System.out.println(" c s=" + state);
final int lenInPrefix = commonSuffix.length - currentFrame.suffix;
assert commonSuffix.offset == 0;
int suffixBytesPos;
int commonSuffixBytesPos = 0;
if (lenInPrefix > 0) {
// A prefix of the common suffix overlaps with
// the suffix of the block prefix so we first
// test whether the prefix part matches:
final byte[] termBytes = term.bytes;
int termBytesPos = currentFrame.prefix - lenInPrefix;
assert termBytesPos >= 0;
final int termBytesPosEnd = currentFrame.prefix;
while (termBytesPos < termBytesPosEnd) {
if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
isSubBlock = popPushNext();
continue nextTerm;
}
}
suffixBytesPos = currentFrame.startBytePos;
} else {
suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - commonSuffix.length;
}
// Test overlapping suffix part:
final int commonSuffixBytesPosEnd = commonSuffix.length;
while (commonSuffixBytesPos < commonSuffixBytesPosEnd) {
if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
isSubBlock = popPushNext();
continue nextTerm;
}
}
}
}
//if (DEBUG) System.out.println(" after suffix: state=" + state + " lastState=" + lastState);
// TODO: maybe we should do the same linear test
// that AutomatonTermsEnum does, so that if we
// reach a part of the automaton where .* is
// "temporarily" accepted, we just blindly .next()
// until the limit
// See if the term suffix matches the automaton:
// We know from above that the first byte in our suffix (label) matches
// the current transition, so we step from the 2nd byte
// in the suffix:
lastState = currentFrame.state;
state = currentTransition.dest;
int end = currentFrame.startBytePos + currentFrame.suffix;
for (int idx=currentFrame.startBytePos+1;idx<end;idx++) {
lastState = state;
state = runAutomaton.step(state, suffixBytes[idx] & 0xff);
if (state == -1) {
// No match
isSubBlock = popPushNext();
continue nextTerm;
}
}
} else {
state = currentFrame.state;
lastState = currentFrame.lastState;
}
if (isSubBlock) {
// Match! Recurse:
//if (DEBUG) System.out.println(" sub-block match to state=" + state + "; recurse fp=" + currentFrame.lastSubFP);
copyTerm();
currentFrame = pushFrame(state);
currentTransition = currentFrame.transition;
currentFrame.lastState = lastState;
//xif (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
} else if (currentFrame.isAutoPrefixTerm) {
// We are on an auto-prefix term, meaning this term was compiled
// at indexing time, matching all terms sharing this prefix (or,
@ -671,7 +685,6 @@ final class IntersectTermsEnum extends TermsEnum {
if (currentFrame.floorSuffixLeadStart == -1) {
// Must also accept the empty string in this case
if (automaton.isAccept(state)) {
//if (DEBUG) System.out.println(" state is accept");
useAutoPrefixTerm = acceptsSuffixRange(state, 0, currentFrame.floorSuffixLeadEnd);
}
} else {
@ -679,12 +692,10 @@ final class IntersectTermsEnum extends TermsEnum {
}
}
//if (DEBUG) System.out.println(" useAutoPrefixTerm=" + useAutoPrefixTerm);
if (useAutoPrefixTerm) {
// All suffixes of this auto-prefix term are accepted by the automaton, so we can use it:
copyTerm();
currentFrame.termState.isRealTerm = false;
//if (DEBUG) System.out.println(" return auto prefix term: " + brToString(term));
return term;
} else {
// We move onto the next term
@ -694,31 +705,27 @@ final class IntersectTermsEnum extends TermsEnum {
}
} else if (runAutomaton.isAccept(state)) {
copyTerm();
//if (DEBUG) System.out.println(" term match to state=" + state);
assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0: "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString();
//if (DEBUG) System.out.println(" return term=" + brToString(term));
return term;
} else {
//System.out.println(" no s=" + state);
// This term is a prefix of a term accepted by the automaton, but is not itself accepted
}
isSubBlock = popPushNext();
}
}
private final Transition transition = new Transition();
private final Transition scratchTransition = new Transition();
/** Returns true if, from this state, the automaton accepts any suffix
* starting with a label between start and end, inclusive. We just
* look for a transition, matching this range, to the sink state. */
private boolean acceptsSuffixRange(int state, int start, int end) {
//xif (DEBUG) System.out.println(" acceptsSuffixRange state=" + state + " start=" + start + " end=" + end);
int count = automaton.initTransition(state, transition);
//xif (DEBUG) System.out.println(" transCount=" + count);
//xif (DEBUG) System.out.println(" trans=" + transition);
int count = automaton.initTransition(state, scratchTransition);
for(int i=0;i<count;i++) {
automaton.getNextTransition(transition);
if (start >= transition.min && end <= transition.max && transition.dest == sinkState) {
automaton.getNextTransition(scratchTransition);
if (start >= scratchTransition.min && end <= scratchTransition.max && scratchTransition.dest == sinkState) {
return true;
}
}
@ -740,7 +747,6 @@ final class IntersectTermsEnum extends TermsEnum {
}
private void copyTerm() {
//System.out.println(" copyTerm cur.prefix=" + currentFrame.prefix + " cur.suffix=" + currentFrame.suffix + " first=" + (char) currentFrame.suffixBytes[currentFrame.startBytePos]);
final int len = currentFrame.prefix + currentFrame.suffix;
if (term.bytes.length < len) {
term.bytes = ArrayUtil.grow(term.bytes, len);

View File

@ -73,8 +73,7 @@ final class IntersectTermsEnumFrame {
int numFollowFloorBlocks;
int nextFloorLabel;
Transition transition = new Transition();
int curTransitionMax;
final Transition transition = new Transition();
int transitionIndex;
int transitionCount;
@ -85,10 +84,12 @@ final class IntersectTermsEnumFrame {
final BlockTermState termState;
// metadata buffer, holding monotonic values
public long[] longs;
final long[] longs;
// metadata buffer, holding general values
public byte[] bytes;
ByteArrayDataInput bytesReader;
byte[] bytes = new byte[32];
final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
// Cumulative output so far
BytesRef outputPrefix;
@ -115,27 +116,22 @@ final class IntersectTermsEnumFrame {
this.termState = ite.fr.parent.postingsReader.newTermState();
this.termState.totalTermFreq = -1;
this.longs = new long[ite.fr.longsSize];
this.versionAutoPrefix = ite.fr.parent.version >= BlockTreeTermsReader.VERSION_AUTO_PREFIX_TERMS;
this.versionAutoPrefix = ite.fr.parent.anyAutoPrefixTerms;
}
/**
 * Advances this frame to a following floor block and loads it.
 *
 * <p>Reads floor entries from {@code floorDataReader} one at a time. Each
 * entry supplies a file-pointer delta (relative to {@code fpOrig}) and, when
 * more floor blocks remain, the leading label of the block after it. Entries
 * keep being consumed while blocks remain and the next block's leading label
 * is still {@code <= transition.min}. The block finally selected is then
 * read via {@code load(null)}.
 *
 * <p>Must only be called while {@code numFollowFloorBlocks > 0}.
 *
 * @throws IOException declared because {@code load} is declared to throw it
 */
void loadNextFloorBlock() throws IOException {
  assert numFollowFloorBlocks > 0: "nextFloorLabel=" + nextFloorLabel;

  boolean skipAgain;
  do {
    // The low bit of the encoded value is discarded; the rest is the delta from fpOrig.
    final long fpDelta = floorDataReader.readVLong() >>> 1;
    fp = fpOrig + fpDelta;
    numFollowFloorBlocks--;

    final boolean moreBlocks = numFollowFloorBlocks != 0;
    // 256 is larger than any unsigned byte label, so it acts as an
    // "no further floor block" sentinel once the last entry is consumed.
    nextFloorLabel = moreBlocks ? (floorDataReader.readByte() & 0xff) : 256;

    // Keep skipping only while another block exists and it starts at or before transition.min.
    skipAgain = moreBlocks && nextFloorLabel <= transition.min;
  } while (skipAgain);

  load(null);
}
@ -146,31 +142,26 @@ final class IntersectTermsEnumFrame {
if (transitionCount != 0) {
ite.automaton.initTransition(state, transition);
ite.automaton.getNextTransition(transition);
curTransitionMax = transition.max;
//if (DEBUG) System.out.println(" after setState state=" + state + " trans: " + transition + " transCount=" + transitionCount);
} else {
curTransitionMax = -1;
// Must set min to -1 so the "label < min" check never falsely triggers:
transition.min = -1;
// Must set max to -1 so we immediately realize we need to step to the next transition and then pop this frame:
transition.max = -1;
}
}
void load(BytesRef frameIndexData) throws IOException {
//xif (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state));
if (frameIndexData != null) {
// Floor frame
if (floorData.length < frameIndexData.length) {
this.floorData = new byte[ArrayUtil.oversize(frameIndexData.length, 1)];
}
System.arraycopy(frameIndexData.bytes, frameIndexData.offset, floorData, 0, frameIndexData.length);
floorDataReader.reset(floorData, 0, frameIndexData.length);
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
// Skip first long -- has redundant fp, hasTerms
// flag, isFloor flag
final long code = floorDataReader.readVLong();
if ((code & BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
// Floor frame
numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff;
//if (DEBUG) System.out.println(" numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel);
// If current state is not accept, and has transitions, we must process
// first block in case it has empty suffix:
@ -180,7 +171,6 @@ final class IntersectTermsEnumFrame {
while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min) {
fp = fpOrig + (floorDataReader.readVLong() >>> 1);
numFollowFloorBlocks--;
//xif (DEBUG) System.out.println(" skip floor block! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
if (numFollowFloorBlocks != 0) {
nextFloorLabel = floorDataReader.readByte() & 0xff;
} else {
@ -201,7 +191,6 @@ final class IntersectTermsEnumFrame {
code = ite.in.readVInt();
isLeafBlock = (code & 1) != 0;
int numBytes = code >>> 1;
//if (DEBUG) System.out.println(" entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes);
if (suffixBytes.length < numBytes) {
suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
@ -222,10 +211,7 @@ final class IntersectTermsEnumFrame {
// metadata
numBytes = ite.in.readVInt();
if (bytes == null) {
bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
bytesReader = new ByteArrayDataInput();
} else if (bytes.length < numBytes) {
if (bytes.length < numBytes) {
bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ite.in.readBytes(bytes, 0, numBytes);
@ -255,9 +241,6 @@ final class IntersectTermsEnumFrame {
}
public void nextLeaf() {
//if (DEBUG) {
// System.out.println(" frame.nextLeaf ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
//}
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
suffix = suffixesReader.readVInt();
@ -266,9 +249,6 @@ final class IntersectTermsEnumFrame {
}
public boolean nextNonLeaf() {
//if (DEBUG) {
// System.out.println(" frame.nextNonLeaf ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount + " versionAutoPrefix=" + versionAutoPrefix + " fp=" + suffixesReader.getPosition());
// }
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
final int code = suffixesReader.readVInt();
@ -292,7 +272,6 @@ final class IntersectTermsEnumFrame {
switch (code & 3) {
case 0:
// A normal term
//if (DEBUG) System.out.println(" ret: term");
isAutoPrefixTerm = false;
termState.termBlockOrd++;
return false;
@ -300,7 +279,6 @@ final class IntersectTermsEnumFrame {
// A sub-block; make sub-FP absolute:
isAutoPrefixTerm = false;
lastSubFP = fp - suffixesReader.readVLong();
//if (DEBUG) System.out.println(" ret: sub-block");
return true;
case 2:
// A normal prefix term, suffix leads with empty string
@ -309,9 +287,7 @@ final class IntersectTermsEnumFrame {
floorSuffixLeadEnd = suffixesReader.readByte() & 0xff;
if (floorSuffixLeadEnd == 0xff) {
floorSuffixLeadEnd = -1;
//System.out.println(" fill in -1");
}
//if (DEBUG) System.out.println(" ret: floor prefix term: start=-1 end=" + floorSuffixLeadEnd);
isAutoPrefixTerm = true;
return false;
case 3:
@ -322,14 +298,12 @@ final class IntersectTermsEnumFrame {
assert ord > 0;
IntersectTermsEnumFrame parent = ite.stack[ord-1];
floorSuffixLeadStart = parent.suffixBytes[parent.startBytePos+parent.suffix-1] & 0xff;
//if (DEBUG) System.out.println(" peek-parent: suffix=" + floorSuffixLeadStart);
} else {
floorSuffixLeadStart = suffixBytes[startBytePos+suffix-1] & 0xff;
}
termState.termBlockOrd++;
isAutoPrefixTerm = true;
floorSuffixLeadEnd = suffixesReader.readByte() & 0xff;
//if (DEBUG) System.out.println(" ret: floor prefix term start=" + floorSuffixLeadStart + " end=" + floorSuffixLeadEnd);
return false;
default:
// Silly javac:
@ -364,10 +338,8 @@ final class IntersectTermsEnumFrame {
// stats
termState.docFreq = statsReader.readVInt();
//if (DEBUG) System.out.println(" dF=" + state.docFreq);
if (ite.fr.fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
//if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
}
// metadata
for (int i = 0; i < ite.fr.longsSize; i++) {

View File

@ -87,10 +87,10 @@ final class SegmentTermsEnumFrame {
final BlockTermState state;
// metadata buffer, holding monotonic values
public long[] longs;
final long[] longs;
// metadata buffer, holding general values
public byte[] bytes;
ByteArrayDataInput bytesReader;
byte[] bytes = new byte[32];
final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
private final SegmentTermsEnum ste;
@ -100,7 +100,7 @@ final class SegmentTermsEnumFrame {
this.state = ste.fr.parent.postingsReader.newTermState();
this.state.totalTermFreq = -1;
this.longs = new long[ste.fr.longsSize];
this.versionAutoPrefix = ste.fr.parent.version >= BlockTreeTermsReader.VERSION_AUTO_PREFIX_TERMS;
this.versionAutoPrefix = ste.fr.parent.anyAutoPrefixTerms;
}
public void setFloorData(ByteArrayDataInput in, BytesRef source) {
@ -201,16 +201,12 @@ final class SegmentTermsEnumFrame {
// that's rare so won't help much
// metadata
numBytes = ste.in.readVInt();
if (bytes == null) {
bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
bytesReader = new ByteArrayDataInput();
} else if (bytes.length < numBytes) {
if (bytes.length < numBytes) {
bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ste.in.readBytes(bytes, 0, numBytes);
bytesReader.reset(bytes, 0, numBytes);
// Sub-blocks of a single floor block are always
// written one after another -- tail recurse:
fpEnd = ste.in.getFilePointer();
@ -308,14 +304,10 @@ final class SegmentTermsEnumFrame {
final int code = suffixesReader.readVInt();
if (versionAutoPrefix == false) {
suffix = code >>> 1;
} else {
suffix = code >>> 2;
}
startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix);
ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
if (versionAutoPrefix == false) {
startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix);
ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
if ((code & 1) == 0) {
// A normal term
ste.termExists = true;
@ -333,6 +325,11 @@ final class SegmentTermsEnumFrame {
return true;
}
} else {
suffix = code >>> 2;
startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix);
ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
switch(code & 3) {
case 0: