mirror of https://github.com/apache/lucene.git
LUCENE-5675: add testRandom; sometimes fails
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5675@1595229 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 83332c046b
commit fa51d5972a
@@ -45,6 +45,8 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
  // Lazy init:
  IndexInput in;

  private static boolean DEBUG = true;

  private IDVersionSegmentTermsEnumFrame[] stack;
  private final IDVersionSegmentTermsEnumFrame staticFrame;
  IDVersionSegmentTermsEnumFrame currentFrame;

@@ -214,13 +216,27 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
    return seekExact(target, 0);
  }

  // for debugging
  @SuppressWarnings("unused")
  private String brToString(BytesRef b) {
    try {
      return b.utf8ToString() + " " + b;
    } catch (Throwable t) {
      // If BytesRef isn't actually UTF8, or it's eg a
      // prefix of UTF8 that ends mid-unicode-char, we
      // fallback to hex:
      return b.toString();
    }
  }

  /** Returns false if the term does not exist, or it exists but its version is < minIDVersion. */
  public boolean seekExact(final BytesRef target, long minIDVersion) throws IOException {

    if (fr.index == null) {
      throw new IllegalStateException("terms index was not loaded");
    }
    System.out.println("seekExact target=" + target + " minIDVersion=" + minIDVersion);

    // nocommit would be nice if somehow on doing deletes we didn't have to double-lookup again...

    if (term.bytes.length <= target.length) {
      term.bytes = ArrayUtil.grow(term.bytes, 1+target.length);

@@ -228,10 +244,10 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {

    assert clearEOF();

    // if (DEBUG) {
    //   System.out.println("\nBTTR.seekExact seg=" + segment + " target=" + fieldInfo.name + ":" + brToString(target) + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix);
    //   printSeekState();
    // }
    if (DEBUG) {
      System.out.println("\nBTTR.seekExact seg=" + fr.parent.segment + " target=" + fr.fieldInfo.name + ":" + brToString(target) + " minIDVersion=" + minIDVersion + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix);
      printSeekState(System.out);
    }

    FST.Arc<Pair<BytesRef,Long>> arc;
    int targetUpto;

@@ -239,6 +255,8 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {

    targetBeforeCurrentLength = currentFrame.ord;

    // nocommit we could stop earlier w/ the version check, every time we traverse an index arc we can check?

    if (currentFrame != staticFrame) {

      // We are already seek'd; find the common

@@ -248,9 +266,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
      // seeks to foobaz, we can re-use the seek state
      // for the first 5 bytes.

      // if (DEBUG) {
      //   System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix);
      // }
      if (DEBUG) {
        System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix);
      }

      arc = arcs[0];
      assert arc.isFinal();

@@ -258,7 +276,7 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
      targetUpto = 0;

      IDVersionSegmentTermsEnumFrame lastFrame = stack[0];
      assert validIndexPrefix <= term.length;
      assert validIndexPrefix <= term.length: "validIndexPrefix=" + validIndexPrefix + " term.length=" + term.length + " seg=" + fr.parent.segment;

      final int targetLimit = Math.min(target.length, validIndexPrefix);

@@ -270,9 +288,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
      // First compare up to valid seek frames:
      while (targetUpto < targetLimit) {
        cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
        // if (DEBUG) {
        //   System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output);
        // }
        if (DEBUG) {
          System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output);
        }
        if (cmp != 0) {
          break;
        }

@@ -300,9 +318,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
      final int targetLimit2 = Math.min(target.length, term.length);
      while (targetUpto < targetLimit2) {
        cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
        // if (DEBUG) {
        //   System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
        // }
        if (DEBUG) {
          System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
        }
        if (cmp != 0) {
          break;
        }

@@ -319,9 +337,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
        // Common case: target term is after current
        // term, ie, app is seeking multiple terms
        // in sorted order
        // if (DEBUG) {
        //   System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); frame.ord=" + lastFrame.ord);
        // }
        if (DEBUG) {
          System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); frame.ord=" + lastFrame.ord + "; targetUpto=" + targetUpto);
        }
        currentFrame = lastFrame;

      } else if (cmp > 0) {

@@ -330,23 +348,41 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
        // keep the currentFrame but we must rewind it
        // (so we scan from the start)
        targetBeforeCurrentLength = 0;
        // if (DEBUG) {
        //   System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord);
        // }
        if (DEBUG) {
          System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord);
        }
        currentFrame = lastFrame;
        currentFrame.rewind();
      } else {
        // Target is exactly the same as current term
        assert term.length == target.length;
        if (termExists) {
          // if (DEBUG) {
          //   System.out.println(" target is same as current; return true");
          // }

          if (currentFrame.maxIDVersion < minIDVersion) {
            // The max version for all terms in this block is lower than the minVersion
            if (DEBUG) {
              System.out.println(" target is same as current maxIDVersion=" + currentFrame.maxIDVersion + " is < minIDVersion=" + minIDVersion + "; return false");
            }
            return false;
          }

          currentFrame.decodeMetaData();
          if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) {
            // The max version for this term is lower than the minVersion
            if (DEBUG) {
              System.out.println(" target is same as current but version=" + ((IDVersionTermState) currentFrame.state).idVersion + " is < minIDVersion=" + minIDVersion + "; return false");
            }
            return false;
          }

          if (DEBUG) {
            System.out.println(" target is same as current; return true");
          }
          return true;
        } else {
          // if (DEBUG) {
          //   System.out.println(" target is same as current but term doesn't exist");
          // }
          if (DEBUG) {
            System.out.println(" target is same as current but term doesn't exist");
          }
        }
        //validIndexPrefix = currentFrame.depth;
        //term.length = target.length;

@@ -357,15 +393,15 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {

      targetBeforeCurrentLength = -1;
      arc = fr.index.getFirstArc(arcs[0]);
      System.out.println("first arc=" + arc);
      //System.out.println("first arc=" + arc);

      // Empty string prefix must have an output (block) in the index!
      assert arc.isFinal();
      assert arc.output != null;

      // if (DEBUG) {
      //   System.out.println(" no seek state; push root frame");
      // }
      if (DEBUG) {
        System.out.println(" no seek state; push root frame");
      }

      output = arc.output;

@@ -376,9 +412,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
      currentFrame = pushFrame(arc, VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
    }

    // if (DEBUG) {
    //   System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength);
    // }
    if (DEBUG) {
      System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength);
    }

    while (targetUpto < target.length) {

@@ -389,9 +425,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
      if (nextArc == null) {

        // Index is exhausted
        // if (DEBUG) {
        //   System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
        // }
        if (DEBUG) {
          System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + Integer.toHexString(targetLabel));
        }

        validIndexPrefix = currentFrame.prefix;
        //validIndexPrefix = targetUpto;

@@ -402,15 +438,21 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
          termExists = false;
          term.bytes[targetUpto] = (byte) targetLabel;
          term.length = 1+targetUpto;
          // if (DEBUG) {
          //   System.out.println(" FAST NOT_FOUND term=" + brToString(term));
          // }
          if (DEBUG) {
            System.out.println(" FAST NOT_FOUND term=" + brToString(term));
          }
          return false;
        }
        System.out.println(" check output=" +((output.output2)));
        //System.out.println(" check maxVersion=" + currentFrame.maxIDVersion + " vs " + minIDVersion);

        if (currentFrame.maxIDVersion < minIDVersion) {
          // The max version for all terms in this block is lower than the minVersion
          //termExists = false;
          //term.bytes[targetUpto] = (byte) targetLabel;
          //term.length = 1+targetUpto;
          if (DEBUG) {
            System.out.println(" FAST version NOT_FOUND term=" + brToString(term) + " currentFrame.maxIDVersion=" + currentFrame.maxIDVersion + " validIndexPrefix=" + validIndexPrefix);
          }
          return false;
        }

@@ -418,20 +460,24 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {

        final SeekStatus result = currentFrame.scanToTerm(target, true);
        if (result == SeekStatus.FOUND) {
          // if (DEBUG) {
          //   System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term);
          // }

          currentFrame.decodeMetaData();
          if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) {
            // The max version for this term is lower than the minVersion
            if (DEBUG) {
              System.out.println(" return NOT_FOUND: idVersion=" + ((IDVersionTermState) currentFrame.state).idVersion + " vs minIDVersion=" + minIDVersion);
            }
            return false;
          }

          if (DEBUG) {
            System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term);
          }

          return true;
        } else {
          // if (DEBUG) {
          //   System.out.println(" got " + result + "; return NOT_FOUND term=" + brToString(term));
          // }
          if (DEBUG) {
            System.out.println(" got " + result + "; return NOT_FOUND term=" + brToString(term));
          }
          return false;
        }
      } else {

@@ -444,15 +490,15 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
          output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
        }

        // if (DEBUG) {
        //   System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
        // }
        if (DEBUG) {
          System.out.println(" index: follow label=" + Integer.toHexString((target.bytes[target.offset + targetUpto]&0xff)) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
        }
        targetUpto++;

        if (arc.isFinal()) {
          //if (DEBUG) System.out.println(" arc is final!");
          if (DEBUG) System.out.println(" arc is final!");
          currentFrame = pushFrame(arc, VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
          //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
          if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
        }
      }
    }

@@ -466,9 +512,16 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
    if (!currentFrame.hasTerms) {
      termExists = false;
      term.length = targetUpto;
      // if (DEBUG) {
      //   System.out.println(" FAST NOT_FOUND term=" + brToString(term));
      // }
      if (DEBUG) {
        System.out.println(" FAST NOT_FOUND term=" + brToString(term));
      }
      return false;
    }

    if (currentFrame.maxIDVersion < minIDVersion) {
      // The max version for all terms in this block is lower than the minVersion
      termExists = false;
      term.length = targetUpto;
      return false;
    }

@@ -476,14 +529,19 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {

    final SeekStatus result = currentFrame.scanToTerm(target, true);
    if (result == SeekStatus.FOUND) {
      // if (DEBUG) {
      //   System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term);
      // }
      if (DEBUG) {
        System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term);
      }
      currentFrame.decodeMetaData();
      if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) {
        // The max version for this term is lower than the minVersion
        return false;
      }
      return true;
    } else {
      // if (DEBUG) {
      //   System.out.println(" got result " + result + "; return NOT_FOUND term=" + term.utf8ToString());
      // }
      if (DEBUG) {
        System.out.println(" got result " + result + "; return NOT_FOUND term=" + term.utf8ToString());
      }

      return false;
    }

@@ -969,4 +1027,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
  public long ord() {
    throw new UnsupportedOperationException();
  }

  @Override
  public String toString() {
    return "IDVersionSegmentTermsEnum(seg=" + fr.parent.segment + ")";
  }
}
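As an aside for readers of this diff: a minimal sketch of the calling contract of the new two-argument seekExact, assuming the caller already holds an IDVersionSegmentTermsEnum over the "id" field. Everything here except seekExact itself is illustrative, not part of the commit.

    // Illustrative only: seekExact(target, minIDVersion) returns true only
    // when the id exists with version >= the floor passed in; false means
    // "absent or too old", letting the caller skip a costlier update path.
    import java.io.IOException;
    import org.apache.lucene.util.BytesRef;

    class VersionedIdCheck {
      private final IDVersionSegmentTermsEnum termsEnum; // enum over the "id" field

      VersionedIdCheck(IDVersionSegmentTermsEnum termsEnum) {
        this.termsEnum = termsEnum;
      }

      boolean existsWithVersionAtLeast(String id, long minVersion) throws IOException {
        return termsEnum.seekExact(new BytesRef(id), minVersion);
      }
    }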
@@ -36,6 +36,7 @@ final class IDVersionSegmentTermsEnumFrame {
  boolean hasTermsOrig;
  boolean isFloor;

  /** Highest version of any term in this block. */
  long maxIDVersion;

  FST.Arc<Pair<BytesRef,Long>> arc;
@@ -27,7 +27,12 @@ class SingleDocsAndPositionsEnum extends DocsAndPositionsEnum {
  private int singleDocID;
  private Bits liveDocs;
  private long version;
  private final BytesRef payload = new BytesRef(8);
  private final BytesRef payload;

  public SingleDocsAndPositionsEnum() {
    payload = new BytesRef(8);
    payload.length = 8;
  }

  /** For reuse */
  public void reset(int singleDocID, long version, Bits liveDocs) {

@@ -35,7 +40,6 @@ class SingleDocsAndPositionsEnum extends DocsAndPositionsEnum {
    this.liveDocs = liveDocs;
    this.singleDocID = singleDocID;
    this.version = version;
    pos = -1;
  }

  @Override

@@ -45,7 +49,7 @@ class SingleDocsAndPositionsEnum extends DocsAndPositionsEnum {
    } else {
      doc = NO_MORE_DOCS;
    }
    pos = 0;
    pos = -1;

    return doc;
  }

@@ -59,6 +63,7 @@ class SingleDocsAndPositionsEnum extends DocsAndPositionsEnum {
  public int advance(int target) {
    if (doc == -1 && target <= singleDocID && (liveDocs == null || liveDocs.get(singleDocID))) {
      doc = singleDocID;
      pos = -1;
    } else {
      doc = NO_MORE_DOCS;
    }
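With these changes the enum starts each accepted doc at pos = -1, so a position must be consumed before the payload (which carries the 8-byte version) can be read. A hedged sketch of that consumption pattern follows; the big-endian decode is an assumption made for illustration, since IDVersionPostingsFormat.longToBytes defines the real encoding.

    // Illustrative only: pull the version payload off a positions enum that
    // holds exactly one position per document, as SingleDocsAndPositionsEnum does.
    import java.io.IOException;
    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.DocsEnum;
    import org.apache.lucene.util.BytesRef;

    final class VersionPayloadReader {
      static long readVersion(DocsAndPositionsEnum postings) throws IOException {
        int doc = postings.nextDoc();
        if (doc == DocsEnum.NO_MORE_DOCS) {
          return -1;                  // id not present (or deleted)
        }
        postings.nextPosition();      // must advance to the single position first
        BytesRef payload = postings.getPayload();
        long version = 0;
        for (int i = 0; i < 8; i++) { // assumed big-endian layout of the 8-byte payload
          version = (version << 8) | (payload.bytes[payload.offset + i] & 0xFF);
        }
        return version;
      }
    }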
@@ -161,7 +161,6 @@ final class VersionBlockTreeTermsReader extends FieldsProducer {
      in.readBytes(code.bytes, 0, numBytes);
      code.length = numBytes;
      final long version = in.readVLong();
      System.out.println(" read code=" +code + " version=" + version);
      final Pair<BytesRef,Long> rootCode = VersionBlockTreeTermsWriter.FST_OUTPUTS.newPair(code, version);
      final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
      assert fieldInfo != null: "field=" + field;
@@ -194,10 +194,10 @@ import org.apache.lucene.util.packed.PackedInts;
// nocommit fix jdocs
final class VersionBlockTreeTermsWriter extends FieldsConsumer {

  public static final PairOutputs<BytesRef,Long> FST_OUTPUTS = new PairOutputs<>(ByteSequenceOutputs.getSingleton(),
  static final PairOutputs<BytesRef,Long> FST_OUTPUTS = new PairOutputs<>(ByteSequenceOutputs.getSingleton(),
                                                                          PositiveIntOutputs.getSingleton());

  public static final Pair<BytesRef,Long> NO_OUTPUT = FST_OUTPUTS.getNoOutput();
  static final Pair<BytesRef,Long> NO_OUTPUT = FST_OUTPUTS.getNoOutput();

  /** Suggested default value for the {@code
   *  minItemsInBlock} parameter to {@link

@@ -284,7 +284,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
  }

  private final List<FieldMetaData> fields = new ArrayList<>();
  // private final String segment;
  private final String segment;

  /** Create a new writer.  The number of items (terms or
   *  sub-blocks) per block will aim to be between

@@ -297,6 +297,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
                                   int maxItemsInBlock)
    throws IOException
  {
    System.out.println("VBTTW minItemsInBlock=" + minItemsInBlock + " maxItemsInBlock=" + maxItemsInBlock);
    if (minItemsInBlock <= 1) {
      throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock);
    }

@@ -329,7 +330,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
    writeIndexHeader(indexOut);

    this.postingsWriter = postingsWriter;
    // segment = state.segmentName;
    segment = state.segmentInfo.name;

    // System.out.println("BTW.init seg=" + state.segmentName);

@@ -625,6 +626,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
    // following floor blocks:

    void writeBlocks(IntsRef prevTerm, int prefixLength, int count) throws IOException {
      // nocommit why can't we do floor blocks for root frame?
      if (prefixLength == 0 || count <= maxItemsInBlock) {
        // Easy case: not floor block.  Eg, prefix is "foo",
        // and we found 30 terms/sub-blocks starting w/ that

@@ -644,13 +646,13 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
      // in each block, to make floor blocks authoritative

      //if (DEBUG) {
      //  final BytesRef prefix = new BytesRef(prefixLength);
      //  for(int m=0;m<prefixLength;m++) {
      //    prefix.bytes[m] = (byte) prevTerm.ints[m];
      //  }
      //  prefix.length = prefixLength;
      //  //System.out.println("\nWBS count=" + count + " prefix=" + prefix.utf8ToString() + " " + prefix);
      //  System.out.println("writeBlocks: prefix=" + prefix + " " + prefix + " count=" + count + " pending.size()=" + pending.size());
      final BytesRef prefix = new BytesRef(prefixLength);
      for(int m=0;m<prefixLength;m++) {
        prefix.bytes[m] = (byte) prevTerm.ints[m];
      }
      prefix.length = prefixLength;
      //System.out.println("\nWBS count=" + count + " prefix=" + prefix.utf8ToString() + " " + prefix);
      System.out.println("writeBlocks: prefix=" + toString(prefix) + " " + prefix + " count=" + count + " pending.size()=" + pending.size());
      //}
      //System.out.println("\nwbs count=" + count);

@@ -873,7 +875,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
      out.writeVInt((length<<1)|(isLastInFloor ? 1:0));

      // if (DEBUG) {
      //   System.out.println(" writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + toString(prefix) + " entCount=" + length + " startFP=" + startFP + " futureTermCount=" + futureTermCount + (isFloor ? (" floorLeadByte=" + Integer.toHexString(floorLeadByte&0xff)) : "") + " isLastInFloor=" + isLastInFloor);
      System.out.println(" writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + toString(prefix) + " entCount=" + length + " startFP=" + startFP + " futureTermCount=" + futureTermCount + (isFloor ? (" floorLeadByte=" + Integer.toHexString(floorLeadByte&0xff)) : "") + " isLastInFloor=" + isLastInFloor);
      // }

      // 1st pass: pack term suffix bytes into byte[] blob

@@ -909,6 +911,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
      boolean absolute = true;
      long maxVersionInBlock = -1;

      int countx = 0;
      if (isLeafBlock) {
        subIndices = null;
        for (PendingEntry ent : slice) {

@@ -918,10 +921,10 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
          maxVersionInBlock = Math.max(maxVersionInBlock, ((IDVersionTermState) state).idVersion);
          final int suffix = term.term.length - prefixLength;
          // if (DEBUG) {
          //   BytesRef suffixBytes = new BytesRef(suffix);
          //   System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
          //   suffixBytes.length = suffix;
          //   System.out.println(" write term suffix=" + suffixBytes);
          BytesRef suffixBytes = new BytesRef(suffix);
          System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
          suffixBytes.length = suffix;
          System.out.println(" " + (countx++) + ": write term suffix=" + toString(suffixBytes));
          // }
          // For leaf block we write suffix straight
          suffixWriter.writeVInt(suffix);

@@ -955,10 +958,10 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
            maxVersionInBlock = Math.max(maxVersionInBlock, ((IDVersionTermState) state).idVersion);
            final int suffix = term.term.length - prefixLength;
            // if (DEBUG) {
            //   BytesRef suffixBytes = new BytesRef(suffix);
            //   System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
            //   suffixBytes.length = suffix;
            //   System.out.println(" write term suffix=" + suffixBytes);
            BytesRef suffixBytes = new BytesRef(suffix);
            System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
            suffixBytes.length = suffix;
            System.out.println(" " + (countx++) + ": write term suffix=" + toString(suffixBytes));
            // }
            // For non-leaf block we borrow 1 bit to record
            // if entry is term or sub-block

@@ -1005,10 +1008,10 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
            assert block.fp < startFP;

            // if (DEBUG) {
            //   BytesRef suffixBytes = new BytesRef(suffix);
            //   System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
            //   suffixBytes.length = suffix;
            //   System.out.println(" write sub-block suffix=" + toString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
            BytesRef suffixBytes = new BytesRef(suffix);
            System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
            suffixBytes.length = suffix;
            System.out.println(" " + (countx++) + ": write sub-block suffix=" + toString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
            // }

            suffixWriter.writeVLong(startFP - block.fp);
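The writer's index FST now carries a (bytes, version) pair on each arc, via the PairOutputs declared at the top of this file. A small standalone sketch of how such pair outputs compose (the values below are made up):

    // Illustrative only: mirrors VersionBlockTreeTermsWriter.FST_OUTPUTS,
    // pairing a block's byte prefix with a version number.
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.fst.ByteSequenceOutputs;
    import org.apache.lucene.util.fst.PairOutputs;
    import org.apache.lucene.util.fst.PairOutputs.Pair;
    import org.apache.lucene.util.fst.PositiveIntOutputs;

    final class VersionOutputsDemo {
      static final PairOutputs<BytesRef,Long> OUTPUTS =
          new PairOutputs<>(ByteSequenceOutputs.getSingleton(),
                            PositiveIntOutputs.getSingleton());

      public static void main(String[] args) {
        Pair<BytesRef,Long> a = OUTPUTS.newPair(new BytesRef("blockA"), 17L);
        Pair<BytesRef,Long> b = OUTPUTS.newPair(new BytesRef(""), 42L);
        // add() applies each component's add: the byte outputs concatenate
        // and the long outputs sum.
        System.out.println(OUTPUTS.add(a, b));
      }
    }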
@@ -17,16 +17,32 @@ package org.apache.lucene.codecs.idversion;
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.BasePostingsFormatTestCase;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PerThreadPKLookup;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;

@@ -47,26 +63,219 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
    Document doc = new Document();
    doc.add(makeIDField("id0", 100));
    w.addDocument(doc);
    doc = new Document();
    doc.add(makeIDField("id1", 110));
    w.addDocument(doc);
    IndexReader r = w.getReader();
    IDVersionSegmentTermsEnum termsEnum = (IDVersionSegmentTermsEnum) r.leaves().get(0).reader().fields().terms("id").iterator(null);
    assertTrue(termsEnum.seekExact(new BytesRef("id0"), 50));
    assertTrue(termsEnum.seekExact(new BytesRef("id0"), 100));
    assertFalse(termsEnum.seekExact(new BytesRef("id0"), 101));
    assertTrue(termsEnum.seekExact(new BytesRef("id1"), 50));
    assertTrue(termsEnum.seekExact(new BytesRef("id1"), 110));
    assertFalse(termsEnum.seekExact(new BytesRef("id1"), 111));
    r.close();

    w.close();
    dir.close();
  }

  // nocommit need testRandom
  // nocommit vary the style of id; sometimes fixed-length ids, timestamp, zero filled, sequential, random, etc.

  public void testRandom() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
    // nocommit randomize the block sizes:
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat()));
    // nocommit put back
    //RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    IndexWriter w = new IndexWriter(dir, iwc);
    int numDocs = atLeast(1000);
    Map<String,Long> idValues = new HashMap<String,Long>();
    int docUpto = 0;
    if (VERBOSE) {
      System.out.println("TEST: numDocs=" + numDocs);
    }

    long version = 0;
    while (docUpto < numDocs) {
      // nocommit add deletes in
      // nocommit randomRealisticUnicode / full binary
      String idValue = TestUtil.randomSimpleString(random());
      if (idValues.containsKey(idValue)) {
        continue;
      }
      //long version = random().nextLong() & 0x7fffffffffffffffL;
      version++;
      idValues.put(idValue, version);
      if (VERBOSE) {
        System.out.println(" " + idValue + " -> " + version);
      }
      Document doc = new Document();
      doc.add(makeIDField(idValue, version));
      w.addDocument(doc);
      docUpto++;
    }

    //IndexReader r = w.getReader();
    IndexReader r = DirectoryReader.open(w, true);
    PerThreadVersionPKLookup lookup = new PerThreadVersionPKLookup(r, "id");

    List<Map.Entry<String,Long>> idValuesList = new ArrayList<>(idValues.entrySet());
    int iters = numDocs * 5;
    for(int iter=0;iter<iters;iter++) {
      String idValue;

      if (random().nextBoolean()) {
        idValue = idValuesList.get(random().nextInt(numDocs)).getKey();
      } else {
        idValue = TestUtil.randomSimpleString(random());
      }

      BytesRef idValueBytes = new BytesRef(idValue);

      Long expectedVersion = idValues.get(idValue);

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter + " id=" + idValue + " expectedVersion=" + expectedVersion);
      }

      if (expectedVersion == null) {
        assertEquals(-1, lookup.lookup(idValueBytes));
      } else {
        if (random().nextBoolean()) {
          if (VERBOSE) {
            System.out.println(" lookup exact version (should be found)");
          }
          assertTrue(lookup.lookup(idValueBytes, expectedVersion.longValue()) != -1);
        } else {
          if (VERBOSE) {
            System.out.println(" lookup version+1 (should not be found)");
          }
          assertEquals(-1, lookup.lookup(idValueBytes, expectedVersion.longValue()+1));
        }
      }
    }

    r.close();
    w.close();
    dir.close();
  }

  private static class PerThreadVersionPKLookup extends PerThreadPKLookup {
    public PerThreadVersionPKLookup(IndexReader r, String field) throws IOException {
      super(r, field);
    }

    /** Returns docID if found, else -1. */
    public int lookup(BytesRef id, long version) throws IOException {
      for(int seg=0;seg<numSegs;seg++) {
        if (((IDVersionSegmentTermsEnum) termsEnums[seg]).seekExact(id, version)) {
          if (VERBOSE) {
            System.out.println(" found in seg=" + termsEnums[seg]);
          }
          docsEnums[seg] = termsEnums[seg].docs(liveDocs[seg], docsEnums[seg], 0);
          int docID = docsEnums[seg].nextDoc();
          if (docID != DocsEnum.NO_MORE_DOCS) {
            return docBases[seg] + docID;
          }
          assert hasDeletions;
        }
      }

      return -1;
    }
  }

  /** Produces a single token from the provided value, with the provided payload. */
  private static class StringAndPayloadField extends Field {

    public static final FieldType TYPE = new FieldType();

    static {
      TYPE.setIndexed(true);
      TYPE.setOmitNorms(true);
      TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
      TYPE.setTokenized(true);
      TYPE.freeze();
    }

    private final BytesRef payload;

    public StringAndPayloadField(String name, String value, BytesRef payload) {
      super(name, value, TYPE);
      this.payload = payload;
    }

    @Override
    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) throws IOException {
      SingleTokenWithPayloadTokenStream ts;
      if (reuse instanceof SingleTokenWithPayloadTokenStream) {
        ts = (SingleTokenWithPayloadTokenStream) reuse;
      } else {
        ts = new SingleTokenWithPayloadTokenStream();
      }
      ts.setValue((String) fieldsData, payload);
      return ts;
    }
  }

  private static final class SingleTokenWithPayloadTokenStream extends TokenStream {

    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
    private final PayloadAttribute payloadAttribute = addAttribute(PayloadAttribute.class);
    private boolean used = false;
    private String value = null;
    private BytesRef payload;

    /** Creates a new TokenStream that returns a String+payload as single token.
     * <p>Warning: Does not initialize the value, you must call
     * {@link #setValue(String, BytesRef)} afterwards!
     */
    SingleTokenWithPayloadTokenStream() {
    }

    /** Sets the string value. */
    void setValue(String value, BytesRef payload) {
      this.value = value;
      this.payload = payload;
    }

    @Override
    public boolean incrementToken() {
      if (used) {
        return false;
      }
      clearAttributes();
      termAttribute.append(value);
      payloadAttribute.setPayload(payload);
      used = true;
      return true;
    }

    @Override
    public void reset() {
      used = false;
    }

    @Override
    public void close() {
      value = null;
      payload = null;
    }
  }

  private static Field makeIDField(String id, long version) {
    Field field = newTextField("id", "", Field.Store.NO);
    Token token = new Token(id, 0, id.length());
    BytesRef payload = new BytesRef(8);
    payload.length = 8;
    IDVersionPostingsFormat.longToBytes(100, payload);
    IDVersionPostingsFormat.longToBytes(version, payload);
    return new StringAndPayloadField("id", id, payload);

    /*
    Field field = newTextField("id", "", Field.Store.NO);
    Token token = new Token(id, 0, id.length());
    token.setPayload(payload);
    field.setTokenStream(new CannedTokenStream(token));
    return field;
    */
  }
}
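A hedged sketch of indexing one versioned id with the helpers this test defines, assuming StringAndPayloadField and IDVersionPostingsFormat.longToBytes are visible from the call site (the writer and version value are made up):

    // Illustrative only: index a single primary key with an attached
    // version, following the pattern of makeIDField above.
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.util.BytesRef;

    final class VersionedIndexingSketch {
      static void addVersionedId(IndexWriter w, String id, long version) throws Exception {
        BytesRef payload = new BytesRef(8);
        payload.length = 8;                       // fixed 8-byte version payload
        IDVersionPostingsFormat.longToBytes(version, payload);
        Document doc = new Document();
        doc.add(new StringAndPayloadField("id", id, payload));
        w.addDocument(doc);
      }
    }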
@@ -46,6 +46,7 @@ import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.NoOutputs;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.PackedInts;

@@ -189,6 +190,10 @@ import org.apache.lucene.util.packed.PackedInts;
 */
public final class BlockTreeTermsWriter extends FieldsConsumer {

  static final Outputs<BytesRef> FST_OUTPUTS = ByteSequenceOutputs.getSingleton();

  static final BytesRef NO_OUTPUT = FST_OUTPUTS.getNoOutput();

  /** Suggested default value for the {@code
   *  minItemsInBlock} parameter to {@link
   *  #BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */
@@ -40,9 +40,6 @@ import org.apache.lucene.util.fst.Util;
/** Iterates through terms in this field */
final class SegmentTermsEnum extends TermsEnum {

  final static Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
  final static BytesRef NO_OUTPUT = fstOutputs.getNoOutput();

  // Lazy init:
  IndexInput in;

@@ -366,8 +363,8 @@ final class SegmentTermsEnum extends TermsEnum {
        //System.out.println("FAIL: arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF));
        //}
        assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
        if (arc.output != NO_OUTPUT) {
          output = fstOutputs.add(output, arc.output);
        if (arc.output != BlockTreeTermsWriter.NO_OUTPUT) {
          output = BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
        }
        if (arc.isFinal()) {
          lastFrame = stack[1+lastFrame.ord];

@@ -457,7 +454,7 @@ final class SegmentTermsEnum extends TermsEnum {

      //term.length = 0;
      targetUpto = 0;
      currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0);
      currentFrame = pushFrame(arc, BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
    }

    // if (DEBUG) {

@@ -512,8 +509,8 @@ final class SegmentTermsEnum extends TermsEnum {
        term.bytes[targetUpto] = (byte) targetLabel;
        // Aggregate output as we go:
        assert arc.output != null;
        if (arc.output != NO_OUTPUT) {
          output = fstOutputs.add(output, arc.output);
        if (arc.output != BlockTreeTermsWriter.NO_OUTPUT) {
          output = BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
        }

        // if (DEBUG) {

@@ -523,7 +520,7 @@ final class SegmentTermsEnum extends TermsEnum {

        if (arc.isFinal()) {
          //if (DEBUG) System.out.println(" arc is final!");
          currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto);
          currentFrame = pushFrame(arc, BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
          //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
        }
      }

@@ -628,8 +625,8 @@ final class SegmentTermsEnum extends TermsEnum {
        // seek; but, often the FST doesn't have any
        // shared bytes (but this could change if we
        // reverse vLong byte order)
        if (arc.output != NO_OUTPUT) {
          output = fstOutputs.add(output, arc.output);
        if (arc.output != BlockTreeTermsWriter.NO_OUTPUT) {
          output = BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
        }
        if (arc.isFinal()) {
          lastFrame = stack[1+lastFrame.ord];

@@ -714,7 +711,7 @@ final class SegmentTermsEnum extends TermsEnum {

      //term.length = 0;
      targetUpto = 0;
      currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0);
      currentFrame = pushFrame(arc, BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
    }

    //if (DEBUG) {

@@ -769,8 +766,8 @@ final class SegmentTermsEnum extends TermsEnum {
        arc = nextArc;
        // Aggregate output as we go:
        assert arc.output != null;
        if (arc.output != NO_OUTPUT) {
          output = fstOutputs.add(output, arc.output);
        if (arc.output != BlockTreeTermsWriter.NO_OUTPUT) {
          output = BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
        }

        //if (DEBUG) {

@@ -780,7 +777,7 @@ final class SegmentTermsEnum extends TermsEnum {

        if (arc.isFinal()) {
          //if (DEBUG) System.out.println(" arc is final!");
          currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto);
          currentFrame = pushFrame(arc, BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
          //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
        }
      }
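The SegmentTermsEnum hunks above are a pure refactor: the enum's private fstOutputs/NO_OUTPUT statics are dropped in favor of the singletons now declared on BlockTreeTermsWriter, so reader and writer share one source of truth for the FST output type. The recurring substitution, in isolation (a sketch that assumes the writer's package-private statics are visible from the call site):

    // Illustrative only: aggregate FST outputs through the writer's shared
    // singletons instead of a per-class copy.
    import org.apache.lucene.util.BytesRef;

    final class OutputAggregation {
      static BytesRef append(BytesRef soFar, BytesRef arcOutput) {
        if (arcOutput != BlockTreeTermsWriter.NO_OUTPUT) {
          return BlockTreeTermsWriter.FST_OUTPUTS.add(soFar, arcOutput);
        }
        return soFar;
      }
    }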
@@ -966,74 +966,4 @@ public class TestTermsEnum extends LuceneTestCase {
    w.close();
    d.close();
  }

  /** Utility class to do efficient primary-key (only 1 doc contains the
   *  given term) lookups by segment, re-using the enums.  This class is
   *  not thread safe, so it is the caller's job to create and use one
   *  instance of this per thread.  Do not use this if a term may appear
   *  in more than one document!  It will only return the first one it
   *  finds. */
  static class PerThreadPKLookup {

    private final TermsEnum[] termsEnums;
    private final DocsEnum[] docsEnums;
    private final Bits[] liveDocs;
    private final int[] docBases;
    private final int numSegs;
    private final boolean hasDeletions;

    public PerThreadPKLookup(IndexReader r, String idFieldName) throws IOException {

      List<AtomicReaderContext> leaves = new ArrayList<>(r.leaves());

      // Larger segments are more likely to have the id, so we sort largest to smallest by numDocs:
      Collections.sort(leaves, new Comparator<AtomicReaderContext>() {
          @Override
          public int compare(AtomicReaderContext c1, AtomicReaderContext c2) {
            return c2.reader().numDocs() - c1.reader().numDocs();
          }
        });

      termsEnums = new TermsEnum[leaves.size()];
      docsEnums = new DocsEnum[leaves.size()];
      liveDocs = new Bits[leaves.size()];
      docBases = new int[leaves.size()];
      int numSegs = 0;
      boolean hasDeletions = false;
      for(int i=0;i<leaves.size();i++) {
        Fields fields = leaves.get(i).reader().fields();
        if (fields != null) {
          Terms terms = fields.terms(idFieldName);
          if (terms != null) {
            termsEnums[numSegs] = terms.iterator(null);
            assert termsEnums[numSegs] != null;
            docBases[numSegs] = leaves.get(i).docBase;
            liveDocs[numSegs] = leaves.get(i).reader().getLiveDocs();
            hasDeletions |= leaves.get(i).reader().hasDeletions();
            numSegs++;
          }
        }
      }
      this.numSegs = numSegs;
      this.hasDeletions = hasDeletions;
    }

    /** Returns docID if found, else -1. */
    public int lookup(BytesRef id) throws IOException {
      for(int seg=0;seg<numSegs;seg++) {
        if (termsEnums[seg].seekExact(id)) {
          docsEnums[seg] = termsEnums[seg].docs(liveDocs[seg], docsEnums[seg], 0);
          int docID = docsEnums[seg].nextDoc();
          if (docID != DocsEnum.NO_MORE_DOCS) {
            return docBases[seg] + docID;
          }
          assert hasDeletions;
        }
      }

      return -1;
    }

    // TODO: add reopen method to carry over re-used enums...?
  }
}
@@ -0,0 +1,97 @@
package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

/** Utility class to do efficient primary-key (only 1 doc contains the
 *  given term) lookups by segment, re-using the enums.  This class is
 *  not thread safe, so it is the caller's job to create and use one
 *  instance of this per thread.  Do not use this if a term may appear
 *  in more than one document!  It will only return the first one it
 *  finds. */
public class PerThreadPKLookup {

  protected final TermsEnum[] termsEnums;
  protected final DocsEnum[] docsEnums;
  protected final Bits[] liveDocs;
  protected final int[] docBases;
  protected final int numSegs;
  protected final boolean hasDeletions;

  public PerThreadPKLookup(IndexReader r, String idFieldName) throws IOException {

    List<AtomicReaderContext> leaves = new ArrayList<>(r.leaves());

    // Larger segments are more likely to have the id, so we sort largest to smallest by numDocs:
    Collections.sort(leaves, new Comparator<AtomicReaderContext>() {
        @Override
        public int compare(AtomicReaderContext c1, AtomicReaderContext c2) {
          return c2.reader().numDocs() - c1.reader().numDocs();
        }
      });

    termsEnums = new TermsEnum[leaves.size()];
    docsEnums = new DocsEnum[leaves.size()];
    liveDocs = new Bits[leaves.size()];
    docBases = new int[leaves.size()];
    int numSegs = 0;
    boolean hasDeletions = false;
    for(int i=0;i<leaves.size();i++) {
      Fields fields = leaves.get(i).reader().fields();
      if (fields != null) {
        Terms terms = fields.terms(idFieldName);
        if (terms != null) {
          termsEnums[numSegs] = terms.iterator(null);
          assert termsEnums[numSegs] != null;
          docBases[numSegs] = leaves.get(i).docBase;
          liveDocs[numSegs] = leaves.get(i).reader().getLiveDocs();
          hasDeletions |= leaves.get(i).reader().hasDeletions();
          numSegs++;
        }
      }
    }
    this.numSegs = numSegs;
    this.hasDeletions = hasDeletions;
  }

  /** Returns docID if found, else -1. */
  public int lookup(BytesRef id) throws IOException {
    for(int seg=0;seg<numSegs;seg++) {
      if (termsEnums[seg].seekExact(id)) {
        docsEnums[seg] = termsEnums[seg].docs(liveDocs[seg], docsEnums[seg], 0);
        int docID = docsEnums[seg].nextDoc();
        if (docID != DocsEnum.NO_MORE_DOCS) {
          return docBases[seg] + docID;
        }
        assert hasDeletions;
      }
    }

    return -1;
  }

  // TODO: add reopen method to carry over re-used enums...?
}
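A minimal usage sketch for the new PerThreadPKLookup (the directory contents and the "id" field name are assumed):

    // Illustrative only: resolve a unique id to its global docID, or -1.
    import java.io.IOException;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.PerThreadPKLookup;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.util.BytesRef;

    final class PKLookupDemo {
      static int findDoc(Directory dir, String id) throws IOException {
        IndexReader r = DirectoryReader.open(dir);
        try {
          PerThreadPKLookup lookup = new PerThreadPKLookup(r, "id");
          return lookup.lookup(new BytesRef(id)); // global docID, or -1 if absent
        } finally {
          r.close();
        }
      }
    }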