LUCENE-5675: add testRandom; sometimes fails

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5675@1595229 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2014-05-16 15:17:30 +00:00
parent 83332c046b
commit fa51d5972a
10 changed files with 488 additions and 179 deletions

View File

@ -45,6 +45,8 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
// Lazy init:
IndexInput in;
private static boolean DEBUG = true;
private IDVersionSegmentTermsEnumFrame[] stack;
private final IDVersionSegmentTermsEnumFrame staticFrame;
IDVersionSegmentTermsEnumFrame currentFrame;
@ -214,13 +216,27 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
return seekExact(target, 0);
}
// for debugging
@SuppressWarnings("unused")
private String brToString(BytesRef b) {
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
// If BytesRef isn't actually UTF8, or it's e.g. a
// prefix of UTF8 that ends mid-unicode-char, we
// fall back to hex:
return b.toString();
}
}
/** Returns false if the term does not exist, or it exists but its version is < minIDVersion. */
public boolean seekExact(final BytesRef target, long minIDVersion) throws IOException {
if (fr.index == null) {
throw new IllegalStateException("terms index was not loaded");
}
System.out.println("seekExact target=" + target + " minIDVersion=" + minIDVersion);
// nocommit would be nice if somehow on doing deletes we didn't have to double-lookup again...
if (term.bytes.length <= target.length) {
term.bytes = ArrayUtil.grow(term.bytes, 1+target.length);
@ -228,10 +244,10 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
assert clearEOF();
// if (DEBUG) {
// System.out.println("\nBTTR.seekExact seg=" + segment + " target=" + fieldInfo.name + ":" + brToString(target) + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix);
// printSeekState();
// }
if (DEBUG) {
System.out.println("\nBTTR.seekExact seg=" + fr.parent.segment + " target=" + fr.fieldInfo.name + ":" + brToString(target) + " minIDVersion=" + minIDVersion + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix);
printSeekState(System.out);
}
FST.Arc<Pair<BytesRef,Long>> arc;
int targetUpto;
@ -239,6 +255,8 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
targetBeforeCurrentLength = currentFrame.ord;
// nocommit we could stop earlier w/ the version check, every time we traverse an index arc we can check?
if (currentFrame != staticFrame) {
// We are already seek'd; find the common
@ -248,9 +266,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
// seeks to foobaz, we can re-use the seek state
// for the first 5 bytes.
// if (DEBUG) {
// System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix);
// }
if (DEBUG) {
System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix);
}
arc = arcs[0];
assert arc.isFinal();
@ -258,7 +276,7 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
targetUpto = 0;
IDVersionSegmentTermsEnumFrame lastFrame = stack[0];
assert validIndexPrefix <= term.length;
assert validIndexPrefix <= term.length: "validIndexPrefix=" + validIndexPrefix + " term.length=" + term.length + " seg=" + fr.parent.segment;
final int targetLimit = Math.min(target.length, validIndexPrefix);
@ -270,9 +288,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
// First compare up to valid seek frames:
while (targetUpto < targetLimit) {
cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
// if (DEBUG) {
// System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output);
// }
if (DEBUG) {
System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output);
}
if (cmp != 0) {
break;
}
@ -300,9 +318,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
final int targetLimit2 = Math.min(target.length, term.length);
while (targetUpto < targetLimit2) {
cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
// if (DEBUG) {
// System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
// }
if (DEBUG) {
System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
}
if (cmp != 0) {
break;
}
@ -319,9 +337,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
// Common case: target term is after current
// term, ie, app is seeking multiple terms
// in sorted order
// if (DEBUG) {
// System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); frame.ord=" + lastFrame.ord);
// }
if (DEBUG) {
System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); frame.ord=" + lastFrame.ord + "; targetUpto=" + targetUpto);
}
currentFrame = lastFrame;
} else if (cmp > 0) {
@ -330,23 +348,41 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
// keep the currentFrame but we must rewind it
// (so we scan from the start)
targetBeforeCurrentLength = 0;
// if (DEBUG) {
// System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord);
// }
if (DEBUG) {
System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord);
}
currentFrame = lastFrame;
currentFrame.rewind();
} else {
// Target is exactly the same as current term
assert term.length == target.length;
if (termExists) {
// if (DEBUG) {
// System.out.println(" target is same as current; return true");
// }
if (currentFrame.maxIDVersion < minIDVersion) {
// The max version for all terms in this block is lower than the minVersion
if (DEBUG) {
System.out.println(" target is same as current maxIDVersion=" + currentFrame.maxIDVersion + " is < minIDVersion=" + minIDVersion + "; return false");
}
return false;
}
currentFrame.decodeMetaData();
if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) {
// The version for this term is lower than the minVersion
if (DEBUG) {
System.out.println(" target is same as current but version=" + ((IDVersionTermState) currentFrame.state).idVersion + " is < minIDVersion=" + minIDVersion + "; return false");
}
return false;
}
if (DEBUG) {
System.out.println(" target is same as current; return true");
}
return true;
} else {
// if (DEBUG) {
// System.out.println(" target is same as current but term doesn't exist");
// }
if (DEBUG) {
System.out.println(" target is same as current but term doesn't exist");
}
}
//validIndexPrefix = currentFrame.depth;
//term.length = target.length;
@ -357,15 +393,15 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
targetBeforeCurrentLength = -1;
arc = fr.index.getFirstArc(arcs[0]);
System.out.println("first arc=" + arc);
//System.out.println("first arc=" + arc);
// Empty string prefix must have an output (block) in the index!
assert arc.isFinal();
assert arc.output != null;
// if (DEBUG) {
// System.out.println(" no seek state; push root frame");
// }
if (DEBUG) {
System.out.println(" no seek state; push root frame");
}
output = arc.output;
@ -376,9 +412,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
currentFrame = pushFrame(arc, VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
}
// if (DEBUG) {
// System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength);
// }
if (DEBUG) {
System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength);
}
while (targetUpto < target.length) {
@ -389,9 +425,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
if (nextArc == null) {
// Index is exhausted
// if (DEBUG) {
// System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
// }
if (DEBUG) {
System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + Integer.toHexString(targetLabel));
}
validIndexPrefix = currentFrame.prefix;
//validIndexPrefix = targetUpto;
@ -402,15 +438,21 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
termExists = false;
term.bytes[targetUpto] = (byte) targetLabel;
term.length = 1+targetUpto;
// if (DEBUG) {
// System.out.println(" FAST NOT_FOUND term=" + brToString(term));
// }
if (DEBUG) {
System.out.println(" FAST NOT_FOUND term=" + brToString(term));
}
return false;
}
System.out.println(" check output=" +((output.output2)));
//System.out.println(" check maxVersion=" + currentFrame.maxIDVersion + " vs " + minIDVersion);
if (currentFrame.maxIDVersion < minIDVersion) {
// The max version for all terms in this block is lower than the minVersion
//termExists = false;
//term.bytes[targetUpto] = (byte) targetLabel;
//term.length = 1+targetUpto;
if (DEBUG) {
System.out.println(" FAST version NOT_FOUND term=" + brToString(term) + " currentFrame.maxIDVersion=" + currentFrame.maxIDVersion + " validIndexPrefix=" + validIndexPrefix);
}
return false;
}
@ -418,20 +460,24 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
final SeekStatus result = currentFrame.scanToTerm(target, true);
if (result == SeekStatus.FOUND) {
// if (DEBUG) {
// System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term);
// }
currentFrame.decodeMetaData();
if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) {
// The version for this term is lower than the minVersion
if (DEBUG) {
System.out.println(" return NOT_FOUND: idVersion=" + ((IDVersionTermState) currentFrame.state).idVersion + " vs minIDVersion=" + minIDVersion);
}
return false;
}
if (DEBUG) {
System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term);
}
return true;
} else {
// if (DEBUG) {
// System.out.println(" got " + result + "; return NOT_FOUND term=" + brToString(term));
// }
if (DEBUG) {
System.out.println(" got " + result + "; return NOT_FOUND term=" + brToString(term));
}
return false;
}
} else {
@ -444,15 +490,15 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
}
// if (DEBUG) {
// System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
// }
if (DEBUG) {
System.out.println(" index: follow label=" + Integer.toHexString((target.bytes[target.offset + targetUpto]&0xff)) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
}
targetUpto++;
if (arc.isFinal()) {
//if (DEBUG) System.out.println(" arc is final!");
if (DEBUG) System.out.println(" arc is final!");
currentFrame = pushFrame(arc, VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
//if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
}
}
}
@ -466,9 +512,16 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
if (!currentFrame.hasTerms) {
termExists = false;
term.length = targetUpto;
// if (DEBUG) {
// System.out.println(" FAST NOT_FOUND term=" + brToString(term));
// }
if (DEBUG) {
System.out.println(" FAST NOT_FOUND term=" + brToString(term));
}
return false;
}
if (currentFrame.maxIDVersion < minIDVersion) {
// The max version for all terms in this block is lower than the minVersion
termExists = false;
term.length = targetUpto;
return false;
}
@ -476,14 +529,19 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
final SeekStatus result = currentFrame.scanToTerm(target, true);
if (result == SeekStatus.FOUND) {
// if (DEBUG) {
// System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term);
// }
if (DEBUG) {
System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term);
}
currentFrame.decodeMetaData();
if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) {
// The version for this term is lower than the minVersion
return false;
}
return true;
} else {
// if (DEBUG) {
// System.out.println(" got result " + result + "; return NOT_FOUND term=" + term.utf8ToString());
// }
if (DEBUG) {
System.out.println(" got result " + result + "; return NOT_FOUND term=" + term.utf8ToString());
}
return false;
}
@ -969,4 +1027,9 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
public long ord() {
throw new UnsupportedOperationException();
}
@Override
public String toString() {
return "IDVersionSegmentTermsEnum(seg=" + fr.parent.segment + ")";
}
}
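A minimal caller sketch of the two-argument seekExact contract documented above ("returns false if the term does not exist, or it exists but its version is < minIDVersion"); the enum is assumed to come from a field written with this postings format, as in the tests further down:

    // termsEnum: an IDVersionSegmentTermsEnum for the "id" field
    BytesRef id = new BytesRef("id0");
    if (termsEnum.seekExact(id, 100L)) {
      // id exists and its indexed version is >= 100
    } else {
      // id is absent, or its version is < 100
    }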

View File

@ -36,6 +36,7 @@ final class IDVersionSegmentTermsEnumFrame {
boolean hasTermsOrig;
boolean isFloor;
/** Highest version of any term in this block. */
long maxIDVersion;
FST.Arc<Pair<BytesRef,Long>> arc;
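This maxIDVersion field is what the enum-side changes above test against: when the highest version in a block is still below the requested minimum, the whole block can be rejected without decoding any term metadata. Condensed from the seekExact diff, as a sketch:

    if (currentFrame.maxIDVersion < minIDVersion) {
      // No term in this block can satisfy the version requirement; fail fast.
      return false;
    }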

View File

@ -27,7 +27,12 @@ class SingleDocsAndPositionsEnum extends DocsAndPositionsEnum {
private int singleDocID;
private Bits liveDocs;
private long version;
private final BytesRef payload = new BytesRef(8);
private final BytesRef payload;
public SingleDocsAndPositionsEnum() {
payload = new BytesRef(8);
payload.length = 8;
}
/** For reuse */
public void reset(int singleDocID, long version, Bits liveDocs) {
@ -35,7 +40,6 @@ class SingleDocsAndPositionsEnum extends DocsAndPositionsEnum {
this.liveDocs = liveDocs;
this.singleDocID = singleDocID;
this.version = version;
pos = -1;
}
@Override
@ -45,7 +49,7 @@ class SingleDocsAndPositionsEnum extends DocsAndPositionsEnum {
} else {
doc = NO_MORE_DOCS;
}
pos = 0;
pos = -1;
return doc;
}
@ -59,6 +63,7 @@ class SingleDocsAndPositionsEnum extends DocsAndPositionsEnum {
public int advance(int target) {
if (doc == -1 && target <= singleDocID && (liveDocs == null || liveDocs.get(singleDocID))) {
doc = singleDocID;
pos = -1;
} else {
doc = NO_MORE_DOCS;
}
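Moving the pos = -1 assignment out of reset() and into nextDoc()/advance() re-arms the single position on every successful doc transition rather than once per reuse. A hedged consumption sketch using the stock 4.x positions API:

    // posEnum: a (possibly reused) SingleDocsAndPositionsEnum after reset(...)
    if (posEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      posEnum.nextPosition();                   // the one and only position
      BytesRef payload = posEnum.getPayload();  // the 8-byte version payload
    }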

View File

@ -161,7 +161,6 @@ final class VersionBlockTreeTermsReader extends FieldsProducer {
in.readBytes(code.bytes, 0, numBytes);
code.length = numBytes;
final long version = in.readVLong();
System.out.println(" read code=" +code + " version=" + version);
final Pair<BytesRef,Long> rootCode = VersionBlockTreeTermsWriter.FST_OUTPUTS.newPair(code, version);
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
assert fieldInfo != null: "field=" + field;

View File

@ -194,10 +194,10 @@ import org.apache.lucene.util.packed.PackedInts;
// nocommit fix jdocs
final class VersionBlockTreeTermsWriter extends FieldsConsumer {
public static final PairOutputs<BytesRef,Long> FST_OUTPUTS = new PairOutputs<>(ByteSequenceOutputs.getSingleton(),
static final PairOutputs<BytesRef,Long> FST_OUTPUTS = new PairOutputs<>(ByteSequenceOutputs.getSingleton(),
PositiveIntOutputs.getSingleton());
public static final Pair<BytesRef,Long> NO_OUTPUT = FST_OUTPUTS.getNoOutput();
static final Pair<BytesRef,Long> NO_OUTPUT = FST_OUTPUTS.getNoOutput();
/** Suggested default value for the {@code
* minItemsInBlock} parameter to {@link
@ -284,7 +284,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
}
private final List<FieldMetaData> fields = new ArrayList<>();
// private final String segment;
private final String segment;
/** Create a new writer. The number of items (terms or
* sub-blocks) per block will aim to be between
@ -297,6 +297,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
int maxItemsInBlock)
throws IOException
{
System.out.println("VBTTW minItemsInBlock=" + minItemsInBlock + " maxItemsInBlock=" + maxItemsInBlock);
if (minItemsInBlock <= 1) {
throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock);
}
@ -329,7 +330,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
writeIndexHeader(indexOut);
this.postingsWriter = postingsWriter;
// segment = state.segmentName;
segment = state.segmentInfo.name;
// System.out.println("BTW.init seg=" + state.segmentName);
@ -625,6 +626,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
// following floor blocks:
void writeBlocks(IntsRef prevTerm, int prefixLength, int count) throws IOException {
// nocommit why can't we do floor blocks for root frame?
if (prefixLength == 0 || count <= maxItemsInBlock) {
// Easy case: not floor block. Eg, prefix is "foo",
// and we found 30 terms/sub-blocks starting w/ that
@ -644,13 +646,13 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
// in each block, to make floor blocks authoritative
//if (DEBUG) {
// final BytesRef prefix = new BytesRef(prefixLength);
// for(int m=0;m<prefixLength;m++) {
// prefix.bytes[m] = (byte) prevTerm.ints[m];
// }
// prefix.length = prefixLength;
// //System.out.println("\nWBS count=" + count + " prefix=" + prefix.utf8ToString() + " " + prefix);
// System.out.println("writeBlocks: prefix=" + prefix + " " + prefix + " count=" + count + " pending.size()=" + pending.size());
final BytesRef prefix = new BytesRef(prefixLength);
for(int m=0;m<prefixLength;m++) {
prefix.bytes[m] = (byte) prevTerm.ints[m];
}
prefix.length = prefixLength;
//System.out.println("\nWBS count=" + count + " prefix=" + prefix.utf8ToString() + " " + prefix);
System.out.println("writeBlocks: prefix=" + toString(prefix) + " " + prefix + " count=" + count + " pending.size()=" + pending.size());
//}
//System.out.println("\nwbs count=" + count);
@ -873,7 +875,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
out.writeVInt((length<<1)|(isLastInFloor ? 1:0));
// if (DEBUG) {
// System.out.println(" writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + toString(prefix) + " entCount=" + length + " startFP=" + startFP + " futureTermCount=" + futureTermCount + (isFloor ? (" floorLeadByte=" + Integer.toHexString(floorLeadByte&0xff)) : "") + " isLastInFloor=" + isLastInFloor);
System.out.println(" writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + toString(prefix) + " entCount=" + length + " startFP=" + startFP + " futureTermCount=" + futureTermCount + (isFloor ? (" floorLeadByte=" + Integer.toHexString(floorLeadByte&0xff)) : "") + " isLastInFloor=" + isLastInFloor);
// }
// 1st pass: pack term suffix bytes into byte[] blob
@ -909,6 +911,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
boolean absolute = true;
long maxVersionInBlock = -1;
int countx = 0;
if (isLeafBlock) {
subIndices = null;
for (PendingEntry ent : slice) {
@ -918,10 +921,10 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
maxVersionInBlock = Math.max(maxVersionInBlock, ((IDVersionTermState) state).idVersion);
final int suffix = term.term.length - prefixLength;
// if (DEBUG) {
// BytesRef suffixBytes = new BytesRef(suffix);
// System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
// suffixBytes.length = suffix;
// System.out.println(" write term suffix=" + suffixBytes);
BytesRef suffixBytes = new BytesRef(suffix);
System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
suffixBytes.length = suffix;
System.out.println(" " + (countx++) + ": write term suffix=" + toString(suffixBytes));
// }
// For leaf block we write suffix straight
suffixWriter.writeVInt(suffix);
@ -955,10 +958,10 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
maxVersionInBlock = Math.max(maxVersionInBlock, ((IDVersionTermState) state).idVersion);
final int suffix = term.term.length - prefixLength;
// if (DEBUG) {
// BytesRef suffixBytes = new BytesRef(suffix);
// System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
// suffixBytes.length = suffix;
// System.out.println(" write term suffix=" + suffixBytes);
BytesRef suffixBytes = new BytesRef(suffix);
System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
suffixBytes.length = suffix;
System.out.println(" " + (countx++) + ": write term suffix=" + toString(suffixBytes));
// }
// For non-leaf block we borrow 1 bit to record
// if entry is term or sub-block
@ -1005,10 +1008,10 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
assert block.fp < startFP;
// if (DEBUG) {
// BytesRef suffixBytes = new BytesRef(suffix);
// System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
// suffixBytes.length = suffix;
// System.out.println(" write sub-block suffix=" + toString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
BytesRef suffixBytes = new BytesRef(suffix);
System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
suffixBytes.length = suffix;
System.out.println(" " + (countx++) + ": write sub-block suffix=" + toString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
// }
suffixWriter.writeVLong(startFP - block.fp);

View File

@ -17,16 +17,32 @@ package org.apache.lucene.codecs.idversion;
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.BasePostingsFormatTestCase;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PerThreadPKLookup;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
@ -47,26 +63,219 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
Document doc = new Document();
doc.add(makeIDField("id0", 100));
w.addDocument(doc);
doc = new Document();
doc.add(makeIDField("id1", 110));
w.addDocument(doc);
IndexReader r = w.getReader();
IDVersionSegmentTermsEnum termsEnum = (IDVersionSegmentTermsEnum) r.leaves().get(0).reader().fields().terms("id").iterator(null);
assertTrue(termsEnum.seekExact(new BytesRef("id0"), 50));
assertTrue(termsEnum.seekExact(new BytesRef("id0"), 100));
assertFalse(termsEnum.seekExact(new BytesRef("id0"), 101));
assertTrue(termsEnum.seekExact(new BytesRef("id1"), 50));
assertTrue(termsEnum.seekExact(new BytesRef("id1"), 110));
assertFalse(termsEnum.seekExact(new BytesRef("id1"), 111));
r.close();
w.close();
dir.close();
}
// nocommit need testRandom
// nocommit vary the style of id; sometimes fixed-length ids, timestamps, zero-filled, sequential, random, etc.
public void testRandom() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
// nocommit randomize the block sizes:
iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat()));
// nocommit put back
//RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
IndexWriter w = new IndexWriter(dir, iwc);
int numDocs = atLeast(1000);
Map<String,Long> idValues = new HashMap<String,Long>();
int docUpto = 0;
if (VERBOSE) {
System.out.println("TEST: numDocs=" + numDocs);
}
long version = 0;
while (docUpto < numDocs) {
// nocommit add deletes in
// nocommit randomRealisticUnicode / full binary
String idValue = TestUtil.randomSimpleString(random());
if (idValues.containsKey(idValue)) {
continue;
}
//long version = random().nextLong() & 0x7fffffffffffffffL;
version++;
idValues.put(idValue, version);
if (VERBOSE) {
System.out.println(" " + idValue + " -> " + version);
}
Document doc = new Document();
doc.add(makeIDField(idValue, version));
w.addDocument(doc);
docUpto++;
}
//IndexReader r = w.getReader();
IndexReader r = DirectoryReader.open(w, true);
PerThreadVersionPKLookup lookup = new PerThreadVersionPKLookup(r, "id");
List<Map.Entry<String,Long>> idValuesList = new ArrayList<>(idValues.entrySet());
int iters = numDocs * 5;
for(int iter=0;iter<iters;iter++) {
String idValue;
if (random().nextBoolean()) {
idValue = idValuesList.get(random().nextInt(numDocs)).getKey();
} else {
idValue = TestUtil.randomSimpleString(random());
}
BytesRef idValueBytes = new BytesRef(idValue);
Long expectedVersion = idValues.get(idValue);
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter + " id=" + idValue + " expectedVersion=" + expectedVersion);
}
if (expectedVersion == null) {
assertEquals(-1, lookup.lookup(idValueBytes));
} else {
if (random().nextBoolean()) {
if (VERBOSE) {
System.out.println(" lookup exact version (should be found)");
}
assertTrue(lookup.lookup(idValueBytes, expectedVersion.longValue()) != -1);
} else {
if (VERBOSE) {
System.out.println(" lookup version+1 (should not be found)");
}
assertEquals(-1, lookup.lookup(idValueBytes, expectedVersion.longValue()+1));
}
}
}
r.close();
w.close();
dir.close();
}
private static class PerThreadVersionPKLookup extends PerThreadPKLookup {
public PerThreadVersionPKLookup(IndexReader r, String field) throws IOException {
super(r, field);
}
/** Returns docID if found, else -1. */
public int lookup(BytesRef id, long version) throws IOException {
for(int seg=0;seg<numSegs;seg++) {
if (((IDVersionSegmentTermsEnum) termsEnums[seg]).seekExact(id, version)) {
if (VERBOSE) {
System.out.println(" found in seg=" + termsEnums[seg]);
}
docsEnums[seg] = termsEnums[seg].docs(liveDocs[seg], docsEnums[seg], 0);
int docID = docsEnums[seg].nextDoc();
if (docID != DocsEnum.NO_MORE_DOCS) {
return docBases[seg] + docID;
}
assert hasDeletions;
}
}
return -1;
}
}
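Hypothetical standalone usage of the subclass above, mirroring what testRandom asserts; the reader instance and field name are assumptions:

    PerThreadVersionPKLookup lookup = new PerThreadVersionPKLookup(reader, "id");
    int docID = lookup.lookup(new BytesRef("someId"), 7L);
    // docID != -1 only if a live doc holds "someId" with version >= 7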
/** Produces a single token from the provided value, with the provided payload. */
private static class StringAndPayloadField extends Field {
public static final FieldType TYPE = new FieldType();
static {
TYPE.setIndexed(true);
TYPE.setOmitNorms(true);
TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
TYPE.setTokenized(true);
TYPE.freeze();
}
private final BytesRef payload;
public StringAndPayloadField(String name, String value, BytesRef payload) {
super(name, value, TYPE);
this.payload = payload;
}
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) throws IOException {
SingleTokenWithPayloadTokenStream ts;
if (reuse instanceof SingleTokenWithPayloadTokenStream) {
ts = (SingleTokenWithPayloadTokenStream) reuse;
} else {
ts = new SingleTokenWithPayloadTokenStream();
}
ts.setValue((String) fieldsData, payload);
return ts;
}
}
private static final class SingleTokenWithPayloadTokenStream extends TokenStream {
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final PayloadAttribute payloadAttribute = addAttribute(PayloadAttribute.class);
private boolean used = false;
private String value = null;
private BytesRef payload;
/** Creates a new TokenStream that returns a String+payload as a single token.
* <p>Warning: Does not initialize the value; you must call
* {@link #setValue(String, BytesRef)} afterwards!
*/
SingleTokenWithPayloadTokenStream() {
}
/** Sets the string value. */
void setValue(String value, BytesRef payload) {
this.value = value;
this.payload = payload;
}
@Override
public boolean incrementToken() {
if (used) {
return false;
}
clearAttributes();
termAttribute.append(value);
payloadAttribute.setPayload(payload);
used = true;
return true;
}
@Override
public void reset() {
used = false;
}
@Override
public void close() {
value = null;
payload = null;
}
}
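A hedged usage sketch for the single-token stream above, honoring the setValue warning in its javadoc (the payload construction follows makeIDField below):

    SingleTokenWithPayloadTokenStream ts = new SingleTokenWithPayloadTokenStream();
    ts.setValue("id0", payload);   // must be set before consuming
    ts.reset();
    while (ts.incrementToken()) {
      // exactly one token: term "id0" carrying the 8-byte payload
    }
    ts.close();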
private static Field makeIDField(String id, long version) {
Field field = newTextField("id", "", Field.Store.NO);
Token token = new Token(id, 0, id.length());
BytesRef payload = new BytesRef(8);
payload.length = 8;
IDVersionPostingsFormat.longToBytes(100, payload);
IDVersionPostingsFormat.longToBytes(version, payload);
return new StringAndPayloadField("id", id, payload);
/*
Field field = newTextField("id", "", Field.Store.NO);
Token token = new Token(id, 0, id.length());
token.setPayload(payload);
field.setTokenStream(new CannedTokenStream(token));
return field;
*/
}
}

View File

@ -46,6 +46,7 @@ import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.NoOutputs;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.PackedInts;
@ -189,6 +190,10 @@ import org.apache.lucene.util.packed.PackedInts;
*/
public final class BlockTreeTermsWriter extends FieldsConsumer {
static final Outputs<BytesRef> FST_OUTPUTS = ByteSequenceOutputs.getSingleton();
static final BytesRef NO_OUTPUT = FST_OUTPUTS.getNoOutput();
/** Suggested default value for the {@code
* minItemsInBlock} parameter to {@link
* #BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */

View File

@ -40,9 +40,6 @@ import org.apache.lucene.util.fst.Util;
/** Iterates through terms in this field */
final class SegmentTermsEnum extends TermsEnum {
final static Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
final static BytesRef NO_OUTPUT = fstOutputs.getNoOutput();
// Lazy init:
IndexInput in;
@ -366,8 +363,8 @@ final class SegmentTermsEnum extends TermsEnum {
//System.out.println("FAIL: arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF));
//}
assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
if (arc.output != NO_OUTPUT) {
output = fstOutputs.add(output, arc.output);
if (arc.output != BlockTreeTermsWriter.NO_OUTPUT) {
output = BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
}
if (arc.isFinal()) {
lastFrame = stack[1+lastFrame.ord];
@ -457,7 +454,7 @@ final class SegmentTermsEnum extends TermsEnum {
//term.length = 0;
targetUpto = 0;
currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0);
currentFrame = pushFrame(arc, BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
}
// if (DEBUG) {
@ -512,8 +509,8 @@ final class SegmentTermsEnum extends TermsEnum {
term.bytes[targetUpto] = (byte) targetLabel;
// Aggregate output as we go:
assert arc.output != null;
if (arc.output != NO_OUTPUT) {
output = fstOutputs.add(output, arc.output);
if (arc.output != BlockTreeTermsWriter.NO_OUTPUT) {
output = BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
}
// if (DEBUG) {
@ -523,7 +520,7 @@ final class SegmentTermsEnum extends TermsEnum {
if (arc.isFinal()) {
//if (DEBUG) System.out.println(" arc is final!");
currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto);
currentFrame = pushFrame(arc, BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
//if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
}
}
@ -628,8 +625,8 @@ final class SegmentTermsEnum extends TermsEnum {
// seek; but, often the FST doesn't have any
// shared bytes (but this could change if we
// reverse vLong byte order)
if (arc.output != NO_OUTPUT) {
output = fstOutputs.add(output, arc.output);
if (arc.output != BlockTreeTermsWriter.NO_OUTPUT) {
output = BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
}
if (arc.isFinal()) {
lastFrame = stack[1+lastFrame.ord];
@ -714,7 +711,7 @@ final class SegmentTermsEnum extends TermsEnum {
//term.length = 0;
targetUpto = 0;
currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0);
currentFrame = pushFrame(arc, BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
}
//if (DEBUG) {
@ -769,8 +766,8 @@ final class SegmentTermsEnum extends TermsEnum {
arc = nextArc;
// Aggregate output as we go:
assert arc.output != null;
if (arc.output != NO_OUTPUT) {
output = fstOutputs.add(output, arc.output);
if (arc.output != BlockTreeTermsWriter.NO_OUTPUT) {
output = BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
}
//if (DEBUG) {
@ -780,7 +777,7 @@ final class SegmentTermsEnum extends TermsEnum {
if (arc.isFinal()) {
//if (DEBUG) System.out.println(" arc is final!");
currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto);
currentFrame = pushFrame(arc, BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
//if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
}
}

View File

@ -966,74 +966,4 @@ public class TestTermsEnum extends LuceneTestCase {
w.close();
d.close();
}
/** Utility class to do efficient primary-key (only 1 doc contains the
* given term) lookups by segment, re-using the enums. This class is
* not thread safe, so it is the caller's job to create and use one
* instance of this per thread. Do not use this if a term may appear
* in more than one document! It will only return the first one it
* finds. */
static class PerThreadPKLookup {
private final TermsEnum[] termsEnums;
private final DocsEnum[] docsEnums;
private final Bits[] liveDocs;
private final int[] docBases;
private final int numSegs;
private final boolean hasDeletions;
public PerThreadPKLookup(IndexReader r, String idFieldName) throws IOException {
List<AtomicReaderContext> leaves = new ArrayList<>(r.leaves());
// Larger segments are more likely to have the id, so we sort largest to smallest by numDocs:
Collections.sort(leaves, new Comparator<AtomicReaderContext>() {
@Override
public int compare(AtomicReaderContext c1, AtomicReaderContext c2) {
return c2.reader().numDocs() - c1.reader().numDocs();
}
});
termsEnums = new TermsEnum[leaves.size()];
docsEnums = new DocsEnum[leaves.size()];
liveDocs = new Bits[leaves.size()];
docBases = new int[leaves.size()];
int numSegs = 0;
boolean hasDeletions = false;
for(int i=0;i<leaves.size();i++) {
Fields fields = leaves.get(i).reader().fields();
if (fields != null) {
Terms terms = fields.terms(idFieldName);
if (terms != null) {
termsEnums[numSegs] = terms.iterator(null);
assert termsEnums[numSegs] != null;
docBases[numSegs] = leaves.get(i).docBase;
liveDocs[numSegs] = leaves.get(i).reader().getLiveDocs();
hasDeletions |= leaves.get(i).reader().hasDeletions();
numSegs++;
}
}
}
this.numSegs = numSegs;
this.hasDeletions = hasDeletions;
}
/** Returns docID if found, else -1. */
public int lookup(BytesRef id) throws IOException {
for(int seg=0;seg<numSegs;seg++) {
if (termsEnums[seg].seekExact(id)) {
docsEnums[seg] = termsEnums[seg].docs(liveDocs[seg], docsEnums[seg], 0);
int docID = docsEnums[seg].nextDoc();
if (docID != DocsEnum.NO_MORE_DOCS) {
return docBases[seg] + docID;
}
assert hasDeletions;
}
}
return -1;
}
// TODO: add reopen method to carry over re-used enums...?
}
}

View File

@ -0,0 +1,97 @@
package org.apache.lucene.index;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
/** Utility class to do efficient primary-key (only 1 doc contains the
* given term) lookups by segment, re-using the enums. This class is
* not thread safe, so it is the caller's job to create and use one
* instance of this per thread. Do not use this if a term may appear
* in more than one document! It will only return the first one it
* finds. */
public class PerThreadPKLookup {
protected final TermsEnum[] termsEnums;
protected final DocsEnum[] docsEnums;
protected final Bits[] liveDocs;
protected final int[] docBases;
protected final int numSegs;
protected final boolean hasDeletions;
public PerThreadPKLookup(IndexReader r, String idFieldName) throws IOException {
List<AtomicReaderContext> leaves = new ArrayList<>(r.leaves());
// Larger segments are more likely to have the id, so we sort largest to smallest by numDocs:
Collections.sort(leaves, new Comparator<AtomicReaderContext>() {
@Override
public int compare(AtomicReaderContext c1, AtomicReaderContext c2) {
return c2.reader().numDocs() - c1.reader().numDocs();
}
});
termsEnums = new TermsEnum[leaves.size()];
docsEnums = new DocsEnum[leaves.size()];
liveDocs = new Bits[leaves.size()];
docBases = new int[leaves.size()];
int numSegs = 0;
boolean hasDeletions = false;
for(int i=0;i<leaves.size();i++) {
Fields fields = leaves.get(i).reader().fields();
if (fields != null) {
Terms terms = fields.terms(idFieldName);
if (terms != null) {
termsEnums[numSegs] = terms.iterator(null);
assert termsEnums[numSegs] != null;
docBases[numSegs] = leaves.get(i).docBase;
liveDocs[numSegs] = leaves.get(i).reader().getLiveDocs();
hasDeletions |= leaves.get(i).reader().hasDeletions();
numSegs++;
}
}
}
this.numSegs = numSegs;
this.hasDeletions = hasDeletions;
}
/** Returns docID if found, else -1. */
public int lookup(BytesRef id) throws IOException {
for(int seg=0;seg<numSegs;seg++) {
if (termsEnums[seg].seekExact(id)) {
docsEnums[seg] = termsEnums[seg].docs(liveDocs[seg], docsEnums[seg], 0);
int docID = docsEnums[seg].nextDoc();
if (docID != DocsEnum.NO_MORE_DOCS) {
return docBases[seg] + docID;
}
assert hasDeletions;
}
}
return -1;
}
// TODO: add reopen method to carry over re-used enums...?
}
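Hypothetical per-thread usage of the now-public utility; the reader instance and id field name are assumptions:

    // One instance per thread; enums are cached and reused across calls.
    PerThreadPKLookup pk = new PerThreadPKLookup(reader, "id");
    int docID = pk.lookup(new BytesRef("id42"));
    if (docID == -1) {
      // id42 is not present in any live document
    }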