ord=-1 now means 'missing' from DocTermsIndex

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1411469 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-11-19 22:40:51 +00:00
parent 7c7898fc12
commit 7e4bb062e4
12 changed files with 129 additions and 280 deletions

View File

@@ -25,6 +25,11 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
// nocommit add javadocs stating that this must open all
// necessary files "on init", not later eg in .getXXX, else
// an IW that deletes a commit will cause an SR to hit
// exceptions....
public abstract class SimpleDVProducer implements Closeable {
public abstract NumericDocValues getNumeric(FieldInfo field) throws IOException;
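
The nocommit note above asks that a producer open every data file in its constructor rather than lazily in the getXXX methods. Below is a minimal sketch of that eager-open pattern; the class and file names are hypothetical, not the actual Lucene41 codec:

import java.io.Closeable;
import java.io.IOException;
import java.io.RandomAccessFile;

// Sketch: acquire every file handle at construction time, so a commit
// deleted by an IndexWriter makes the open fail fast here instead of
// surfacing later inside getNumeric()/getSorted().
class EagerDVProducer implements Closeable {
  private final RandomAccessFile numericData;
  private final RandomAccessFile sortedData;

  EagerDVProducer(String dir) throws IOException {
    numericData = new RandomAccessFile(dir + "/dv.num", "r");   // throws now if missing,
    sortedData = new RandomAccessFile(dir + "/dv.sorted", "r"); // not at first use
  }

  @Override
  public void close() throws IOException {
    numericData.close();
    sortedData.close();
  }
}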

View File

@@ -519,12 +519,14 @@ public interface FieldCache {
// nocommit: can we merge this api with the SortedDocValues api?
public abstract static class DocTermsIndex {
// nocommit remove this?
public int binarySearchLookup(BytesRef key, BytesRef spare) {
// this special case is the reason that Arrays.binarySearch() isn't useful.
if (key == null)
return 0;
int low = 1;
if (key == null) {
return -1;
}
int low = 0;
int high = numOrd()-1;
while (low <= high) {
@@ -543,24 +545,26 @@ public interface FieldCache {
/** The BytesRef argument must not be null; the method
* returns the same BytesRef, or an empty (length=0)
* BytesRef if this ord is the null ord (0). */
* BytesRef if this ord is the null ord (-1). */
public abstract BytesRef lookup(int ord, BytesRef reuse);
/** Convenience method, to lookup the Term for a doc.
* If this doc is deleted or did not have this field,
* this will return null. */
public BytesRef getTerm(int docID, BytesRef reuse) {
return lookup(getOrd(docID), reuse);
int ord = getOrd(docID);
if (ord == -1) {
return null;
}
return lookup(ord, reuse);
}
/** Returns sort ord for this document. Ord 0 is
* reserved for docs that are deleted or did not have
/** Returns sort ord for this document. Ord -1 is
* returned for docs that are deleted or did not have
* this field. */
public abstract int getOrd(int docID);
/** Returns total unique ord count; this includes +1 for
* the null ord (always 0) unless the field was
* indexed with doc values. */
/** Returns total unique ord count. */
public abstract int numOrd();
/** Number of documents */
@@ -568,9 +572,6 @@ public interface FieldCache {
/** Returns a TermsEnum that can iterate over the values in this index entry */
public abstract TermsEnum getTermsEnum();
/** @lucene.internal */
public abstract PackedInts.Reader getDocToOrd();
}
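
With this change every consumer of DocTermsIndex must treat ord -1 as "document has no value" and must never pass it to lookup(). A small sketch of the expected calling pattern (the helper name valueOrNull is illustrative only):

import org.apache.lucene.search.FieldCache;
import org.apache.lucene.util.BytesRef;

class MissingOrdExample {
  // ord == -1 now means the doc is deleted or has no term for this
  // field; real terms occupy ords 0 .. numOrd()-1.
  static BytesRef valueOrNull(FieldCache.DocTermsIndex index, int docID, BytesRef reuse) {
    int ord = index.getOrd(docID);
    if (ord == -1) {
      return null;                   // missing: nothing to look up
    }
    return index.lookup(ord, reuse); // safe: ord is a real term ord
  }
}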
/** Checks the internal cache for an appropriate entry, and if none

View File

@@ -335,9 +335,6 @@ class FieldCacheImpl implements FieldCache {
}
}
}
} else {
// nocommit is this right ...
docsWithField = new Bits.MatchNoBits(maxDoc);
}
}
@@ -970,11 +967,6 @@ class FieldCacheImpl implements FieldCache {
this.numOrd = numOrd;
}
@Override
public PackedInts.Reader getDocToOrd() {
return docToTermOrd;
}
@Override
public int numOrd() {
return numOrd;
@@ -982,7 +974,7 @@ class FieldCacheImpl implements FieldCache {
@Override
public int getOrd(int docID) {
return (int) docToTermOrd.get(docID);
return (int) docToTermOrd.get(docID)-1;
}
@Override
@@ -1010,17 +1002,17 @@ class FieldCacheImpl implements FieldCache {
final BytesRef term = new BytesRef();
public DocTermsIndexEnum() {
currentOrd = 0;
currentOrd = -1;
currentBlockNumber = 0;
blocks = bytes.getBlocks();
blockEnds = bytes.getBlockEnds();
currentBlockNumber = bytes.fillAndGetIndex(term, termOrdToBytesOffset.get(0));
term.bytes = blocks[0];
end = blockEnds[currentBlockNumber];
}
@Override
public SeekStatus seekCeil(BytesRef text, boolean useCache /* ignored */) throws IOException {
int low = 1;
int low = 0;
int high = numOrd-1;
while (low <= high) {
@@ -1032,8 +1024,9 @@ class FieldCacheImpl implements FieldCache {
low = mid + 1;
else if (cmp > 0)
high = mid - 1;
else
else {
return SeekStatus.FOUND; // key found
}
}
if (low == numOrd) {
@@ -1045,7 +1038,7 @@ class FieldCacheImpl implements FieldCache {
}
public void seekExact(long ord) throws IOException {
assert(ord >= 0 && ord <= numOrd);
assert ord >= 0 && ord <= numOrd;
// TODO: if gap is small, could iterate from current position? Or let user decide that?
currentBlockNumber = bytes.fillAndGetIndex(term, termOrdToBytesOffset.get((int)ord));
end = blockEnds[currentBlockNumber];
@@ -1057,14 +1050,18 @@ class FieldCacheImpl implements FieldCache {
int start = term.offset + term.length;
if (start >= end) {
// switch byte blocks
if (currentBlockNumber +1 >= blocks.length) {
if (currentBlockNumber+1 >= blocks.length) {
assert currentOrd+1 == numOrd: "currentOrd=" + currentOrd + " numOrd=" + numOrd;
return null;
}
currentBlockNumber++;
term.bytes = blocks[currentBlockNumber];
end = blockEnds[currentBlockNumber];
start = 0;
if (end<=0) return null; // special case of empty last array
if (end<=0) {
assert currentOrd+1 == numOrd;
return null; // special case of empty last array
}
}
currentOrd++;
@@ -1131,6 +1128,12 @@ class FieldCacheImpl implements FieldCache {
}
}
// nocommit for DV if you ask for sorted or binary we
// should check sorted first?
// nocommit would be nice if .getTerms would return a
// DocTermsIndex if one already existed
public DocTermsIndex getTermsIndex(AtomicReader reader, String field) throws IOException {
return getTermsIndex(reader, field, PackedInts.FAST);
}
@@ -1180,12 +1183,6 @@ class FieldCacheImpl implements FieldCache {
// nocommit: to the codec api? or can that termsenum just use this thing?
return null;
}
@Override
public Reader getDocToOrd() {
// nocommit: add this to the codec api!
return null;
}
};
} else {
@@ -1206,6 +1203,7 @@ class FieldCacheImpl implements FieldCache {
termCountHardLimit = maxDoc+1;
}
// nocommit use Uninvert?
if (terms != null) {
// Try for coarse estimate for number of bits; this
// should be an underestimate most of the time, which
@@ -1238,8 +1236,9 @@ class FieldCacheImpl implements FieldCache {
final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);
// 0 is reserved for "unset"
bytes.copyUsingLengthPrefix(new BytesRef());
int termOrd = 1;
int termOrd = 0;
// nocommit use Uninvert?
if (terms != null) {
final TermsEnum termsEnum = terms.iterator(null);
@@ -1267,7 +1266,7 @@ class FieldCacheImpl implements FieldCache {
if (docID == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
docToTermOrd.set(docID, termOrd);
docToTermOrd.set(docID, 1+termOrd);
}
termOrd++;
}
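
Note how the loop above stores 1+termOrd into the GrowableWriter while getOrd() subtracts 1 on the way out: the packed array keeps 0 for "unset", so untouched (missing) docs naturally read back as ord -1. A toy model of that biasing, with a plain int[] standing in for the packed writer:

// Toy model of the biased doc->ord encoding used above; a plain array
// stands in for the GrowableWriter.
class BiasedOrds {
  private final int[] docToTermOrd; // stored 0 means "no value"

  BiasedOrds(int maxDoc) {
    docToTermOrd = new int[maxDoc]; // Java zero-fills: every doc starts missing
  }

  void setOrd(int docID, int termOrd) {
    docToTermOrd[docID] = 1 + termOrd; // bias by +1, like docToTermOrd.set(docID, 1+termOrd)
  }

  int getOrd(int docID) {
    return docToTermOrd[docID] - 1;    // missing docs come back as -1
  }
}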

View File

@@ -97,35 +97,34 @@ public abstract class FieldCacheRangeFilter<T> extends Filter {
final int inclusiveLowerPoint, inclusiveUpperPoint;
// Hints:
// * binarySearchLookup returns 0, if value was null.
// * binarySearchLookup returns -1, if value was null.
// * the value is <0 if no exact hit was found, the returned value
// is (-(insertion point) - 1)
if (lowerPoint == 0) {
assert lowerVal == null;
inclusiveLowerPoint = 1;
} else if (includeLower && lowerPoint > 0) {
if (lowerPoint == -1 && lowerVal == null) {
inclusiveLowerPoint = 0;
} else if (includeLower && lowerPoint >= 0) {
inclusiveLowerPoint = lowerPoint;
} else if (lowerPoint > 0) {
} else if (lowerPoint >= 0) {
inclusiveLowerPoint = lowerPoint + 1;
} else {
inclusiveLowerPoint = Math.max(1, -lowerPoint - 1);
inclusiveLowerPoint = Math.max(0, -lowerPoint - 1);
}
if (upperPoint == 0) {
assert upperVal == null;
if (upperPoint == -1 && upperVal == null) {
inclusiveUpperPoint = Integer.MAX_VALUE;
} else if (includeUpper && upperPoint > 0) {
} else if (includeUpper && upperPoint >= 0) {
inclusiveUpperPoint = upperPoint;
} else if (upperPoint > 0) {
} else if (upperPoint >= 0) {
inclusiveUpperPoint = upperPoint - 1;
} else {
inclusiveUpperPoint = -upperPoint - 2;
}
if (inclusiveUpperPoint <= 0 || inclusiveLowerPoint > inclusiveUpperPoint)
if (inclusiveUpperPoint < 0 || inclusiveLowerPoint > inclusiveUpperPoint) {
return DocIdSet.EMPTY_DOCIDSET;
}
assert inclusiveLowerPoint > 0 && inclusiveUpperPoint > 0;
assert inclusiveLowerPoint >= 0 && inclusiveUpperPoint >= 0;
return new FieldCacheDocIdSet(context.reader().maxDoc(), acceptDocs) {
@Override
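
The bound arithmetic above relies on binarySearchLookup's encoding: an exact hit returns the ord (now >= 0), a miss returns -(insertion point) - 1, and a null key returns -1. Here is a standalone sketch of decoding the lower bound under that contract; the upper bound is symmetric, and the null-key special case is handled separately, as in the filter above:

// Standalone sketch of turning a binarySearchLookup result into an
// inclusive lower bound on term ords.
class RangeBoundExample {
  static int inclusiveLowerBound(int lowerPoint, boolean includeLower) {
    if (lowerPoint >= 0) {
      // exact hit: keep it, or step past it when the bound is exclusive
      return includeLower ? lowerPoint : lowerPoint + 1;
    }
    // miss: -lowerPoint - 1 is the first ord whose term is >= the key
    return -lowerPoint - 1;
  }
}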

View File

@@ -122,15 +122,21 @@ public class FieldCacheTermsFilter extends Filter {
final FixedBitSet bits = new FixedBitSet(fcsi.numOrd());
final BytesRef spare = new BytesRef();
for (int i=0;i<terms.length;i++) {
int termNumber = fcsi.binarySearchLookup(terms[i], spare);
if (termNumber > 0) {
bits.set(termNumber);
int ord = fcsi.binarySearchLookup(terms[i], spare);
if (ord >= 0) {
bits.set(ord);
}
}
return new FieldCacheDocIdSet(context.reader().maxDoc(), acceptDocs) {
@Override
protected final boolean matchDoc(int doc) {
return bits.get(fcsi.getOrd(doc));
int ord = fcsi.getOrd(doc);
if (ord == -1) {
// missing
return false;
} else {
return bits.get(ord);
}
}
};
}
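
For context, a hedged usage sketch of the filter above, using a made-up field and terms and the varargs constructor of the 4.x API: docs lacking the field now map to ord -1 and take the explicit missing branch of matchDoc() instead of silently aliasing the old reserved ord 0.

import org.apache.lucene.search.FieldCacheTermsFilter;
import org.apache.lucene.search.Filter;

class TermsFilterUsage {
  // Matches docs whose single "category" term is one of the given values;
  // docs without the field get ord -1 and never match.
  static Filter categoryFilter() {
    return new FieldCacheTermsFilter("category", "sports", "news");
  }
}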

View File

@@ -1171,6 +1171,9 @@ public abstract class FieldComparator<T> {
return docValue.compareTo(value);
}
// nocommit remove null from FC DocTerms/Index as an
// allowed value
/** Base class for specialized (per bit width of the
* ords) per-segment comparator. NOTE: this is messy;
* we do this only because hotspot can't reliably inline
@@ -1218,191 +1221,6 @@ public abstract class FieldComparator<T> {
}
}
// Used per-segment when bit width of doc->ord is 8:
private final class ByteOrdComparator extends PerSegmentComparator {
private final byte[] readerOrds;
private final DocTermsIndex termsIndex;
private final int docBase;
public ByteOrdComparator(byte[] readerOrds, DocTermsIndex termsIndex, int docBase) {
this.readerOrds = readerOrds;
this.termsIndex = termsIndex;
this.docBase = docBase;
}
@Override
public int compareBottom(int doc) {
assert bottomSlot != -1;
final int docOrd = (readerOrds[doc]&0xFF);
if (bottomSameReader) {
// ord is precisely comparable, even in the equal case
return bottomOrd - docOrd;
} else if (bottomOrd >= docOrd) {
// the equals case always means bottom is > doc
// (because we set bottomOrd to the lower bound in
// setBottom):
return 1;
} else {
return -1;
}
}
@Override
public void copy(int slot, int doc) {
final int ord = readerOrds[doc]&0xFF;
ords[slot] = ord;
if (ord == 0) {
values[slot] = null;
} else {
assert ord > 0;
if (values[slot] == null) {
values[slot] = new BytesRef();
}
termsIndex.lookup(ord, values[slot]);
}
readerGen[slot] = currentReaderGen;
}
}
// Used per-segment when bit width of doc->ord is 16:
private final class ShortOrdComparator extends PerSegmentComparator {
private final short[] readerOrds;
private final DocTermsIndex termsIndex;
private final int docBase;
public ShortOrdComparator(short[] readerOrds, DocTermsIndex termsIndex, int docBase) {
this.readerOrds = readerOrds;
this.termsIndex = termsIndex;
this.docBase = docBase;
}
@Override
public int compareBottom(int doc) {
assert bottomSlot != -1;
final int docOrd = (readerOrds[doc]&0xFFFF);
if (bottomSameReader) {
// ord is precisely comparable, even in the equal case
return bottomOrd - docOrd;
} else if (bottomOrd >= docOrd) {
// the equals case always means bottom is > doc
// (because we set bottomOrd to the lower bound in
// setBottom):
return 1;
} else {
return -1;
}
}
@Override
public void copy(int slot, int doc) {
final int ord = readerOrds[doc]&0xFFFF;
ords[slot] = ord;
if (ord == 0) {
values[slot] = null;
} else {
assert ord > 0;
if (values[slot] == null) {
values[slot] = new BytesRef();
}
termsIndex.lookup(ord, values[slot]);
}
readerGen[slot] = currentReaderGen;
}
}
// Used per-segment when bit width of doc->ord is 32:
private final class IntOrdComparator extends PerSegmentComparator {
private final int[] readerOrds;
private final DocTermsIndex termsIndex;
private final int docBase;
public IntOrdComparator(int[] readerOrds, DocTermsIndex termsIndex, int docBase) {
this.readerOrds = readerOrds;
this.termsIndex = termsIndex;
this.docBase = docBase;
}
@Override
public int compareBottom(int doc) {
assert bottomSlot != -1;
final int docOrd = readerOrds[doc];
if (bottomSameReader) {
// ord is precisely comparable, even in the equal case
return bottomOrd - docOrd;
} else if (bottomOrd >= docOrd) {
// the equals case always means bottom is > doc
// (because we set bottomOrd to the lower bound in
// setBottom):
return 1;
} else {
return -1;
}
}
@Override
public void copy(int slot, int doc) {
final int ord = readerOrds[doc];
ords[slot] = ord;
if (ord == 0) {
values[slot] = null;
} else {
assert ord > 0;
if (values[slot] == null) {
values[slot] = new BytesRef();
}
termsIndex.lookup(ord, values[slot]);
}
readerGen[slot] = currentReaderGen;
}
}
// Used per-segment when bit width is not a native array
// size (8, 16, 32):
private final class AnyDocToOrdComparator extends PerSegmentComparator {
private final PackedInts.Reader readerOrds;
private final DocTermsIndex termsIndex;
private final int docBase;
public AnyDocToOrdComparator(PackedInts.Reader readerOrds, DocTermsIndex termsIndex, int docBase) {
this.readerOrds = readerOrds;
this.termsIndex = termsIndex;
this.docBase = docBase;
}
@Override
public int compareBottom(int doc) {
assert bottomSlot != -1;
final int docOrd = (int) readerOrds.get(doc);
if (bottomSameReader) {
// ord is precisely comparable, even in the equal case
return bottomOrd - docOrd;
} else if (bottomOrd >= docOrd) {
// the equals case always means bottom is > doc
// (because we set bottomOrd to the lower bound in
// setBottom):
return 1;
} else {
return -1;
}
}
@Override
public void copy(int slot, int doc) {
final int ord = (int) readerOrds.get(doc);
ords[slot] = ord;
if (ord == 0) {
values[slot] = null;
} else {
assert ord > 0;
if (values[slot] == null) {
values[slot] = new BytesRef();
}
termsIndex.lookup(ord, values[slot]);
}
readerGen[slot] = currentReaderGen;
}
}
// Used per-segment when docToOrd is null:
private final class AnyOrdComparator extends PerSegmentComparator {
private final DocTermsIndex termsIndex;
@@ -1416,7 +1234,7 @@ public abstract class FieldComparator<T> {
@Override
public int compareBottom(int doc) {
assert bottomSlot != -1;
final int docOrd = (int) termsIndex.getOrd(doc);
final int docOrd = termsIndex.getOrd(doc);
if (bottomSameReader) {
// ord is precisely comparable, even in the equal case
return bottomOrd - docOrd;
@@ -1432,12 +1250,12 @@ public abstract class FieldComparator<T> {
@Override
public void copy(int slot, int doc) {
final int ord = (int) termsIndex.getOrd(doc);
final int ord = termsIndex.getOrd(doc);
ords[slot] = ord;
if (ord == 0) {
if (ord == -1) {
values[slot] = null;
} else {
assert ord > 0;
assert ord >= 0;
if (values[slot] == null) {
values[slot] = new BytesRef();
}
@@ -1451,29 +1269,7 @@ public abstract class FieldComparator<T> {
public FieldComparator<BytesRef> setNextReader(AtomicReaderContext context) throws IOException {
final int docBase = context.docBase;
termsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), field);
final PackedInts.Reader docToOrd = termsIndex.getDocToOrd();
FieldComparator<BytesRef> perSegComp = null;
if (docToOrd != null && docToOrd.hasArray()) {
final Object arr = docToOrd.getArray();
if (arr instanceof byte[]) {
perSegComp = new ByteOrdComparator((byte[]) arr, termsIndex, docBase);
} else if (arr instanceof short[]) {
perSegComp = new ShortOrdComparator((short[]) arr, termsIndex, docBase);
} else if (arr instanceof int[]) {
perSegComp = new IntOrdComparator((int[]) arr, termsIndex, docBase);
}
// Don't specialize the long[] case since it's not
// possible, ie, worse case is MAX_INT-1 docs with
// every one having a unique value.
}
if (perSegComp == null) {
if (docToOrd != null) {
perSegComp = new AnyDocToOrdComparator(docToOrd, termsIndex, docBase);
} else {
perSegComp = new AnyOrdComparator(termsIndex, docBase);
}
}
FieldComparator<BytesRef> perSegComp = new AnyOrdComparator(termsIndex, docBase);
currentReaderGen++;
if (bottomSlot != -1) {
perSegComp.setBottom(bottomSlot);
@@ -1492,9 +1288,9 @@ public abstract class FieldComparator<T> {
bottomSameReader = true;
} else {
if (bottomValue == null) {
// 0 ord is null for all segments
assert ords[bottomSlot] == 0;
bottomOrd = 0;
// -1 ord is null for all segments
assert ords[bottomSlot] == -1;
bottomOrd = -1;
bottomSameReader = true;
readerGen[bottomSlot] = currentReaderGen;
} else {
@@ -2118,7 +1914,7 @@ public abstract class FieldComparator<T> {
}
final protected static int binarySearch(BytesRef br, DocTermsIndex a, BytesRef key) {
return binarySearch(br, a, key, 1, a.numOrd()-1);
return binarySearch(br, a, key, 0, a.numOrd()-1);
}
final protected static int binarySearch(BytesRef br, DocTermsIndex a, BytesRef key, int low, int high) {
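
With the specialized byte/short/int ord comparators removed, AnyOrdComparator's raw ord arithmetic must hold under the new convention, and it does: -1 compares below every real ord, so missing docs sort first in ascending order. A minimal illustration of the same bottomOrd - docOrd trick used above, safe while ord counts stay far below Integer.MAX_VALUE:

// Sketch: ord comparison within one segment, where -1 means missing.
class OrdCompareExample {
  static int compareByOrd(int ordA, int ordB) {
    // -1 (missing) is smaller than any real ord 0..numOrd()-1, so docs
    // without the field sort before all docs that have it.
    return ordA - ordB;
  }
}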

View File

@@ -494,6 +494,11 @@ public class TestDemoDocValue extends LuceneTestCase {
directory.close();
}
// nocommit tests should fail if a codec makes the [easy]
// mistake of NOT opening all files when SimpleDVProducer
// is created ... first cut of Lucene41 had this bug but
// no tests failed!?
public void testDemoSortedBytes() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());

View File

@@ -1628,9 +1628,9 @@ public class TestIndexWriter extends LuceneTestCase {
assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
FieldCache.DocTermsIndex dti = FieldCache.DEFAULT.getTermsIndex(SlowCompositeReaderWrapper.wrap(reader), "content", random().nextFloat() * PackedInts.FAST);
assertEquals(5, dti.numOrd()); // +1 for null ord
assertEquals(4, dti.numOrd());
assertEquals(4, dti.size());
assertEquals(bigTermBytesRef, dti.lookup(3, new BytesRef()));
assertEquals(bigTermBytesRef, dti.lookup(2, new BytesRef()));
reader.close();
dir.close();
}

View File

@@ -144,7 +144,7 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
// fill into an OpenBitSet
do {
long ord = termsEnum.ord();
if (ord > 0) {
if (ord >= 0) {
termSet.set(ord);
}
} while (termsEnum.next() != null);
@@ -155,7 +155,11 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
return new FieldCacheDocIdSet(context.reader().maxDoc(), acceptDocs) {
@Override
protected final boolean matchDoc(int doc) throws ArrayIndexOutOfBoundsException {
return termSet.get(fcsi.getOrd(doc));
int ord = fcsi.getOrd(doc);
if (ord == -1) {
return false;
}
return termSet.get(ord);
}
};
}

View File

@@ -159,7 +159,7 @@ class ElevationComparatorSource extends FieldComparatorSource {
private int docVal(int doc) {
int ord = idIndex.getOrd(doc);
if (ord == 0) {
if (ord == -1) {
return 0;
} else {
BytesRef id = idIndex.lookup(ord, tempBR);

View File

@@ -203,7 +203,7 @@ public class TestFieldCache extends LuceneTestCase {
TermsEnum tenum = termsIndex.getTermsEnum();
BytesRef val = new BytesRef();
for (int i=1; i<nTerms; i++) {
for (int i=0; i<nTerms; i++) {
BytesRef val1 = tenum.next();
BytesRef val2 = termsIndex.lookup(i,val);
// System.out.println("i="+i);

View File

@@ -211,6 +211,7 @@ public class TestSort extends LuceneTestCase {
writer.addDocument (doc);
}
}
IndexReader reader = writer.getReader();
writer.close ();
IndexSearcher s = newSearcher(reader);
@@ -252,6 +253,8 @@
}
String numFixed = getRandomCharString(fixedLen, 48, 52);
// nocommit shouldn't this be tracer_fixed? how is
// this passing?
doc.add (new Field ("fixed_tracer", numFixed, onlyStored));
//doc.add (new Field ("contents", Integer.toString(i), Field.Store.NO, Field.Index.ANALYZED));
doc.add(new StringField("string_fixed", numFixed, Field.Store.NO));
@@ -269,11 +272,42 @@
writer.addDocument (doc);
}
// nocommit
//writer.forceMerge(1);
//System.out.println(writer.getSegmentCount());
writer.close();
IndexReader reader = DirectoryReader.open(indexStore);
return newSearcher(reader);
IndexSearcher searcher = newSearcher(reader);
/*
for(int docID=0;docID<reader.maxDoc();docID++) {
StoredDocument doc = reader.document(docID);
String s = doc.get("tracer");
TopDocs hits = searcher.search(new TermQuery(new Term("string", s)), NUM_STRINGS);
System.out.println("string=" + s + " has " + hits.totalHits + " docs");
boolean found = false;
for(int hit=0;!found && hit<hits.totalHits;hit++) {
if (hits.scoreDocs[hit].doc == docID) {
found = true;
break;
}
}
assertTrue(found);
s = doc.get("tracer2");
hits = searcher.search(new TermQuery(new Term("string2", s)), NUM_STRINGS);
System.out.println("string2=" + s + " has " + hits.totalHits + " docs");
found = false;
for(int hit=0;!found && hit<hits.totalHits;hit++) {
if (hits.scoreDocs[hit].doc == docID) {
found = true;
break;
}
}
assertTrue(found);
}
*/
return searcher;
}
public String getRandomNumberString(int num, int low, int high) {
@@ -533,7 +567,7 @@ public class TestSort extends LuceneTestCase {
StorableField[] v = doc2.getFields("tracer" + fieldSuffix);
StorableField[] v2 = doc2.getFields("tracer2" + fieldSuffix);
for (int j = 0; j < v.length; ++j) {
buff.append(v[j] + "(" + v2[j] + ")(" + result[x].doc+")\n");
buff.append(v[j].stringValue() + "(" + v2[j].stringValue() + ")(" + result[x].doc+")\n");
if (last != null) {
int cmp = v[j].stringValue().compareTo(last);
if (!(cmp >= 0)) { // ensure first field is in order