also add PointValues.getDocCount stat, and check it in CheckIndex

Mike McCandless 2016-03-02 18:39:57 -05:00
parent 85dbdb7659
commit b5475d10e1
17 changed files with 130 additions and 25 deletions

SimpleTextBKDReader.java

@@ -34,8 +34,8 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextPointWriter.BLOCK_VA
 class SimpleTextBKDReader extends BKDReader {

   public SimpleTextBKDReader(IndexInput datIn, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues,
-                             byte[] minPackedValue, byte[] maxPackedValue, long pointCount) throws IOException {
-    super(datIn, numDims, maxPointsInLeafNode, bytesPerDim, leafBlockFPs, splitPackedValues, minPackedValue, maxPackedValue, pointCount);
+                             byte[] minPackedValue, byte[] maxPackedValue, long pointCount, int docCount) throws IOException {
+    super(datIn, numDims, maxPointsInLeafNode, bytesPerDim, leafBlockFPs, splitPackedValues, minPackedValue, maxPackedValue, pointCount, docCount);
   }

   @Override

SimpleTextPointReader.java

@@ -39,6 +39,7 @@ import org.apache.lucene.util.bkd.BKDReader;
 import static org.apache.lucene.codecs.simpletext.SimpleTextPointWriter.BLOCK_FP;
 import static org.apache.lucene.codecs.simpletext.SimpleTextPointWriter.BYTES_PER_DIM;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointWriter.DOC_COUNT;
 import static org.apache.lucene.codecs.simpletext.SimpleTextPointWriter.FIELD_COUNT;
 import static org.apache.lucene.codecs.simpletext.SimpleTextPointWriter.FIELD_FP;
 import static org.apache.lucene.codecs.simpletext.SimpleTextPointWriter.FIELD_FP_NAME;
@@ -124,6 +125,10 @@ class SimpleTextPointReader extends PointReader {
     readLine(dataIn);
     assert startsWith(POINT_COUNT);
     long pointCount = parseLong(POINT_COUNT);
+    readLine(dataIn);
+    assert startsWith(DOC_COUNT);
+    int docCount = parseInt(DOC_COUNT);
+
     long[] leafBlockFPs = new long[count];
     for(int i=0;i<count;i++) {
@@ -144,7 +149,7 @@ class SimpleTextPointReader extends PointReader {
       System.arraycopy(br.bytes, br.offset, splitPackedValues, (1 + bytesPerDim) * i + 1, bytesPerDim);
     }

-    return new SimpleTextBKDReader(dataIn, numDims, maxPointsInLeafNode, bytesPerDim, leafBlockFPs, splitPackedValues, minValue.bytes, maxValue.bytes, pointCount);
+    return new SimpleTextBKDReader(dataIn, numDims, maxPointsInLeafNode, bytesPerDim, leafBlockFPs, splitPackedValues, minValue.bytes, maxValue.bytes, pointCount, docCount);
   }

   private void readLine(IndexInput in) throws IOException {
@@ -283,4 +288,15 @@ class SimpleTextPointReader extends PointReader {
     }
     return bkdReader.getPointCount();
   }
+
+  @Override
+  public int getDocCount(String fieldName) {
+    BKDReader bkdReader = getBKDReader(fieldName);
+    if (bkdReader == null) {
+      // Schema ghost corner case! This field did index points in the past, but
+      // now all docs having this field were deleted in this segment:
+      return 0;
+    }
+    return bkdReader.getDocCount();
+  }
 }

SimpleTextPointWriter.java

@@ -52,6 +52,7 @@ class SimpleTextPointWriter extends PointWriter {
   final static BytesRef MIN_VALUE = new BytesRef("min value ");
   final static BytesRef MAX_VALUE = new BytesRef("max value ");
   final static BytesRef POINT_COUNT = new BytesRef("point count ");
+  final static BytesRef DOC_COUNT = new BytesRef("doc count ");

   private IndexOutput dataOut;
   final BytesRefBuilder scratch = new BytesRefBuilder();
@@ -68,12 +69,13 @@ class SimpleTextPointWriter extends PointWriter {
   public void writeField(FieldInfo fieldInfo, PointReader values) throws IOException {

     // We use the normal BKDWriter, but subclass to customize how it writes the index and blocks to disk:
-    try (BKDWriter writer = new BKDWriter(writeState.directory,
-                                          writeState.segmentInfo.name,
-                                          fieldInfo.getPointDimensionCount(),
-                                          fieldInfo.getPointNumBytes(),
-                                          BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
-                                          BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP) {
+    try (BKDWriter writer = new BKDWriter(writeState.segmentInfo.maxDoc(),
+                                          writeState.directory,
+                                          writeState.segmentInfo.name,
+                                          fieldInfo.getPointDimensionCount(),
+                                          fieldInfo.getPointNumBytes(),
+                                          BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
+                                          BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP) {

       @Override
       protected void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
@@ -107,6 +109,10 @@ class SimpleTextPointWriter extends PointWriter {
         writeLong(out, pointCount);
         newline(out);

+        write(out, DOC_COUNT);
+        writeInt(out, docsSeen.cardinality());
+        newline(out);
+
         for(int i=0;i<leafBlockFPs.length;i++) {
           write(out, BLOCK_FP);
           writeLong(out, leafBlockFPs[i]);
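For orientation, the SimpleText index section now records the new stat as its own human-readable line between the point count and the per-leaf block pointers. A hedged sketch of the resulting lines in the index (values illustrative; the "block fp " label is assumed from the BLOCK_FP constant, whose definition is not shown here):

point count 42
doc count 17
block fp 1024
...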

PointFormat.java

@@ -100,6 +100,11 @@ public abstract class PointFormat {
       public long size(String fieldName) {
         throw new IllegalArgumentException("field=\"" + fieldName + "\" was not indexed with points");
       }
+
+      @Override
+      public int getDocCount(String fieldName) {
+        throw new IllegalArgumentException("field=\"" + fieldName + "\" was not indexed with points");
+      }
     };
   }
 };

PointWriter.java

@@ -121,6 +121,10 @@ public abstract class PointWriter implements Closeable {
        throw new UnsupportedOperationException();
      }
+
+     @Override
+     public int getDocCount(String fieldName) {
+       throw new UnsupportedOperationException();
+     }
    });
  }

Lucene60PointReader.java

@@ -226,5 +226,16 @@ public class Lucene60PointReader extends PointReader implements Closeable {
     }
     return bkdReader.getPointCount();
   }
+
+  @Override
+  public int getDocCount(String fieldName) {
+    BKDReader bkdReader = getBKDReader(fieldName);
+    if (bkdReader == null) {
+      // Schema ghost corner case! This field did index points in the past, but
+      // now all docs having this point field were deleted in this segment:
+      return 0;
+    }
+    return bkdReader.getDocCount();
+  }
 }

Lucene60PointWriter.java

@@ -82,7 +82,8 @@ public class Lucene60PointWriter extends PointWriter implements Closeable {
   @Override
   public void writeField(FieldInfo fieldInfo, PointReader values) throws IOException {

-    try (BKDWriter writer = new BKDWriter(writeState.directory,
+    try (BKDWriter writer = new BKDWriter(writeState.segmentInfo.maxDoc(),
+                                          writeState.directory,
                                           writeState.segmentInfo.name,
                                           fieldInfo.getPointDimensionCount(),
                                           fieldInfo.getPointNumBytes(),
@@ -129,7 +130,8 @@ public class Lucene60PointWriter extends PointWriter implements Closeable {
         // Optimize the 1D case to use BKDWriter.merge, which does a single merge sort of the
         // already sorted incoming segments, instead of trying to sort all points again as if
         // we were simply reindexing them:
-        try (BKDWriter writer = new BKDWriter(writeState.directory,
+        try (BKDWriter writer = new BKDWriter(writeState.segmentInfo.maxDoc(),
+                                              writeState.directory,
                                               writeState.segmentInfo.name,
                                               fieldInfo.getPointDimensionCount(),
                                               fieldInfo.getPointNumBytes(),

CheckIndex.java

@@ -1695,6 +1695,7 @@ public final class CheckIndex implements Closeable {
       }
       for (FieldInfo fieldInfo : fieldInfos) {
         if (fieldInfo.getPointDimensionCount() > 0) {
+          FixedBitSet docsSeen = new FixedBitSet(reader.maxDoc());
           status.totalValueFields++;
           int dimCount = fieldInfo.getPointDimensionCount();
           int bytesPerDim = fieldInfo.getPointNumBytes();
@@ -1709,6 +1710,12 @@ public final class CheckIndex implements Closeable {
           byte[] globalMinPackedValue = values.getMinPackedValue(fieldInfo.name);
           long size = values.size(fieldInfo.name);
+          int docCount = values.getDocCount(fieldInfo.name);
+
+          if (docCount > size) {
+            throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points and inconsistent docCount=" + docCount);
+          }
+
           if (globalMinPackedValue == null) {
             if (size != 0) {
               throw new RuntimeException("getMinPackedValue is null points for field \"" + fieldInfo.name + "\" yet size=" + size);
@@ -1739,6 +1746,7 @@ public final class CheckIndex implements Closeable {
                              public void visit(int docID, byte[] packedValue) {
                                checkPackedValue("packed value", packedValue, docID);
                                pointCountSeen[0]++;
+                               docsSeen.set(docID);
                                for(int dim=0;dim<dimCount;dim++) {
                                  int offset = bytesPerDim * dim;
@@ -1821,9 +1829,12 @@ public final class CheckIndex implements Closeable {
                            });

           if (pointCountSeen[0] != size) {
-            System.out.println("HERE: " + values);
             throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points, but in fact has " + pointCountSeen[0]);
           }
+
+          if (docsSeen.cardinality() != docCount) {
+            throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but in fact has " + docsSeen.cardinality());
+          }
         }
       }
     }
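Two invariants back the new check: docCount can never exceed size, because every counted document contributes at least one point, and the stored docCount must equal the number of distinct docIDs the intersect visitor actually touches. A minimal self-contained sketch of that counting logic, with hypothetical docIDs (FixedBitSet.set is idempotent, so cardinality() yields distinct docs):

import org.apache.lucene.util.FixedBitSet;

public class DocCountCheckSketch {
  public static void main(String[] args) {
    FixedBitSet docsSeen = new FixedBitSet(10);  // sized to maxDoc, here 10
    int[] visited = {3, 3, 7};                   // doc 3 indexed two points
    long size = visited.length;                  // total points = 3
    for (int docID : visited) {
      docsSeen.set(docID);                       // setting doc 3 twice is a no-op
    }
    int docCount = docsSeen.cardinality();       // distinct docs = 2
    if (docCount > size) {
      throw new RuntimeException("docCount=" + docCount + " exceeds size=" + size);
    }
  }
}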

ParallelLeafReader.java

@@ -394,6 +394,19 @@ public class ParallelLeafReader extends LeafReader {
         }
         return dimValues.size(fieldName);
       }
+
+      @Override
+      public int getDocCount(String fieldName) {
+        LeafReader reader = fieldToReader.get(fieldName);
+        if (reader == null) {
+          return 0;
+        }
+        PointValues dimValues = reader.getPointValues();
+        if (dimValues == null) {
+          return 0;
+        }
+        return dimValues.getDocCount(fieldName);
+      }
     };
   }

PointValues.java

@@ -95,5 +95,6 @@ public abstract class PointValues {
   /** Returns the total number of indexed points across all documents in this field. */
   public abstract long size(String fieldName);

+  // nocommit make "delete all point docs then force merge" and then check stats test
+  /** Returns the total number of documents that have indexed at least one point for this field. */
+  public abstract int getDocCount(String fieldName);
 }
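The two stats diverge exactly when a document indexes more than one point for a field: every point increments size(), while getDocCount() counts each document once. A hedged consumer-side sketch (the reader setup and the multi-valued "location" field are hypothetical):

PointValues values = leafReader.getPointValues();
if (values != null) {
  long points = values.size("location");      // every indexed point counts
  int docs = values.getDocCount("location");  // each doc counts at most once
  assert docs <= points;                      // the invariant CheckIndex now enforces
}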

PointValuesWriter.java

@@ -113,6 +113,11 @@ class PointValuesWriter {
       public long size(String fieldName) {
         throw new UnsupportedOperationException();
       }
+
+      @Override
+      public int getDocCount(String fieldName) {
+        throw new UnsupportedOperationException();
+      }
     });
   }
 }

SlowCodecReaderWrapper.java

@@ -177,6 +177,11 @@ public final class SlowCodecReaderWrapper {
       public long size(String fieldName) {
         return values.size(fieldName);
       }
+
+      @Override
+      public int getDocCount(String fieldName) {
+        return values.getDocCount(fieldName);
+      }
     };
   }

BKDReader.java

@@ -43,6 +43,7 @@ public class BKDReader implements Accountable {
   final byte[] minPackedValue;
   final byte[] maxPackedValue;
   final long pointCount;
+  final int docCount;
   protected final int packedBytesLength;

   /** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */
@@ -65,6 +66,7 @@ public class BKDReader implements Accountable {
     in.readBytes(maxPackedValue, 0, packedBytesLength);

     pointCount = in.readVLong();
+    docCount = in.readVInt();

     splitPackedValues = new byte[(1+bytesPerDim)*numLeaves];
@@ -126,7 +128,7 @@ public class BKDReader implements Accountable {
   /** Called by consumers that have their own on-disk format for the index (e.g. SimpleText) */
   protected BKDReader(IndexInput in, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues,
-                      byte[] minPackedValue, byte[] maxPackedValue, long pointCount) throws IOException {
+                      byte[] minPackedValue, byte[] maxPackedValue, long pointCount, int docCount) throws IOException {
     this.in = in;
     this.numDims = numDims;
     this.maxPointsInLeafNode = maxPointsInLeafNode;
@@ -138,6 +140,7 @@ public class BKDReader implements Accountable {
     this.minPackedValue = minPackedValue;
     this.maxPackedValue = maxPackedValue;
     this.pointCount = pointCount;
+    this.docCount = docCount;
     assert minPackedValue.length == packedBytesLength;
     assert maxPackedValue.length == packedBytesLength;
   }
@@ -436,4 +439,8 @@ public class BKDReader implements Accountable {
   public long getPointCount() {
     return pointCount;
   }
+
+  public int getDocCount() {
+    return docCount;
+  }
 }

BKDWriter.java

@@ -34,6 +34,7 @@ import org.apache.lucene.store.TrackingDirectoryWrapper;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntroSorter;
 import org.apache.lucene.util.LongBitSet;
@@ -110,6 +111,8 @@ public class BKDWriter implements Closeable {
   final byte[] scratch2;
   final int[] commonPrefixLengths;

+  protected final FixedBitSet docsSeen;
+
   private OfflinePointWriter offlinePointWriter;
   private HeapPointWriter heapPointWriter;
@@ -125,11 +128,11 @@ public class BKDWriter implements Closeable {
   protected long pointCount;

-  public BKDWriter(Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim) throws IOException {
-    this(tempDir, tempFileNamePrefix, numDims, bytesPerDim, DEFAULT_MAX_POINTS_IN_LEAF_NODE, DEFAULT_MAX_MB_SORT_IN_HEAP);
+  public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim) throws IOException {
+    this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, DEFAULT_MAX_POINTS_IN_LEAF_NODE, DEFAULT_MAX_MB_SORT_IN_HEAP);
   }

-  public BKDWriter(Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap) throws IOException {
+  public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap) throws IOException {
     verifyParams(numDims, maxPointsInLeafNode, maxMBSortInHeap);
     // We use tracking dir to deal with removing files on exception, so each place that
     // creates temp files doesn't need crazy try/finally/sucess logic:
@@ -138,6 +141,7 @@ public class BKDWriter implements Closeable {
     this.maxPointsInLeafNode = maxPointsInLeafNode;
     this.numDims = numDims;
     this.bytesPerDim = bytesPerDim;
+    docsSeen = new FixedBitSet(maxDoc);
     packedBytesLength = numDims * bytesPerDim;

     scratchDiff = new byte[bytesPerDim];
@@ -239,6 +243,7 @@ public class BKDWriter implements Closeable {
     }

     pointCount++;
+    docsSeen.set(docID);
   }

   /** How many points have been added so far */
@@ -420,8 +425,10 @@ public class BKDWriter implements Closeable {
       // System.out.println("iter reader=" + reader);

       // NOTE: doesn't work with subclasses (e.g. SimpleText!)
-      leafBlockDocIDs[leafCount] = reader.docIDBase + reader.docID;
+      int docID = reader.docIDBase + reader.docID;
+      leafBlockDocIDs[leafCount] = docID;
       System.arraycopy(reader.state.scratchPackedValue, 0, leafBlockPackedValues[leafCount], 0, packedBytesLength);
+      docsSeen.set(docID);

       if (valueCount == 0) {
         System.arraycopy(reader.state.scratchPackedValue, 0, minPackedValue, 0, packedBytesLength);
@@ -862,6 +869,7 @@ public class BKDWriter implements Closeable {
     out.writeBytes(maxPackedValue, 0, packedBytesLength);
     out.writeVLong(pointCount);
+    out.writeVInt(docsSeen.cardinality());

     // TODO: for 1D case, don't waste the first byte of each split value (it's always 0)
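Note how this write side pairs with BKDReader's read order shown earlier: pointCount goes out as a vLong followed by docCount as a vInt, and the reader consumes them in the same order. A condensed side-by-side sketch (out and in stand for the IndexOutput/IndexInput already in scope in each class):

// write side, in BKDWriter's writeIndex:
out.writeVLong(pointCount);
out.writeVInt(docsSeen.cardinality());

// read side, in BKDReader's constructor:
pointCount = in.readVLong();
docCount = in.readVInt();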

TestBKD.java

@@ -42,7 +42,7 @@ public class TestBKD extends LuceneTestCase {
   public void testBasicInts1D() throws Exception {
     try (Directory dir = getDirectory(100)) {
-      BKDWriter w = new BKDWriter(dir, "tmp", 1, 4, 2, 1.0f);
+      BKDWriter w = new BKDWriter(100, dir, "tmp", 1, 4, 2, 1.0f);
       byte[] scratch = new byte[4];
       for(int docID=0;docID<100;docID++) {
         NumericUtils.intToBytes(docID, scratch, 0);
@@ -117,7 +117,7 @@ public class TestBKD extends LuceneTestCase {
       int numDims = TestUtil.nextInt(random(), 1, 5);
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 100);
       float maxMB = (float) 3.0 + (3*random().nextFloat());
-      BKDWriter w = new BKDWriter(dir, "tmp", numDims, 4, maxPointsInLeafNode, maxMB);
+      BKDWriter w = new BKDWriter(numDocs, dir, "tmp", numDims, 4, maxPointsInLeafNode, maxMB);

       if (VERBOSE) {
         System.out.println("TEST: numDims=" + numDims + " numDocs=" + numDocs);
@@ -258,7 +258,7 @@ public class TestBKD extends LuceneTestCase {
       int numDims = TestUtil.nextInt(random(), 1, 5);
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 100);
       float maxMB = (float) 3.0 + (3*random().nextFloat());
-      BKDWriter w = new BKDWriter(dir, "tmp", numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
+      BKDWriter w = new BKDWriter(numDocs, dir, "tmp", numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);

       BigInteger[][] docs = new BigInteger[numDocs][];
       byte[] scratch = new byte[numBytesPerDim*numDims];
@@ -431,7 +431,7 @@ public class TestBKD extends LuceneTestCase {
   public void testTooLittleHeap() throws Exception {
     try (Directory dir = getDirectory(0)) {
       IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
-        new BKDWriter(dir, "bkd", 1, 16, 1000000, 0.001);
+        new BKDWriter(1, dir, "bkd", 1, 16, 1000000, 0.001);
       });
       assertTrue(expected.getMessage().contains("either increase maxMBSortInHeap or decrease maxPointsInLeafNode"));
     }
@@ -631,7 +631,7 @@ public class TestBKD extends LuceneTestCase {
       List<Integer> docIDBases = null;
       int seg = 0;
-      BKDWriter w = new BKDWriter(dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
+      BKDWriter w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
       IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT);
       IndexInput in = null;
@@ -685,7 +685,7 @@ public class TestBKD extends LuceneTestCase {
           seg++;
           maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 1000);
           maxMB = (float) 3.0 + (3*random().nextDouble());
-          w = new BKDWriter(dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
+          w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
           lastDocIDBase = docID;
         }
       }
@@ -701,7 +701,7 @@ public class TestBKD extends LuceneTestCase {
       out.close();
       in = dir.openInput("bkd", IOContext.DEFAULT);
       seg++;
-      w = new BKDWriter(dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
+      w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);

       List<BKDReader> readers = new ArrayList<>();
       for(long fp : toMerge) {
         in.seek(fp);

AssertingPointFormat.java

@@ -221,6 +221,12 @@ public final class AssertingPointFormat extends PointFormat {
       // TODO: what to assert?
       return in.size(fieldName);
     }
+
+    @Override
+    public int getDocCount(String fieldName) {
+      // TODO: what to assert?
+      return in.getDocCount(fieldName);
+    }
   }

   static class AssertingPointWriter extends PointWriter {

CrankyPointFormat.java

@@ -176,5 +176,10 @@ class CrankyPointFormat extends PointFormat {
     public long size(String fieldName) {
       return delegate.size(fieldName);
     }
+
+    @Override
+    public int getDocCount(String fieldName) {
+      return delegate.getDocCount(fieldName);
+    }
   }
 }