LUCENE-7120: re-use readers while recursing in BKDWriter so we get more thorough checksum verification on its temp files

This commit is contained in:
Mike McCandless 2016-03-21 05:49:47 -04:00
parent 5a40ae0305
commit 0e189bca2d
5 changed files with 78 additions and 21 deletions

View File

@ -227,7 +227,7 @@ public class BKDWriter implements Closeable {
private void spillToOffline() throws IOException { private void spillToOffline() throws IOException {
// For each .add we just append to this input file, then in .finish we sort this input and resursively build the tree: // For each .add we just append to this input file, then in .finish we sort this input and resursively build the tree:
offlinePointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, longOrds, "spill", singleValuePerDoc); offlinePointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, longOrds, "spill", 0, singleValuePerDoc);
tempInput = offlinePointWriter.out; tempInput = offlinePointWriter.out;
PointReader reader = heapPointWriter.getReader(0, pointCount); PointReader reader = heapPointWriter.getReader(0, pointCount);
for(int i=0;i<pointCount;i++) { for(int i=0;i<pointCount;i++) {
@ -851,6 +851,9 @@ public class BKDWriter implements Closeable {
// Sort all docs once by each dimension: // Sort all docs once by each dimension:
PathSlice[] sortedPointWriters = new PathSlice[numDims]; PathSlice[] sortedPointWriters = new PathSlice[numDims];
// This is only used on exception; on normal code paths we close all files we opened:
List<Closeable> toCloseHeroically = new ArrayList<>();
boolean success = false; boolean success = false;
try { try {
//long t0 = System.nanoTime(); //long t0 = System.nanoTime();
@ -872,7 +875,8 @@ public class BKDWriter implements Closeable {
ordBitSet, out, ordBitSet, out,
minPackedValue, maxPackedValue, minPackedValue, maxPackedValue,
splitPackedValues, splitPackedValues,
leafBlockFPs); leafBlockFPs,
toCloseHeroically);
for(PathSlice slice : sortedPointWriters) { for(PathSlice slice : sortedPointWriters) {
slice.writer.destroy(); slice.writer.destroy();
@ -886,11 +890,8 @@ public class BKDWriter implements Closeable {
success = true; success = true;
} finally { } finally {
if (success == false) { if (success == false) {
if (tempInput != null) {
IOUtils.closeWhileHandlingException(tempInput);
}
IOUtils.deleteFilesIgnoringExceptions(tempDir, tempDir.getCreatedFiles()); IOUtils.deleteFilesIgnoringExceptions(tempDir, tempDir.getCreatedFiles());
tempInput = null; IOUtils.closeWhileHandlingException(toCloseHeroically);
} }
} }
@ -1010,6 +1011,8 @@ public class BKDWriter implements Closeable {
// Now we mark ords that fall into the right half, so we can partition on all other dims that are not the split dim: // Now we mark ords that fall into the right half, so we can partition on all other dims that are not the split dim:
// Read the split value, then mark all ords in the right tree (larger than the split value): // Read the split value, then mark all ords in the right tree (larger than the split value):
// TODO: find a way to also checksum this reader? If we changed to markLeftTree, and scanned the final chunk, it could work?
try (PointReader reader = source.writer.getReader(source.start + source.count - rightCount, rightCount)) { try (PointReader reader = source.writer.getReader(source.start + source.count - rightCount, rightCount)) {
boolean result = reader.next(); boolean result = reader.next();
assert result; assert result;
@ -1069,12 +1072,11 @@ public class BKDWriter implements Closeable {
} }
/** Pull a partition back into heap once the point count is low enough while recursing. */ /** Pull a partition back into heap once the point count is low enough while recursing. */
private PathSlice switchToHeap(PathSlice source) throws IOException { private PathSlice switchToHeap(PathSlice source, List<Closeable> toCloseHeroically) throws IOException {
int count = Math.toIntExact(source.count); int count = Math.toIntExact(source.count);
try ( // Not inside the try because we don't want to close it here:
PointWriter writer = new HeapPointWriter(count, count, packedBytesLength, longOrds, singleValuePerDoc); PointReader reader = source.writer.getSharedReader(source.start, source.count, toCloseHeroically);
PointReader reader = source.writer.getReader(source.start, source.count); try (PointWriter writer = new HeapPointWriter(count, count, packedBytesLength, longOrds, singleValuePerDoc)) {
) {
for(int i=0;i<count;i++) { for(int i=0;i<count;i++) {
boolean hasNext = reader.next(); boolean hasNext = reader.next();
assert hasNext; assert hasNext;
@ -1096,7 +1098,8 @@ public class BKDWriter implements Closeable {
IndexOutput out, IndexOutput out,
byte[] minPackedValue, byte[] maxPackedValue, byte[] minPackedValue, byte[] maxPackedValue,
byte[] splitPackedValues, byte[] splitPackedValues,
long[] leafBlockFPs) throws IOException { long[] leafBlockFPs,
List<Closeable> toCloseHeroically) throws IOException {
for(PathSlice slice : slices) { for(PathSlice slice : slices) {
assert slice.count == slices[0].count; assert slice.count == slices[0].count;
@ -1104,7 +1107,7 @@ public class BKDWriter implements Closeable {
if (numDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) { if (numDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) {
// Special case for 1D, to cutover to heap once we recurse deeply enough: // Special case for 1D, to cutover to heap once we recurse deeply enough:
slices[0] = switchToHeap(slices[0]); slices[0] = switchToHeap(slices[0], toCloseHeroically);
} }
if (nodeID >= leafNodeOffset) { if (nodeID >= leafNodeOffset) {
@ -1114,7 +1117,7 @@ public class BKDWriter implements Closeable {
if (slices[dim].writer instanceof HeapPointWriter == false) { if (slices[dim].writer instanceof HeapPointWriter == false) {
// Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started // Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started
// offline, but then kept splitting only in one dimension, and so never had to rewrite into heap writer // offline, but then kept splitting only in one dimension, and so never had to rewrite into heap writer
slices[dim] = switchToHeap(slices[dim]); slices[dim] = switchToHeap(slices[dim], toCloseHeroically);
} }
PathSlice source = slices[dim]; PathSlice source = slices[dim];
@ -1212,7 +1215,8 @@ public class BKDWriter implements Closeable {
for(int dim=0;dim<numDims;dim++) { for(int dim=0;dim<numDims;dim++) {
if (dim == splitDim) { if (dim == splitDim) {
// No need to partition on this dim since it's a simple slice of the incoming already sorted slice. // No need to partition on this dim since it's a simple slice of the incoming already sorted slice, and we
// will re-use its shared reader when visiting it as we recurse:
leftSlices[dim] = new PathSlice(source.writer, source.start, leftCount); leftSlices[dim] = new PathSlice(source.writer, source.start, leftCount);
rightSlices[dim] = new PathSlice(source.writer, source.start + leftCount, rightCount); rightSlices[dim] = new PathSlice(source.writer, source.start + leftCount, rightCount);
System.arraycopy(splitValue, 0, minSplitPackedValue, dim*bytesPerDim, bytesPerDim); System.arraycopy(splitValue, 0, minSplitPackedValue, dim*bytesPerDim, bytesPerDim);
@ -1220,9 +1224,12 @@ public class BKDWriter implements Closeable {
continue; continue;
} }
// Not inside the try because we don't want to close this one now, so that after recursion is done,
// we will have done a singel full sweep of the file:
PointReader reader = slices[dim].writer.getSharedReader(slices[dim].start, slices[dim].count, toCloseHeroically);
try (PointWriter leftPointWriter = getPointWriter(leftCount, "left" + dim); try (PointWriter leftPointWriter = getPointWriter(leftCount, "left" + dim);
PointWriter rightPointWriter = getPointWriter(source.count - leftCount, "right" + dim); PointWriter rightPointWriter = getPointWriter(source.count - leftCount, "right" + dim)) {
PointReader reader = slices[dim].writer.getReader(slices[dim].start, slices[dim].count);) {
long nextRightCount = reader.split(source.count, ordBitSet, leftPointWriter, rightPointWriter, dim == dimToClear); long nextRightCount = reader.split(source.count, ordBitSet, leftPointWriter, rightPointWriter, dim == dimToClear);
if (rightCount != nextRightCount) { if (rightCount != nextRightCount) {
@ -1240,7 +1247,7 @@ public class BKDWriter implements Closeable {
build(2*nodeID, leafNodeOffset, leftSlices, build(2*nodeID, leafNodeOffset, leftSlices,
ordBitSet, out, ordBitSet, out,
minPackedValue, maxSplitPackedValue, minPackedValue, maxSplitPackedValue,
splitPackedValues, leafBlockFPs); splitPackedValues, leafBlockFPs, toCloseHeroically);
for(int dim=0;dim<numDims;dim++) { for(int dim=0;dim<numDims;dim++) {
// Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim: // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
if (dim != splitDim) { if (dim != splitDim) {
@ -1253,7 +1260,7 @@ public class BKDWriter implements Closeable {
build(2*nodeID+1, leafNodeOffset, rightSlices, build(2*nodeID+1, leafNodeOffset, rightSlices,
ordBitSet, out, ordBitSet, out,
minSplitPackedValue, maxPackedValue, minSplitPackedValue, maxPackedValue,
splitPackedValues, leafBlockFPs); splitPackedValues, leafBlockFPs, toCloseHeroically);
for(int dim=0;dim<numDims;dim++) { for(int dim=0;dim<numDims;dim++) {
// Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim: // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
if (dim != splitDim) { if (dim != splitDim) {
@ -1277,7 +1284,7 @@ public class BKDWriter implements Closeable {
int size = Math.toIntExact(count); int size = Math.toIntExact(count);
return new HeapPointWriter(size, size, packedBytesLength, longOrds, singleValuePerDoc); return new HeapPointWriter(size, size, packedBytesLength, longOrds, singleValuePerDoc);
} else { } else {
return new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, longOrds, desc, singleValuePerDoc); return new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, longOrds, desc, count, singleValuePerDoc);
} }
} }
} }

View File

@ -16,6 +16,7 @@
*/ */
package org.apache.lucene.util.bkd; package org.apache.lucene.util.bkd;
import java.io.Closeable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -153,6 +154,11 @@ final class HeapPointWriter implements PointWriter {
return new HeapPointReader(blocks, valuesPerBlock, packedBytesLength, ords, ordsLong, docIDs, (int) start, nextWrite, singleValuePerDoc); return new HeapPointReader(blocks, valuesPerBlock, packedBytesLength, ords, ordsLong, docIDs, (int) start, nextWrite, singleValuePerDoc);
} }
@Override
public PointReader getSharedReader(long start, long length, List<Closeable> toCloseHeroically) {
return new HeapPointReader(blocks, valuesPerBlock, packedBytesLength, ords, ordsLong, docIDs, (int) start, nextWrite, singleValuePerDoc);
}
@Override @Override
public void close() { public void close() {
closed = true; closed = true;

View File

@ -39,6 +39,9 @@ final class OfflinePointReader extends PointReader {
private boolean longOrds; private boolean longOrds;
private boolean checked; private boolean checked;
// File name we are reading
final String name;
OfflinePointReader(Directory tempDir, String tempFileName, int packedBytesLength, long start, long length, OfflinePointReader(Directory tempDir, String tempFileName, int packedBytesLength, long start, long length,
boolean longOrds, boolean singleValuePerDoc) throws IOException { boolean longOrds, boolean singleValuePerDoc) throws IOException {
this.singleValuePerDoc = singleValuePerDoc; this.singleValuePerDoc = singleValuePerDoc;
@ -67,6 +70,7 @@ final class OfflinePointReader extends PointReader {
// at another level of the BKDWriter recursion // at another level of the BKDWriter recursion
in = tempDir.openInput(tempFileName, IOContext.READONCE); in = tempDir.openInput(tempFileName, IOContext.READONCE);
} }
name = tempFileName;
long seekFP = start * bytesPerDoc; long seekFP = start * bytesPerDoc;
in.seek(seekFP); in.seek(seekFP);
@ -121,6 +125,7 @@ final class OfflinePointReader extends PointReader {
public void close() throws IOException { public void close() throws IOException {
try { try {
if (countLeft == 0 && in instanceof ChecksumIndexInput && checked == false) { if (countLeft == 0 && in instanceof ChecksumIndexInput && checked == false) {
//System.out.println("NOW CHECK: " + name);
checked = true; checked = true;
CodecUtil.checkFooter((ChecksumIndexInput) in); CodecUtil.checkFooter((ChecksumIndexInput) in);
} }

View File

@ -16,9 +16,12 @@
*/ */
package org.apache.lucene.util.bkd; package org.apache.lucene.util.bkd;
import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
@ -34,13 +37,19 @@ final class OfflinePointWriter implements PointWriter {
private boolean closed; private boolean closed;
// true if ords are written as long (8 bytes), else 4 bytes // true if ords are written as long (8 bytes), else 4 bytes
private boolean longOrds; private boolean longOrds;
private OfflinePointReader sharedReader;
private long nextSharedRead;
final long expectedCount;
public OfflinePointWriter(Directory tempDir, String tempFileNamePrefix, int packedBytesLength, boolean longOrds, String desc, boolean singleValuePerDoc) throws IOException { /** Create a new writer with an unknown number of incoming points */
public OfflinePointWriter(Directory tempDir, String tempFileNamePrefix, int packedBytesLength,
boolean longOrds, String desc, long expectedCount, boolean singleValuePerDoc) throws IOException {
this.out = tempDir.createTempOutput(tempFileNamePrefix, "bkd_" + desc, IOContext.DEFAULT); this.out = tempDir.createTempOutput(tempFileNamePrefix, "bkd_" + desc, IOContext.DEFAULT);
this.tempDir = tempDir; this.tempDir = tempDir;
this.packedBytesLength = packedBytesLength; this.packedBytesLength = packedBytesLength;
this.longOrds = longOrds; this.longOrds = longOrds;
this.singleValuePerDoc = singleValuePerDoc; this.singleValuePerDoc = singleValuePerDoc;
this.expectedCount = expectedCount;
} }
/** Initializes on an already written/closed file, just so consumers can use {@link #getReader} to read the file. */ /** Initializes on an already written/closed file, just so consumers can use {@link #getReader} to read the file. */
@ -52,6 +61,7 @@ final class OfflinePointWriter implements PointWriter {
closed = true; closed = true;
this.longOrds = longOrds; this.longOrds = longOrds;
this.singleValuePerDoc = singleValuePerDoc; this.singleValuePerDoc = singleValuePerDoc;
this.expectedCount = 0;
} }
@Override @Override
@ -68,18 +78,37 @@ final class OfflinePointWriter implements PointWriter {
} }
out.writeInt(docID); out.writeInt(docID);
count++; count++;
assert expectedCount == 0 || count <= expectedCount;
} }
@Override @Override
public PointReader getReader(long start, long length) throws IOException { public PointReader getReader(long start, long length) throws IOException {
assert closed; assert closed;
assert start + length <= count: "start=" + start + " length=" + length + " count=" + count; assert start + length <= count: "start=" + start + " length=" + length + " count=" + count;
assert expectedCount == 0 || count == expectedCount;
return new OfflinePointReader(tempDir, out.getName(), packedBytesLength, start, length, longOrds, singleValuePerDoc); return new OfflinePointReader(tempDir, out.getName(), packedBytesLength, start, length, longOrds, singleValuePerDoc);
} }
@Override
public PointReader getSharedReader(long start, long length, List<Closeable> toCloseHeroically) throws IOException {
if (sharedReader == null) {
assert start == 0;
assert length <= count;
sharedReader = new OfflinePointReader(tempDir, out.getName(), packedBytesLength, 0, count, longOrds, singleValuePerDoc);
toCloseHeroically.add(sharedReader);
// Make sure the OfflinePointReader intends to verify its checksum:
assert sharedReader.in instanceof ChecksumIndexInput;
} else {
assert start == nextSharedRead: "start=" + start + " length=" + length + " nextSharedRead=" + nextSharedRead;
}
nextSharedRead += length;
return sharedReader;
}
@Override @Override
public void close() throws IOException { public void close() throws IOException {
if (closed == false) { if (closed == false) {
assert sharedReader == null;
try { try {
CodecUtil.writeFooter(out); CodecUtil.writeFooter(out);
} finally { } finally {
@ -91,6 +120,12 @@ final class OfflinePointWriter implements PointWriter {
@Override @Override
public void destroy() throws IOException { public void destroy() throws IOException {
if (sharedReader != null) {
// At this point, the shared reader should have done a full sweep of the file:
assert nextSharedRead == count;
sharedReader.close();
sharedReader = null;
}
tempDir.deleteFile(out.getName()); tempDir.deleteFile(out.getName());
} }

View File

@ -19,6 +19,7 @@ package org.apache.lucene.util.bkd;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.util.List;
/** Appends many points, and then at the end provides a {@link PointReader} to iterate /** Appends many points, and then at the end provides a {@link PointReader} to iterate
* those points. This abstracts away whether we write to disk, or use simple arrays * those points. This abstracts away whether we write to disk, or use simple arrays
@ -30,6 +31,9 @@ interface PointWriter extends Closeable {
/** Returns a {@link PointReader} iterator to step through all previously added points */ /** Returns a {@link PointReader} iterator to step through all previously added points */
PointReader getReader(long startPoint, long length) throws IOException; PointReader getReader(long startPoint, long length) throws IOException;
/** Returns the single shared reader, used at multiple times during the recursion, to read previously added points */
PointReader getSharedReader(long startPoint, long length, List<Closeable> toCloseHeroically) throws IOException;
/** Removes any temp files behind this writer */ /** Removes any temp files behind this writer */
void destroy() throws IOException; void destroy() throws IOException;
} }