From 8a9b033349cdfb515cd1772acd15ee8e9c08fd7c Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sun, 13 Mar 2016 08:55:31 -0400 Subject: [PATCH] optimize offline -> offline partition --- .../org/apache/lucene/util/bkd/BKDWriter.java | 19 +---- .../lucene/util/bkd/HeapPointReader.java | 4 +- .../lucene/util/bkd/OfflinePointReader.java | 77 ++++++++++++++++++- .../lucene/util/bkd/OfflinePointWriter.java | 2 +- .../apache/lucene/util/bkd/PointReader.java | 37 +++++++-- 5 files changed, 111 insertions(+), 28 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java index d4e30b722e6..c5cdc303195 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java @@ -1176,24 +1176,7 @@ public class BKDWriter implements Closeable { PointWriter rightPointWriter = getPointWriter(source.count - leftCount, "right" + dim); PointReader reader = slices[dim].writer.getReader(slices[dim].start);) { - // Partition this source according to how the splitDim split the values: - long nextRightCount = 0; - for (long i=0;i blocks; final int valuesPerBlock; diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java index 3c4b8b5ddd7..c8ab47e1e29 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java @@ -22,9 +22,11 @@ import java.io.IOException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.LongBitSet; /** Reads points from disk in a fixed-with format, previously written with {@link OfflinePointWriter}. */ -final class OfflinePointReader implements PointReader { +final class OfflinePointReader extends PointReader { long countLeft; private final IndexInput in; private final byte[] packedValue; @@ -90,5 +92,78 @@ final class OfflinePointReader implements PointReader { public void close() throws IOException { in.close(); } + + @Override + public long split(long count, LongBitSet rightTree, PointWriter left, PointWriter right, boolean doClearBits) throws IOException { + + if (left instanceof OfflinePointWriter == false || + right instanceof OfflinePointWriter == false) { + return super.split(count, rightTree, left, right, doClearBits); + } + + // We specialize the offline -> offline split since the default impl + // is somewhat wasteful otherwise (e.g. decoding docID when we don't + // need to) + + int packedBytesLength = packedValue.length; + + int bytesPerDoc = packedBytesLength + Integer.BYTES; + if (longOrds) { + bytesPerDoc += Long.BYTES; + } else { + bytesPerDoc += Integer.BYTES; + } + + long rightCount = 0; + + IndexOutput rightOut = ((OfflinePointWriter) right).out; + IndexOutput leftOut = ((OfflinePointWriter) left).out; + + ((OfflinePointWriter) right).count = count; + ((OfflinePointWriter) left).count = count; + + assert count <= countLeft: "count=" + count + " countLeft=" + countLeft; + + countLeft -= count; + + byte[] buffer = new byte[bytesPerDoc]; + while (count > 0) { + in.readBytes(buffer, 0, buffer.length); + long ord; + if (longOrds) { + ord = readLong(buffer, packedBytesLength); + } else { + ord = readInt(buffer, packedBytesLength); + } + if (rightTree.get(ord)) { + rightOut.writeBytes(buffer, 0, bytesPerDoc); + if (doClearBits) { + rightTree.clear(ord); + } + rightCount++; + } else { + leftOut.writeBytes(buffer, 0, bytesPerDoc); + } + + count--; + } + + return rightCount; + } + + // Poached from ByteArrayDataInput: + private static long readLong(byte[] bytes, int pos) { + final int i1 = ((bytes[pos++] & 0xff) << 24) | ((bytes[pos++] & 0xff) << 16) | + ((bytes[pos++] & 0xff) << 8) | (bytes[pos++] & 0xff); + final int i2 = ((bytes[pos++] & 0xff) << 24) | ((bytes[pos++] & 0xff) << 16) | + ((bytes[pos++] & 0xff) << 8) | (bytes[pos++] & 0xff); + return (((long)i1) << 32) | (i2 & 0xFFFFFFFFL); + } + + // Poached from ByteArrayDataInput: + private static int readInt(byte[] bytes, int pos) { + return ((bytes[pos++] & 0xFF) << 24) | ((bytes[pos++] & 0xFF) << 16) + | ((bytes[pos++] & 0xFF) << 8) | (bytes[pos++] & 0xFF); + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java index 5aa11de8977..f9580503b01 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java @@ -28,7 +28,7 @@ final class OfflinePointWriter implements PointWriter { final Directory tempDir; final IndexOutput out; final int packedBytesLength; - private long count; + long count; private boolean closed; // true if ords are written as long (8 bytes), else 4 bytes private boolean longOrds; diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/PointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/PointReader.java index fe7a9612426..1919f584f47 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/PointReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/PointReader.java @@ -20,21 +20,48 @@ package org.apache.lucene.util.bkd; import java.io.Closeable; import java.io.IOException; +import org.apache.lucene.util.LongBitSet; + /** One pass iterator through all points previously written with a * {@link PointWriter}, abstracting away whether points a read * from (offline) disk or simple arrays in heap. */ -interface PointReader extends Closeable { +abstract class PointReader implements Closeable { /** Returns false once iteration is done, else true. */ - boolean next() throws IOException; + abstract boolean next() throws IOException; /** Returns the packed byte[] value */ - byte[] packedValue(); + abstract byte[] packedValue(); /** Point ordinal */ - long ord(); + abstract long ord(); /** DocID for this point */ - int docID(); + abstract int docID(); + + /** Splits this reader into left and right partitions */ + public long split(long count, LongBitSet rightTree, PointWriter left, PointWriter right, boolean doClearBits) throws IOException { + + // Partition this source according to how the splitDim split the values: + long rightCount = 0; + for (long i=0;i