From a5b941baa1aa907886760a247227ecf2cbc3eaa1 Mon Sep 17 00:00:00 2001
From: Mike McCandless
Date: Sat, 12 Mar 2016 05:11:51 -0500
Subject: [PATCH] LUCENE-7098: reduce OfflineSorter and BKDWriter IO by using
 4 bytes instead of 8 bytes to encode ord in the common case

Squashed commit of the following:

commit 5ac2dcf2a972e46ccda3e7a8d8df5d0af58f712a
Merge: 68acf7f 684b222
Author: Mike McCandless
Date:   Fri Mar 11 19:04:13 2016 -0500

    Merge branch 'master' into intords

commit 68acf7f9ee2e0249d90075bc035721c0a91619f7
Author: Mike McCandless
Date:   Fri Mar 11 19:04:01 2016 -0500

    rename to totalPointCount and add comment; enforce that the caller doesn't
    exceed what they said; simplify the longOrds check to just compare to
    Integer.MAX_VALUE

commit afc964b56015475e8c354fdc8b0e05c7fa074ec2
Merge: db79e36 fe21f7a
Author: Mike McCandless
Date:   Fri Mar 11 10:17:09 2016 -0500

    Merge branch 'master' into intords

    Conflicts:
        lucene/core/src/test/org/apache/lucene/index/Test2BPoints.java

commit db79e365e097153a05813eaa70603c601bce1853
Author: Mike McCandless
Date:   Fri Mar 11 10:15:05 2016 -0500

    use int (4 bytes) not long (8 bytes) if the number of points is less than ~2.1B
---
 .../simpletext/SimpleTextPointsWriter.java    |  3 +-
 .../apache/lucene/codecs/PointsWriter.java    | 14 +++-
 .../codecs/lucene60/Lucene60PointsWriter.java | 20 ++++-
 .../lucene/index/PointValuesWriter.java       | 16 ++--
 .../org/apache/lucene/util/bkd/BKDWriter.java | 83 ++++++++++++-------
 .../lucene/util/bkd/HeapPointReader.java      | 12 ++-
 .../lucene/util/bkd/HeapPointWriter.java      | 37 +++++++--
 .../lucene/util/bkd/OfflinePointReader.java   | 24 ++++--
 .../lucene/util/bkd/OfflinePointWriter.java   | 17 +++-
 .../org/apache/lucene/util/bkd/TestBKD.java   | 22 +++--
 10 files changed, 178 insertions(+), 70 deletions(-)
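The heart of the change: BKDWriter is now told the total point count up front, and when that count fits in an int it encodes each point's ord in 4 bytes instead of 8, both in the in-heap buffers and in the offline temp files. A minimal standalone sketch of the record-size math that the bytesPerDoc logic below implements (illustrative only, not the patched class):

    // Per-point record: packed value + ord + docID. The ord width depends on
    // whether the segment can ever exceed Integer.MAX_VALUE points.
    static int bytesPerDoc(int packedBytesLength, long totalPointCount) {
      boolean longOrds = totalPointCount > Integer.MAX_VALUE; // > ~2.1B points
      int ordBytes = longOrds ? Long.BYTES : Integer.BYTES;   // 8 vs. 4 bytes
      return packedBytesLength + ordBytes + Integer.BYTES;    // value + ord + docID
    }

For a 1D field with 8-byte values, for example, each temp-file record drops from 20 to 16 bytes, roughly 20% less data to write and sort offline.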
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
index d2b848d262c..13494f5b348 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
@@ -76,7 +76,8 @@ class SimpleTextPointsWriter extends PointsWriter {
                                            fieldInfo.getPointDimensionCount(),
                                            fieldInfo.getPointNumBytes(),
                                            BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
-                                           BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP) {
+                                           BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
+                                           values.size(fieldInfo.name)) {
 
       @Override
       protected void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java
index 53db281777a..000d713fa9a 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java
@@ -41,6 +41,17 @@ public abstract class PointsWriter implements Closeable {
    * from the incoming segment. The default codec overrides this for 1D fields and uses
    * a faster but more complex implementation.
    */
   protected void mergeOneField(MergeState mergeState, FieldInfo fieldInfo) throws IOException {
+    long maxPointCount = 0;
+    for (int i=0;i Integer.MAX_VALUE;
+
+    // dimensional values (numDims * bytesPerDim) + ord (int or long) + docID (int)
+    if (longOrds) {
+      bytesPerDoc = packedBytesLength + Long.BYTES + Integer.BYTES;
+    } else {
+      bytesPerDoc = packedBytesLength + Integer.BYTES + Integer.BYTES;
+    }
     // As we recurse, we compute temporary partitions of the data, halving the
     // number of points at each recursion.  Once there are few enough points,
@@ -173,12 +187,12 @@ public class BKDWriter implements Closeable {
     }
     // We write first maxPointsSortInHeap in heap, then cutover to offline for additional points:
-    heapPointWriter = new HeapPointWriter(16, maxPointsSortInHeap, packedBytesLength);
+    heapPointWriter = new HeapPointWriter(16, maxPointsSortInHeap, packedBytesLength, longOrds);
     this.maxMBSortInHeap = maxMBSortInHeap;
   }
-  public static void verifyParams(int numDims, int maxPointsInLeafNode, double maxMBSortInHeap) {
+  public static void verifyParams(int numDims, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount) {
     // We encode dim in a single byte in the splitPackedValues, but we only expose 4 bits for it now, in case we want to use
     // remaining 4 bits for another purpose later
     if (numDims < 1 || numDims > MAX_DIMS) {
@@ -193,13 +207,16 @@ public class BKDWriter implements Closeable {
     if (maxMBSortInHeap < 0.0) {
       throw new IllegalArgumentException("maxMBSortInHeap must be >= 0.0 (got: " + maxMBSortInHeap + ")");
     }
+    if (totalPointCount < 0) {
+      throw new IllegalArgumentException("totalPointCount must be >=0 (got: " + totalPointCount + ")");
+    }
   }
 
   /** If the current segment has too many points then we switchover to temp files / offline sort. */
   private void switchToOffline() throws IOException {
     // For each .add we just append to this input file, then in .finish we sort this input and resursively build the tree:
-    offlinePointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength);
+    offlinePointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, longOrds);
     tempInput = offlinePointWriter.out;
     PointReader reader = heapPointWriter.getReader(0);
     for(int i=0;i totalPointCount) {
+      throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values");
+    }
     docsSeen.set(docID);
   }
@@ -437,6 +457,9 @@ public class BKDWriter implements Closeable {
       assert numDims > 1 || valueInOrder(valueCount, lastPackedValue, reader.state.scratchPackedValue);
       valueCount++;
+      if (pointCount > totalPointCount) {
+        throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values");
+      }
 
       if (leafCount == 0) {
         if (leafBlockFPs.size() > 0) {
@@ -569,13 +592,10 @@ public class BKDWriter implements Closeable {
     new IntroSorter() {
       private final byte[] pivotPackedValue = new byte[bytesPerDim];
       private int pivotDocID;
-      private long pivotOrd;
 
       @Override
       protected void setPivot(int i) {
         pivotDocID = writer.docIDs[i];
-        pivotOrd = writer.ords[i];
-
         int block = i / writer.valuesPerBlock;
         int index = i % writer.valuesPerBlock;
         System.arraycopy(writer.blocks.get(block), index*packedBytesLength+dim*bytesPerDim, pivotPackedValue, 0, bytesPerDim);
@@ -593,12 +613,7 @@ public class BKDWriter implements Closeable {
         }
 
         // Tie-break
-        cmp = Integer.compare(pivotDocID, writer.docIDs[j]);
-        if (cmp != 0) {
-          return cmp;
-        }
-
-        return Long.compare(pivotOrd, writer.ords[j]);
+        return Integer.compare(pivotDocID, writer.docIDs[j]);
       }
 
       @Override
@@ -607,9 +622,15 @@ public class BKDWriter implements Closeable {
         writer.docIDs[i] = writer.docIDs[j];
         writer.docIDs[j] = docID;
 
-        long ord = writer.ords[i];
-        writer.ords[i] = writer.ords[j];
-        writer.ords[j] = ord;
+        if (longOrds) {
+          long ord = writer.ordsLong[i];
+          writer.ordsLong[i] = writer.ordsLong[j];
+          writer.ordsLong[j] = ord;
+        } else {
+          int ord = writer.ords[i];
+          writer.ords[i] = writer.ords[j];
+          writer.ords[j] = ord;
+        }
 
         byte[] blockI = writer.blocks.get(i / writer.valuesPerBlock);
         int indexI = (i % writer.valuesPerBlock) * packedBytesLength;
@@ -660,7 +681,7 @@ public class BKDWriter implements Closeable {
       sorted = heapPointWriter;
     } else {
       // Subsequent dims need a private copy
-      sorted = new HeapPointWriter((int) pointCount, (int) pointCount, packedBytesLength);
+      sorted = new HeapPointWriter((int) pointCount, (int) pointCount, packedBytesLength, longOrds);
       sorted.copyFrom(heapPointWriter);
     }
@@ -691,10 +712,16 @@ public class BKDWriter implements Closeable {
         }
 
         // Tie-break by docID:
-        reader.reset(a.bytes, a.offset + packedBytesLength + Long.BYTES, a.length);
+        int offset;
+        if (longOrds) {
+          offset = Long.BYTES;
+        } else {
+          offset = Integer.BYTES;
+        }
+        reader.reset(a.bytes, a.offset + packedBytesLength + offset, a.length);
         final int docIDA = reader.readInt();
-        reader.reset(b.bytes, b.offset + packedBytesLength + Long.BYTES, b.length);
+        reader.reset(b.bytes, b.offset + packedBytesLength + offset, b.length);
         final int docIDB = reader.readInt();
 
         // No need to tie break on ord, for the case where the same doc has the same value in a given dimension indexed more than once: it
@@ -746,7 +773,7 @@ public class BKDWriter implements Closeable {
     assert lastWriter[0] != null;
-    return new OfflinePointWriter(tempDir, lastWriter[0], packedBytesLength, pointCount);
+    return new OfflinePointWriter(tempDir, lastWriter[0], packedBytesLength, pointCount, longOrds);
   }
 }
@@ -1005,7 +1032,7 @@ public class BKDWriter implements Closeable {
   private PathSlice switchToHeap(PathSlice source) throws IOException {
     int count = Math.toIntExact(source.count);
     try (
-      PointWriter writer = new HeapPointWriter(count, count, packedBytesLength);
+      PointWriter writer = new HeapPointWriter(count, count, packedBytesLength, longOrds);
       PointReader reader = source.writer.getReader(source.start);
     ) {
       for(int i=0;i blocks;
   final int valuesPerBlock;
   final int packedBytesLength;
-  final long[] ords;
+  final long[] ordsLong;
+  final int[] ords;
   final int[] docIDs;
   final int end;
   final byte[] scratch;
 
-  HeapPointReader(List blocks, int valuesPerBlock, int packedBytesLength, long[] ords, int[] docIDs, int start, int end) {
+  HeapPointReader(List blocks, int valuesPerBlock, int packedBytesLength, int[] ords, long[] ordsLong, int[] docIDs, int start, int end) {
     this.blocks = blocks;
     this.valuesPerBlock = valuesPerBlock;
     this.ords = ords;
+    this.ordsLong = ordsLong;
     this.docIDs = docIDs;
     curRead = start-1;
     this.end = end;
@@ -76,7 +78,11 @@ final class HeapPointReader implements PointReader {
 
   @Override
   public long ord() {
-    return ords[curRead];
+    if (ordsLong != null) {
+      return ordsLong[curRead];
+    } else {
+      return ords[curRead];
+    }
   }
 
   @Override
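As the HeapPointReader changes above show, the in-heap buffer now carries either an int[] or a long[] for ords, and the reader dispatches on which array is non-null. A self-contained sketch of that pattern (illustrative class and method names, not the patched Lucene code):

    // Holds per-point ords at either width; mirrors the ords/ordsLong split above.
    final class OrdBuffer {
      private final int[] ords;       // used when the point count fits in an int
      private final long[] ordsLong;  // used only when longOrds == true

      OrdBuffer(int size, boolean longOrds) {
        this.ordsLong = longOrds ? new long[size] : null;
        this.ords = longOrds ? null : new int[size];
      }

      void set(int index, long ord) {
        if (ordsLong != null) {
          ordsLong[index] = ord;
        } else {
          assert ord <= Integer.MAX_VALUE;
          ords[index] = (int) ord;    // safe: caller promised <= Integer.MAX_VALUE points
        }
      }

      long ord(int index) {
        // Same dispatch as HeapPointReader.ord(): prefer the long array when present.
        return ordsLong != null ? ordsLong[index] : ords[index];
      }
    }

Per buffered point this halves ord storage from 8 to 4 bytes, which should also let more points fit under the same maxMBSortInHeap budget before spilling to disk.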
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java
index 02361871c4c..3b043d0ace0 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java
@@ -23,7 +23,8 @@ import org.apache.lucene.util.ArrayUtil;
 final class HeapPointWriter implements PointWriter {
   int[] docIDs;
-  long[] ords;
+  long[] ordsLong;
+  int[] ords;
   private int nextWrite;
   private boolean closed;
   final int maxSize;
@@ -32,11 +33,15 @@ final class HeapPointWriter implements PointWriter {
   // NOTE: can't use ByteBlockPool because we need random-write access when sorting in heap
   final List blocks = new ArrayList<>();
 
-  public HeapPointWriter(int initSize, int maxSize, int packedBytesLength) {
+  public HeapPointWriter(int initSize, int maxSize, int packedBytesLength, boolean longOrds) {
     docIDs = new int[initSize];
-    ords = new long[initSize];
     this.maxSize = maxSize;
     this.packedBytesLength = packedBytesLength;
+    if (longOrds) {
+      this.ordsLong = new long[initSize];
+    } else {
+      this.ords = new int[initSize];
+    }
     // 4K per page, unless each value is > 4K:
     valuesPerBlock = Math.max(1, 4096/packedBytesLength);
   }
@@ -46,7 +51,14 @@ final class HeapPointWriter implements PointWriter {
       throw new IllegalStateException("docIDs.length=" + docIDs.length + " other.nextWrite=" + other.nextWrite);
     }
     System.arraycopy(other.docIDs, 0, docIDs, 0, other.nextWrite);
-    System.arraycopy(other.ords, 0, ords, 0, other.nextWrite);
+    if (other.ords != null) {
+      assert this.ords != null;
+      System.arraycopy(other.ords, 0, ords, 0, other.nextWrite);
+    } else {
+      assert this.ordsLong != null;
+      System.arraycopy(other.ordsLong, 0, ordsLong, 0, other.nextWrite);
+    }
+
     for(byte[] block : other.blocks) {
       blocks.add(block.clone());
     }
@@ -91,21 +103,30 @@ final class HeapPointWriter implements PointWriter {
   public void append(byte[] packedValue, long ord, int docID) {
     assert closed == false;
     assert packedValue.length == packedBytesLength;
-    if (ords.length == nextWrite) {
+    if (docIDs.length == nextWrite) {
       int nextSize = Math.min(maxSize, ArrayUtil.oversize(nextWrite+1, Integer.BYTES));
       assert nextSize > nextWrite: "nextSize=" + nextSize + " vs nextWrite=" + nextWrite;
-      ords = growExact(ords, nextSize);
       docIDs = growExact(docIDs, nextSize);
+      if (ordsLong != null) {
+        ordsLong = growExact(ordsLong, nextSize);
+      } else {
+        ords = growExact(ords, nextSize);
+      }
     }
     writePackedValue(nextWrite, packedValue);
-    ords[nextWrite] = ord;
+    if (ordsLong != null) {
+      ordsLong[nextWrite] = ord;
+    } else {
+      assert ord <= Integer.MAX_VALUE;
+      ords[nextWrite] = (int) ord;
+    }
     docIDs[nextWrite] = docID;
     nextWrite++;
   }
 
   @Override
   public PointReader getReader(long start) {
-    return new HeapPointReader(blocks, valuesPerBlock, packedBytesLength, ords, docIDs, (int) start, nextWrite);
+    return new HeapPointReader(blocks, valuesPerBlock, packedBytesLength, ords, ordsLong, docIDs, (int) start, nextWrite);
   }
 
   @Override
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java
index 83d863ba356..3c4b8b5ddd7 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java
@@ -30,18 +30,22 @@ final class OfflinePointReader implements PointReader {
   private final byte[] packedValue;
   private long ord;
   private int docID;
+  // true if ords are written as long (8 bytes), else 4 bytes
+  private boolean longOrds;
 
-  OfflinePointReader(Directory tempDir, String tempFileName, int packedBytesLength, long start, long length) throws IOException {
-    this(tempDir.openInput(tempFileName, IOContext.READONCE), packedBytesLength, start, length);
-  }
-
-  private OfflinePointReader(IndexInput in, int packedBytesLength, long start, long length) throws IOException {
-    this.in = in;
-    int bytesPerDoc = packedBytesLength + Long.BYTES + Integer.BYTES;
+  OfflinePointReader(Directory tempDir, String tempFileName, int packedBytesLength, long start, long length, boolean longOrds) throws IOException {
+    in = tempDir.openInput(tempFileName, IOContext.READONCE);
+    int bytesPerDoc = packedBytesLength + Integer.BYTES;
+    if (longOrds) {
+      bytesPerDoc += Long.BYTES;
+    } else {
+      bytesPerDoc += Integer.BYTES;
+    }
     long seekFP = start * bytesPerDoc;
     in.seek(seekFP);
     this.countLeft = length;
     packedValue = new byte[packedBytesLength];
+    this.longOrds = longOrds;
   }
 
   @Override
@@ -58,7 +62,11 @@ final class OfflinePointReader implements PointReader {
       assert countLeft == -1;
       return false;
     }
-    ord = in.readLong();
+    if (longOrds) {
+      ord = in.readLong();
+    } else {
+      ord = in.readInt();
+    }
     docID = in.readInt();
     return true;
   }
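The two offline classes must agree on the record layout: packed value bytes, then the ord as either 4 or 8 bytes, then the docID; the reader is constructed with the same longOrds flag so the widths (and the start * bytesPerDoc seek arithmetic) line up. A minimal sketch of the symmetric encode/decode using plain java.io streams (the real classes go through Lucene's IndexOutput/IndexInput; method names here are illustrative):

    static void writePoint(java.io.DataOutput out, byte[] packedValue, long ord, int docID,
                           boolean longOrds) throws java.io.IOException {
      out.write(packedValue);            // numDims * bytesPerDim value bytes
      if (longOrds) {
        out.writeLong(ord);              // 8-byte ord, only needed for > ~2.1B points
      } else {
        assert ord <= Integer.MAX_VALUE;
        out.writeInt((int) ord);         // 4-byte ord in the common case
      }
      out.writeInt(docID);               // docID is always 4 bytes
    }

    // Reads the ord field only; the stream must already be positioned past the value
    // bytes, and the caller must pass the same longOrds flag the writer used.
    static long readOrd(java.io.DataInput in, boolean longOrds) throws java.io.IOException {
      return longOrds ? in.readLong() : in.readInt();
    }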
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java
index 625e6fa2b13..dcf678169fd 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java
@@ -30,27 +30,36 @@ final class OfflinePointWriter implements PointWriter {
   final int packedBytesLength;
   private long count;
   private boolean closed;
+  // true if ords are written as long (8 bytes), else 4 bytes
+  private boolean longOrds;
 
-  public OfflinePointWriter(Directory tempDir, String tempFileNamePrefix, int packedBytesLength) throws IOException {
+  public OfflinePointWriter(Directory tempDir, String tempFileNamePrefix, int packedBytesLength, boolean longOrds) throws IOException {
     this.out = tempDir.createTempOutput(tempFileNamePrefix, "bkd", IOContext.DEFAULT);
     this.tempDir = tempDir;
     this.packedBytesLength = packedBytesLength;
+    this.longOrds = longOrds;
   }
 
   /** Initializes on an already written/closed file, just so consumers can use {@link #getReader} to read the file. */
-  public OfflinePointWriter(Directory tempDir, IndexOutput out, int packedBytesLength, long count) {
+  public OfflinePointWriter(Directory tempDir, IndexOutput out, int packedBytesLength, long count, boolean longOrds) {
     this.out = out;
     this.tempDir = tempDir;
     this.packedBytesLength = packedBytesLength;
     this.count = count;
     closed = true;
+    this.longOrds = longOrds;
   }
 
   @Override
   public void append(byte[] packedValue, long ord, int docID) throws IOException {
     assert packedValue.length == packedBytesLength;
     out.writeBytes(packedValue, 0, packedValue.length);
-    out.writeLong(ord);
+    if (longOrds) {
+      out.writeLong(ord);
+    } else {
+      assert ord <= Integer.MAX_VALUE;
+      out.writeInt((int) ord);
+    }
     out.writeInt(docID);
     count++;
   }
@@ -58,7 +67,7 @@ final class OfflinePointWriter implements PointWriter {
   @Override
   public PointReader getReader(long start) throws IOException {
     assert closed;
-    return new OfflinePointReader(tempDir, out.getName(), packedBytesLength, start, count-start);
+    return new OfflinePointReader(tempDir, out.getName(), packedBytesLength, start, count-start, longOrds);
   }
 
   @Override
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
index f1402fc277f..20177438ac3 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
@@ -40,9 +40,17 @@ import org.apache.lucene.util.TestUtil;
 public class TestBKD extends LuceneTestCase {
+  private long randomPointCount() {
+    if (random().nextBoolean()) {
+      return random().nextInt(Integer.MAX_VALUE);
+    } else {
+      return random().nextLong() & Long.MAX_VALUE;
+    }
+  }
+
   public void testBasicInts1D() throws Exception {
     try (Directory dir = getDirectory(100)) {
-      BKDWriter w = new BKDWriter(100, dir, "tmp", 1, 4, 2, 1.0f);
+      BKDWriter w = new BKDWriter(100, dir, "tmp", 1, 4, 2, 1.0f, randomPointCount());
       byte[] scratch = new byte[4];
       for(int docID=0;docID<100;docID++) {
         NumericUtils.intToSortableBytes(docID, scratch, 0);
@@ -117,7 +125,7 @@ public class TestBKD extends LuceneTestCase {
       int numDims = TestUtil.nextInt(random(), 1, 5);
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 100);
       float maxMB = (float) 3.0 + (3*random().nextFloat());
-      BKDWriter w = new BKDWriter(numDocs, dir, "tmp", numDims, 4, maxPointsInLeafNode, maxMB);
+      BKDWriter w = new BKDWriter(numDocs, dir, "tmp", numDims, 4, maxPointsInLeafNode, maxMB, randomPointCount());
 
       if (VERBOSE) {
         System.out.println("TEST: numDims=" + numDims + " numDocs=" + numDocs);
@@ -258,7 +266,7 @@ public class TestBKD extends LuceneTestCase {
       int numDims = TestUtil.nextInt(random(), 1, 5);
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 100);
       float maxMB = (float) 3.0 + (3*random().nextFloat());
-      BKDWriter w = new BKDWriter(numDocs, dir, "tmp", numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
+      BKDWriter w = new BKDWriter(numDocs, dir, "tmp", numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, randomPointCount());
 
       BigInteger[][] docs = new BigInteger[numDocs][];
       byte[] scratch = new byte[numBytesPerDim*numDims];
@@ -431,7 +439,7 @@ public class TestBKD extends LuceneTestCase {
   public void testTooLittleHeap() throws Exception {
     try (Directory dir = getDirectory(0)) {
       IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
-        new BKDWriter(1, dir, "bkd", 1, 16, 1000000, 0.001);
+        new BKDWriter(1, dir, "bkd", 1, 16, 1000000, 0.001, randomPointCount());
       });
       assertTrue(expected.getMessage().contains("either increase maxMBSortInHeap or decrease maxPointsInLeafNode"));
     }
@@ -554,7 +562,7 @@ public class TestBKD extends LuceneTestCase {
     List docIDBases = null;
     int seg = 0;
-    BKDWriter w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
+    BKDWriter w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, randomPointCount());
     IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT);
     IndexInput in = null;
@@ -608,7 +616,7 @@ public class TestBKD extends LuceneTestCase {
           seg++;
           maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 1000);
           maxMB = (float) 3.0 + (3*random().nextDouble());
-          w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
+          w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, randomPointCount());
           lastDocIDBase = docID;
         }
       }
@@ -623,7 +631,7 @@ public class TestBKD extends LuceneTestCase {
       out.close();
       in = dir.openInput("bkd", IOContext.DEFAULT);
       seg++;
-      w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
+      w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, randomPointCount());
       List readers = new ArrayList<>();
       for(long fp : toMerge) {
         in.seek(fp);
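Putting it together, a caller now passes its best upper bound on the point count so the writer can pick the 4-byte ord encoding; adding more points than that bound trips the new IllegalStateException. A hedged usage sketch based on the test calls above (constructor argument order mirrors TestBKD; assumes the usual org.apache.lucene.store and org.apache.lucene.util imports):

    static long writePoints(Directory dir, int[] values) throws IOException {
      long totalPointCount = values.length;           // <= Integer.MAX_VALUE, so int ords are used
      BKDWriter w = new BKDWriter(values.length, dir, "tmp",
                                  1, Integer.BYTES,   // 1 dimension, 4 bytes per dimension
                                  BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
                                  BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
                                  totalPointCount);
      byte[] scratch = new byte[Integer.BYTES];
      for (int docID = 0; docID < values.length; docID++) {
        NumericUtils.intToSortableBytes(values[docID], scratch, 0);
        w.add(scratch, docID);                        // one extra add past totalPointCount would throw
      }
      try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
        return w.finish(out);                         // file pointer where the tree's index was written
      }
    }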