From 748d483e67be5ca3d343e92a1e2d3134dcc0535d Mon Sep 17 00:00:00 2001 From: iverase Date: Fri, 1 Mar 2019 07:41:16 +0100 Subject: [PATCH] UCENE-8703: Build point writers in the BKD tree only when they are needed --- lucene/CHANGES.txt | 3 + .../simpletext/SimpleTextBKDWriter.java | 95 +++++++------------ .../org/apache/lucene/util/bkd/BKDWriter.java | 78 ++++++--------- .../org/apache/lucene/util/bkd/TestBKD.java | 12 ++- 4 files changed, 78 insertions(+), 110 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 5be513e07f5..0eef5b7d449 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -16,6 +16,9 @@ Improvements of byte arrays. In addition a new interface PointValue is added to abstract out the different formats between offline and on-heap writers. (Ignacio Vera) +* LUCENE-8703: Build point writers in the BKD tree only when they are needed. + (Ignacio Vera) + Other * LUCENE-8680: Refactor EdgeTree#relateTriangle method. (Ignacio Vera) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java index cc1140f8a29..ff6cbaa9488 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java @@ -129,8 +129,8 @@ final class SimpleTextBKDWriter implements Closeable { protected final FixedBitSet docsSeen; - private OfflinePointWriter offlinePointWriter; - private HeapPointWriter heapPointWriter; + private PointWriter pointWriter; + private boolean finished; private IndexOutput tempInput; protected final int maxPointsInLeafNode; @@ -147,7 +147,6 @@ final class SimpleTextBKDWriter implements Closeable { /** An upper bound on how many points the caller will add (includes deletions) */ private final long totalPointCount; - private final int maxDoc; @@ -187,9 +186,6 @@ final class SimpleTextBKDWriter implements Closeable { throw new IllegalArgumentException("maxMBSortInHeap=" + maxMBSortInHeap + " only allows for maxPointsSortInHeap=" + maxPointsSortInHeap + ", but this is less than maxPointsInLeafNode=" + maxPointsInLeafNode + "; either increase maxMBSortInHeap or decrease maxPointsInLeafNode"); } - // We write first maxPointsSortInHeap in heap, then cutover to offline for additional points: - heapPointWriter = new HeapPointWriter(maxPointsSortInHeap, packedBytesLength); - this.maxMBSortInHeap = maxMBSortInHeap; } @@ -216,39 +212,22 @@ final class SimpleTextBKDWriter implements Closeable { } } - /** If the current segment has too many points then we spill over to temp files / offline sort. */ - private void spillToOffline() throws IOException { - - // For each .add we just append to this input file, then in .finish we sort this input and resursively build the tree: - offlinePointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, "spill", 0); - tempInput = offlinePointWriter.out; - for(int i=0;i= maxPointsSortInHeap) { - if (offlinePointWriter == null) { - spillToOffline(); - } - offlinePointWriter.append(packedValue, docID); - } else { - // Not too many points added yet, continue using heap: - heapPointWriter.append(packedValue, docID); + if (pointCount >= totalPointCount) { + throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + (pointCount + 1) + " values"); } - - // TODO: we could specialize for the 1D case: if (pointCount == 0) { + assert pointWriter == null : "Point writer is already initialized"; + //total point count is an estimation but the final point count must be equal or lower to that number. + if (totalPointCount > maxPointsSortInHeap) { + pointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, "spill", 0); + tempInput = ((OfflinePointWriter)pointWriter).out; + } else { + pointWriter = new HeapPointWriter(Math.toIntExact(totalPointCount), packedBytesLength); + } System.arraycopy(packedValue, 0, minPackedValue, 0, packedIndexBytesLength); System.arraycopy(packedValue, 0, maxPackedValue, 0, packedIndexBytesLength); } else { @@ -262,11 +241,8 @@ final class SimpleTextBKDWriter implements Closeable { } } } - + pointWriter.append(packedValue, docID); pointCount++; - if (pointCount > totalPointCount) { - throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values"); - } docsSeen.set(docID); } @@ -297,12 +273,12 @@ final class SimpleTextBKDWriter implements Closeable { } // Catch user silliness: - if (heapPointWriter == null && tempInput == null) { + if (finished == true) { throw new IllegalStateException("already finished"); } // Mark that we already finished: - heapPointWriter = null; + finished = true; long countPerLeaf = pointCount = values.size(); long innerNodeCount = 1; @@ -394,12 +370,12 @@ final class SimpleTextBKDWriter implements Closeable { } // Catch user silliness: - if (heapPointWriter == null && tempInput == null) { + if (finished == true) { throw new IllegalStateException("already finished"); } // Mark that we already finished: - heapPointWriter = null; + finished = true; this.out = out; @@ -577,24 +553,25 @@ final class SimpleTextBKDWriter implements Closeable { // TODO: specialize the 1D case? it's much faster at indexing time (no partitioning on recurse...) // Catch user silliness: - if (heapPointWriter == null && tempInput == null) { - throw new IllegalStateException("already finished"); - } - - BKDRadixSelector.PathSlice writer; - if (offlinePointWriter != null) { - offlinePointWriter.close(); - writer = new BKDRadixSelector.PathSlice(offlinePointWriter, 0, pointCount); - tempInput = null; - } else { - writer = new BKDRadixSelector.PathSlice(heapPointWriter, 0, pointCount); - heapPointWriter = null; - } - if (pointCount == 0) { throw new IllegalStateException("must index at least one point"); } + // Catch user silliness: + if (finished == true) { + throw new IllegalStateException("already finished"); + } + + //mark as finished + finished = true; + + pointWriter.close(); + BKDRadixSelector.PathSlice points = new BKDRadixSelector.PathSlice(pointWriter, 0, pointCount); + //clean up pointers + tempInput = null; + pointWriter = null; + + long countPerLeaf = pointCount; long innerNodeCount = 1; @@ -626,7 +603,7 @@ final class SimpleTextBKDWriter implements Closeable { try { - build(1, numLeaves, writer, out, + build(1, numLeaves, points, out, radixSelector, minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs); @@ -829,8 +806,6 @@ final class SimpleTextBKDWriter implements Closeable { /** Pull a partition back into heap once the point count is low enough while recursing. */ private HeapPointWriter switchToHeap(PointWriter source) throws IOException { int count = Math.toIntExact(source.count()); - // Not inside the try because we don't want to close it here: - try (PointReader reader = source.getReader(0, count); HeapPointWriter writer = new HeapPointWriter(count, packedBytesLength)) { for(int i=0;i maxPointsSortInHeap) { + pointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, "spill", 0); + tempInput = ((OfflinePointWriter)pointWriter).out; + } else { + pointWriter = new HeapPointWriter(Math.toIntExact(totalPointCount), packedBytesLength); } - heapPointWriter = null; } + public void add(byte[] packedValue, int docID) throws IOException { if (packedValue.length != packedBytesLength) { throw new IllegalArgumentException("packedValue should be length=" + packedBytesLength + " (got: " + packedValue.length + ")"); } - - if (pointCount >= maxPointsSortInHeap) { - if (offlinePointWriter == null) { - spillToOffline(); - } - offlinePointWriter.append(packedValue, docID); - } else { - // Not too many points added yet, continue using heap: - heapPointWriter.append(packedValue, docID); + if (pointCount >= totalPointCount) { + throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + (pointCount + 1) + " values"); } - - // TODO: we could specialize for the 1D case: if (pointCount == 0) { + initPointWriter(); System.arraycopy(packedValue, 0, minPackedValue, 0, packedIndexBytesLength); System.arraycopy(packedValue, 0, maxPackedValue, 0, packedIndexBytesLength); } else { @@ -251,11 +240,8 @@ public class BKDWriter implements Closeable { } } } - + pointWriter.append(packedValue, docID); pointCount++; - if (pointCount > totalPointCount) { - throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values"); - } docsSeen.set(docID); } @@ -399,12 +385,12 @@ public class BKDWriter implements Closeable { } // Catch user silliness: - if (heapPointWriter == null && tempInput == null) { + if (finished == true) { throw new IllegalStateException("already finished"); } // Mark that we already finished: - heapPointWriter = null; + finished = true; long countPerLeaf = pointCount = values.size(); long innerNodeCount = 1; @@ -542,12 +528,12 @@ public class BKDWriter implements Closeable { } // Catch user silliness: - if (heapPointWriter == null && tempInput == null) { + if (finished == true) { throw new IllegalStateException("already finished"); } // Mark that we already finished: - heapPointWriter = null; + finished = true; this.out = out; @@ -740,7 +726,7 @@ public class BKDWriter implements Closeable { // TODO: specialize the 1D case? it's much faster at indexing time (no partitioning on recurse...) // Catch user silliness: - if (heapPointWriter == null && tempInput == null) { + if (finished == true) { throw new IllegalStateException("already finished"); } @@ -748,15 +734,15 @@ public class BKDWriter implements Closeable { throw new IllegalStateException("must index at least one point"); } - BKDRadixSelector.PathSlice points; - if (offlinePointWriter != null) { - offlinePointWriter.close(); - points = new BKDRadixSelector.PathSlice(offlinePointWriter, 0, pointCount); - tempInput = null; - } else { - points = new BKDRadixSelector.PathSlice(heapPointWriter, 0, pointCount); - heapPointWriter = null; - } + //mark as finished + finished = true; + + pointWriter.close(); + BKDRadixSelector.PathSlice points = new BKDRadixSelector.PathSlice(pointWriter, 0, pointCount); + //clean up pointers + tempInput = null; + pointWriter = null; + long countPerLeaf = pointCount; long innerNodeCount = 1; @@ -1145,6 +1131,7 @@ public class BKDWriter implements Closeable { @Override public void close() throws IOException { + finished = true; if (tempInput != null) { // NOTE: this should only happen on exception, e.g. caller calls close w/o calling finish: try { @@ -1234,8 +1221,6 @@ public class BKDWriter implements Closeable { /** Pull a partition back into heap once the point count is low enough while recursing. */ private HeapPointWriter switchToHeap(PointWriter source) throws IOException { int count = Math.toIntExact(source.count()); - // Not inside the try because we don't want to close it here: - try (PointReader reader = source.getReader(0, source.count()); HeapPointWriter writer = new HeapPointWriter(count, packedBytesLength)) { for(int i=0;i toMerge = null; List docMaps = null; int seg = 0; - - BKDWriter w = new BKDWriter(numValues, dir, "_" + seg, numDataDims, numIndexDims, numBytesPerDim, maxPointsInLeafNode, maxMB, docValues.length); + //we force sometimes to provide a bigger point count + long maxDocs = Long.MIN_VALUE; + if (random().nextBoolean()) { + maxDocs = docValues.length; + } else { + while (maxDocs < docValues.length) { + maxDocs = random().nextLong(); + } + } + BKDWriter w = new BKDWriter(numValues, dir, "_" + seg, numDataDims, numIndexDims, numBytesPerDim, maxPointsInLeafNode, maxMB, maxDocs); IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT); IndexInput in = null;