From 9bc5058f7d0066e3d3e74a781f5abf9cf2e9c016 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Fri, 8 Jan 2016 10:52:15 +0000 Subject: [PATCH] LUCENE-6962: add min/max per dimension to dimensional values git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1723682 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 + .../simpletext/SimpleTextBKDReader.java | 5 +- .../SimpleTextDimensionalReader.java | 77 +++++++++++++++++-- .../SimpleTextDimensionalWriter.java | 12 ++- .../lucene/codecs/DimensionalFormat.java | 20 +++++ .../lucene/codecs/DimensionalWriter.java | 20 +++++ .../lucene60/Lucene60DimensionalReader.java | 67 ++++++++++++++-- .../lucene/index/DimensionalValues.java | 14 +++- .../lucene/index/DimensionalValuesWriter.java | 22 +++++- .../lucene/index/MultiDimensionalValues.java | 72 +++++++++++++++++ .../lucene/index/ParallelLeafReader.java | 52 +++++++++++++ .../lucene/index/SlowCodecReaderWrapper.java | 20 +++++ .../org/apache/lucene/util/bkd/BKDReader.java | 32 +++++++- .../org/apache/lucene/util/bkd/BKDWriter.java | 32 ++++++++ .../lucene/index/TestDimensionalValues.java | 37 +++++++++ .../org/apache/lucene/util/bkd/TestBKD.java | 17 ++++ .../lucene/index/SortingLeafReader.java | 24 +++++- .../asserting/AssertingDimensionalFormat.java | 20 +++++ 18 files changed, 520 insertions(+), 26 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 43174975b24..bb317a3a686 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -55,6 +55,9 @@ New Features * LUCENE-6837: Add N-best output support to JapaneseTokenizer. (Hiroharu Konno via Christian Moen) +* LUCENE-6962: Add per-dimension min/max to dimensional values + (Mike McCandless) + API Changes * LUCENE-3312: The API of oal.document was restructured to diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java index 752154ecfd6..6e073d60a59 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java @@ -33,8 +33,9 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextDimensionalWriter.BL class SimpleTextBKDReader extends BKDReader { - public SimpleTextBKDReader(IndexInput datIn, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException { - super(datIn, numDims, maxPointsInLeafNode, bytesPerDim, leafBlockFPs, splitPackedValues); + public SimpleTextBKDReader(IndexInput datIn, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues, + byte[] minPackedValue, byte[] maxPackedValue) throws IOException { + super(datIn, numDims, maxPointsInLeafNode, bytesPerDim, leafBlockFPs, splitPackedValues, minPackedValue, maxPackedValue); } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalReader.java index fbbd6f2e32c..8e0983c8772 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalReader.java @@ -43,6 +43,8 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextDimensionalWriter.FI import static org.apache.lucene.codecs.simpletext.SimpleTextDimensionalWriter.FIELD_FP_NAME; import static org.apache.lucene.codecs.simpletext.SimpleTextDimensionalWriter.INDEX_COUNT; import static org.apache.lucene.codecs.simpletext.SimpleTextDimensionalWriter.MAX_LEAF_POINTS; +import static org.apache.lucene.codecs.simpletext.SimpleTextDimensionalWriter.MAX_VALUE; +import static org.apache.lucene.codecs.simpletext.SimpleTextDimensionalWriter.MIN_VALUE; import static org.apache.lucene.codecs.simpletext.SimpleTextDimensionalWriter.NUM_DIMS; import static org.apache.lucene.codecs.simpletext.SimpleTextDimensionalWriter.SPLIT_COUNT; import static org.apache.lucene.codecs.simpletext.SimpleTextDimensionalWriter.SPLIT_DIM; @@ -89,6 +91,17 @@ class SimpleTextDimensionalReader extends DimensionalReader { readLine(dataIn); int count = parseInt(INDEX_COUNT); + + readLine(dataIn); + assert startsWith(MIN_VALUE); + BytesRef minValue = SimpleTextUtil.fromBytesRefString(stripPrefix(MIN_VALUE)); + assert minValue.length == numDims*bytesPerDim; + + readLine(dataIn); + assert startsWith(MAX_VALUE); + BytesRef maxValue = SimpleTextUtil.fromBytesRefString(stripPrefix(MAX_VALUE)); + assert maxValue.length == numDims*bytesPerDim; + long[] leafBlockFPs = new long[count]; for(int i=0;i subs; @@ -95,4 +97,74 @@ class MultiDimensionalValues extends DimensionalValues { b.append(')'); return b.toString(); } + + @Override + public byte[] getMinPackedValue(String fieldName) throws IOException { + byte[] result = null; + for(int i=0;i 0) { + System.arraycopy(maxPackedValue, offset, result, offset, bytesPerDim); + } + } + } + } + + return result; + } + + @Override + public int getNumDimensions(String fieldName) throws IOException { + for(int i=0;i 0; leafNodeOffset = numLeaves; + minPackedValue = new byte[packedBytesLength]; + maxPackedValue = new byte[packedBytesLength]; + in.readBytes(minPackedValue, 0, packedBytesLength); + in.readBytes(maxPackedValue, 0, packedBytesLength); + splitPackedValues = new byte[(1+bytesPerDim)*numLeaves]; // TODO: don't write split packed values[0]! @@ -116,8 +123,9 @@ public class BKDReader implements Accountable { this.in = in; } - /** Called by consumers that have their own on-disk format for the index */ - protected BKDReader(IndexInput in, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException { + /** Called by consumers that have their own on-disk format for the index (e.g. SimpleText) */ + protected BKDReader(IndexInput in, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues, + byte[] minPackedValue, byte[] maxPackedValue) throws IOException { this.in = in; this.numDims = numDims; this.maxPointsInLeafNode = maxPointsInLeafNode; @@ -126,6 +134,10 @@ public class BKDReader implements Accountable { this.leafNodeOffset = leafBlockFPs.length; this.leafBlockFPs = leafBlockFPs; this.splitPackedValues = splitPackedValues; + this.minPackedValue = minPackedValue; + this.maxPackedValue = maxPackedValue; + assert minPackedValue.length == packedBytesLength; + assert maxPackedValue.length == packedBytesLength; } private static class VerifyVisitor implements IntersectVisitor { @@ -405,4 +417,20 @@ public class BKDReader implements Accountable { return splitPackedValues.length + leafBlockFPs.length * RamUsageEstimator.NUM_BYTES_LONG; } + + public byte[] getMinPackedValue() { + return minPackedValue.clone(); + } + + public byte[] getMaxPackedValue() { + return maxPackedValue.clone(); + } + + public int getNumDimensions() { + return numDims; + } + + public int getBytesPerDimension() { + return bytesPerDim; + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java index b8fc3dce80b..e47830db94d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java @@ -119,6 +119,12 @@ public class BKDWriter implements Closeable { protected final int maxPointsInLeafNode; private final int maxPointsSortInHeap; + /** Minimum per-dim values, packed */ + protected final byte[] minPackedValue; + + /** Maximum per-dim values, packed */ + protected final byte[] maxPackedValue; + private long pointCount; public BKDWriter(Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim) throws IOException { @@ -142,6 +148,9 @@ public class BKDWriter implements Closeable { scratch2 = new byte[packedBytesLength]; commonPrefixLengths = new int[numDims]; + minPackedValue = new byte[packedBytesLength]; + maxPackedValue = new byte[packedBytesLength]; + // dimensional values (numDims * bytesPerDim) + ord (long) + docID (int) bytesPerDoc = packedBytesLength + RamUsageEstimator.NUM_BYTES_LONG + RamUsageEstimator.NUM_BYTES_INT; @@ -213,6 +222,22 @@ public class BKDWriter implements Closeable { heapPointWriter.append(packedValue, pointCount, docID); } + // TODO: we could specialize for the 1D case: + if (pointCount == 0) { + System.arraycopy(packedValue, 0, minPackedValue, 0, packedBytesLength); + System.arraycopy(packedValue, 0, maxPackedValue, 0, packedBytesLength); + } else { + for(int dim=0;dim 0) { + System.arraycopy(packedValue, offset, maxPackedValue, offset, bytesPerDim); + } + } + } + pointCount++; } @@ -398,6 +423,11 @@ public class BKDWriter implements Closeable { leafBlockDocIDs[leafCount] = reader.docIDBase + reader.docID; System.arraycopy(reader.state.scratchPackedValue, 0, leafBlockPackedValues[leafCount], 0, packedBytesLength); + if (valueCount == 0) { + System.arraycopy(reader.state.scratchPackedValue, 0, minPackedValue, 0, packedBytesLength); + } + System.arraycopy(reader.state.scratchPackedValue, 0, maxPackedValue, 0, packedBytesLength); + assert numDims > 1 || valueInOrder(valueCount++, lastPackedValue, reader.state.scratchPackedValue); if (leafCount == 0) { @@ -836,6 +866,8 @@ public class BKDWriter implements Closeable { assert leafBlockFPs.length > 0; out.writeVInt(leafBlockFPs.length); + out.writeBytes(minPackedValue, 0, packedBytesLength); + out.writeBytes(maxPackedValue, 0, packedBytesLength); // TODO: for 1D case, don't waste the first byte of each split value (it's always 0) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDimensionalValues.java b/lucene/core/src/test/org/apache/lucene/index/TestDimensionalValues.java index 05dd3a985f6..ec1fa8b4baa 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDimensionalValues.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDimensionalValues.java @@ -20,6 +20,7 @@ package org.apache.lucene.index; import java.io.IOException; import java.math.BigInteger; import java.util.ArrayList; +import java.util.Arrays; import java.util.BitSet; import java.util.List; @@ -46,6 +47,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.TestUtil; // TODO: factor out a BaseTestDimensionFormat @@ -906,6 +908,28 @@ public class TestDimensionalValues extends LuceneTestCase { RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); DirectoryReader r = null; + // Compute actual min/max values: + byte[][] expectedMinValues = new byte[numDims][]; + byte[][] expectedMaxValues = new byte[numDims][]; + for(int ord=0;ord 0) { + System.arraycopy(docValues[ord][dim], 0, expectedMaxValues[dim], 0, numBytesPerDim); + } + } + } + } + // 20% of the time we add into a separate directory, then at some point use // addIndexes to bring the indexed dimensional values to the main directory: Directory saveDir; @@ -1036,6 +1060,19 @@ public class TestDimensionalValues extends LuceneTestCase { NumericDocValues idValues = MultiDocValues.getNumericValues(r, "id"); Bits liveDocs = MultiFields.getLiveDocs(r); + // Verify min/max values are correct: + byte[] minValues = dimValues.getMinPackedValue("field"); + byte[] maxValues = dimValues.getMaxPackedValue("field"); + byte[] scratch = new byte[numBytesPerDim]; + for(int dim=0;dim " + values[dim]); @@ -148,6 +158,13 @@ public class TestBKD extends LuceneTestCase { in.seek(indexFP); BKDReader r = new BKDReader(in); + byte[] minPackedValue = r.getMinPackedValue(); + byte[] maxPackedValue = r.getMaxPackedValue(); + for(int dim=0;dim