mirror of https://github.com/apache/lucene.git
LUCENE-7371: Better compression of values in Lucene60PointsFormat.
This commit is contained in:
parent
e92a38af90
commit
866398bea6
|
@ -117,6 +117,9 @@ Optimizations
|
||||||
|
|
||||||
* LUCENE-7351: Doc id compression for points. (Adrien Grand)
|
* LUCENE-7351: Doc id compression for points. (Adrien Grand)
|
||||||
|
|
||||||
|
* LUCENE-7371: Point values are now better compressed using run-length
|
||||||
|
encoding. (Adrien Grand)
|
||||||
|
|
||||||
Other
|
Other
|
||||||
|
|
||||||
* LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien
|
* LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.codecs.simpletext;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.function.IntFunction;
|
||||||
|
|
||||||
import org.apache.lucene.codecs.PointsReader;
|
import org.apache.lucene.codecs.PointsReader;
|
||||||
import org.apache.lucene.codecs.PointsWriter;
|
import org.apache.lucene.codecs.PointsWriter;
|
||||||
|
@ -161,12 +162,15 @@ class SimpleTextPointsWriter extends PointsWriter {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes, int bytesOffset) throws IOException {
|
protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
|
||||||
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
|
for (int i = 0; i < count; ++i) {
|
||||||
write(out, BLOCK_VALUE);
|
BytesRef packedValue = packedValues.apply(i);
|
||||||
write(out, new BytesRef(bytes, bytesOffset, packedBytesLength).toString());
|
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
|
||||||
newline(out);
|
write(out, BLOCK_VALUE);
|
||||||
}
|
write(out, packedValue.toString());
|
||||||
|
newline(out);
|
||||||
|
}
|
||||||
|
}
|
||||||
}) {
|
}) {
|
||||||
|
|
||||||
values.intersect(fieldInfo.name, new IntersectVisitor() {
|
values.intersect(fieldInfo.name, new IntersectVisitor() {
|
||||||
|
|
|
@ -20,6 +20,7 @@ import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
|
import org.apache.lucene.index.CorruptIndexException;
|
||||||
import org.apache.lucene.index.PointValues.IntersectVisitor;
|
import org.apache.lucene.index.PointValues.IntersectVisitor;
|
||||||
import org.apache.lucene.index.PointValues.Relation;
|
import org.apache.lucene.index.PointValues.Relation;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
@ -345,6 +346,63 @@ public class BKDReader implements Accountable {
|
||||||
|
|
||||||
protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
|
protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
|
||||||
visitor.grow(count);
|
visitor.grow(count);
|
||||||
|
|
||||||
|
readCommonPrefixes(commonPrefixLengths, scratchPackedValue, in);
|
||||||
|
|
||||||
|
int compressedDim = version < BKDWriter.VERSION_COMPRESSED_VALUES
|
||||||
|
? -1
|
||||||
|
: readCompressedDim(in);
|
||||||
|
|
||||||
|
if (compressedDim == -1) {
|
||||||
|
visitRawDocValues(commonPrefixLengths, scratchPackedValue, in, docIDs, count, visitor);
|
||||||
|
} else {
|
||||||
|
visitCompressedDocValues(commonPrefixLengths, scratchPackedValue, in, docIDs, count, visitor, compressedDim);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Just read suffixes for every dimension
|
||||||
|
private void visitRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
|
||||||
|
for (int i = 0; i < count; ++i) {
|
||||||
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
|
int prefix = commonPrefixLengths[dim];
|
||||||
|
in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
|
||||||
|
}
|
||||||
|
visitor.visit(docIDs[i], scratchPackedValue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void visitCompressedDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor, int compressedDim) throws IOException {
|
||||||
|
// the byte at `compressedByteOffset` is compressed using run-length compression,
|
||||||
|
// other suffix bytes are stored verbatim
|
||||||
|
final int compressedByteOffset = compressedDim * bytesPerDim + commonPrefixLengths[compressedDim];
|
||||||
|
commonPrefixLengths[compressedDim]++;
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < count; ) {
|
||||||
|
scratchPackedValue[compressedByteOffset] = in.readByte();
|
||||||
|
final int runLen = Byte.toUnsignedInt(in.readByte());
|
||||||
|
for (int j = 0; j < runLen; ++j) {
|
||||||
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
|
int prefix = commonPrefixLengths[dim];
|
||||||
|
in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
|
||||||
|
}
|
||||||
|
visitor.visit(docIDs[i+j], scratchPackedValue);
|
||||||
|
}
|
||||||
|
i += runLen;
|
||||||
|
}
|
||||||
|
if (i != count) {
|
||||||
|
throw new CorruptIndexException("Sub blocks do not add up to the expected count: " + count + " != " + i, in);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int readCompressedDim(IndexInput in) throws IOException {
|
||||||
|
int compressedDim = in.readByte();
|
||||||
|
if (compressedDim < -1 || compressedDim >= numDims) {
|
||||||
|
throw new CorruptIndexException("Got compressedDim="+compressedDim, in);
|
||||||
|
}
|
||||||
|
return compressedDim;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readCommonPrefixes(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in) throws IOException {
|
||||||
for(int dim=0;dim<numDims;dim++) {
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
int prefix = in.readVInt();
|
int prefix = in.readVInt();
|
||||||
commonPrefixLengths[dim] = prefix;
|
commonPrefixLengths[dim] = prefix;
|
||||||
|
@ -353,13 +411,6 @@ public class BKDReader implements Accountable {
|
||||||
}
|
}
|
||||||
//System.out.println("R: " + dim + " of " + numDims + " prefix=" + prefix);
|
//System.out.println("R: " + dim + " of " + numDims + " prefix=" + prefix);
|
||||||
}
|
}
|
||||||
for(int i=0;i<count;i++) {
|
|
||||||
for(int dim=0;dim<numDims;dim++) {
|
|
||||||
int prefix = commonPrefixLengths[dim];
|
|
||||||
in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
|
|
||||||
}
|
|
||||||
visitor.visit(docIDs[i], scratchPackedValue);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void intersect(IntersectState state,
|
private void intersect(IntersectState state,
|
||||||
|
|
|
@ -22,9 +22,12 @@ import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.function.IntFunction;
|
||||||
|
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.index.MergeState;
|
import org.apache.lucene.index.MergeState;
|
||||||
|
import org.apache.lucene.index.PointValues.IntersectVisitor;
|
||||||
|
import org.apache.lucene.index.PointValues.Relation;
|
||||||
import org.apache.lucene.store.ChecksumIndexInput;
|
import org.apache.lucene.store.ChecksumIndexInput;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
|
@ -43,7 +46,6 @@ import org.apache.lucene.util.PriorityQueue;
|
||||||
import org.apache.lucene.util.StringHelper;
|
import org.apache.lucene.util.StringHelper;
|
||||||
|
|
||||||
// TODO
|
// TODO
|
||||||
// - the compression is somewhat stupid now (delta vInt for 1024 docIDs, no compression for the byte[] values even though they have high locality)
|
|
||||||
// - allow variable length byte[] (across docs and dims), but this is quite a bit more hairy
|
// - allow variable length byte[] (across docs and dims), but this is quite a bit more hairy
|
||||||
// - we could also index "auto-prefix terms" here, and use better compression, and maybe only use for the "fully contained" case so we'd
|
// - we could also index "auto-prefix terms" here, and use better compression, and maybe only use for the "fully contained" case so we'd
|
||||||
// only index docIDs
|
// only index docIDs
|
||||||
|
@ -60,7 +62,7 @@ import org.apache.lucene.util.StringHelper;
|
||||||
* the requested <code>maxPointsInLeafNode</code>. Values that fall exactly
|
* the requested <code>maxPointsInLeafNode</code>. Values that fall exactly
|
||||||
* on a cell boundary may be in either cell.
|
* on a cell boundary may be in either cell.
|
||||||
*
|
*
|
||||||
* <p>The number of dimensions can be 1 to 255, but every byte[] value is fixed length.
|
* <p>The number of dimensions can be 1 to 8, but every byte[] value is fixed length.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* See <a href="https://www.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf">this paper</a> for details.
|
* See <a href="https://www.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf">this paper</a> for details.
|
||||||
|
@ -69,7 +71,7 @@ import org.apache.lucene.util.StringHelper;
|
||||||
* and then uses up to the specified {@code maxMBSortInHeap} heap space for writing.
|
* and then uses up to the specified {@code maxMBSortInHeap} heap space for writing.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* <b>NOTE</b>: This can write at most Integer.MAX_VALUE * <code>maxPointsInLeafNode</code> total points, and
|
* <b>NOTE</b>: This can write at most Integer.MAX_VALUE * <code>maxPointsInLeafNode</code> total points.
|
||||||
*
|
*
|
||||||
* @lucene.experimental */
|
* @lucene.experimental */
|
||||||
|
|
||||||
|
@ -78,7 +80,8 @@ public class BKDWriter implements Closeable {
|
||||||
public static final String CODEC_NAME = "BKD";
|
public static final String CODEC_NAME = "BKD";
|
||||||
public static final int VERSION_START = 0;
|
public static final int VERSION_START = 0;
|
||||||
public static final int VERSION_COMPRESSED_DOC_IDS = 1;
|
public static final int VERSION_COMPRESSED_DOC_IDS = 1;
|
||||||
public static final int VERSION_CURRENT = VERSION_COMPRESSED_DOC_IDS;
|
public static final int VERSION_COMPRESSED_VALUES = 2;
|
||||||
|
public static final int VERSION_CURRENT = VERSION_COMPRESSED_VALUES;
|
||||||
|
|
||||||
/** How many bytes each doc takes in the fixed-width offline format */
|
/** How many bytes each doc takes in the fixed-width offline format */
|
||||||
private final int bytesPerDoc;
|
private final int bytesPerDoc;
|
||||||
|
@ -312,6 +315,8 @@ public class BKDWriter implements Closeable {
|
||||||
/** Which leaf block we are up to */
|
/** Which leaf block we are up to */
|
||||||
private int blockID;
|
private int blockID;
|
||||||
|
|
||||||
|
private final byte[] packedValues;
|
||||||
|
|
||||||
public MergeReader(BKDReader bkd, MergeState.DocMap docMap) throws IOException {
|
public MergeReader(BKDReader bkd, MergeState.DocMap docMap) throws IOException {
|
||||||
this.bkd = bkd;
|
this.bkd = bkd;
|
||||||
state = new BKDReader.IntersectState(bkd.in.clone(),
|
state = new BKDReader.IntersectState(bkd.in.clone(),
|
||||||
|
@ -327,6 +332,7 @@ public class BKDWriter implements Closeable {
|
||||||
//System.out.println(" leaf fp=" + fp);
|
//System.out.println(" leaf fp=" + fp);
|
||||||
}
|
}
|
||||||
state.in.seek(minFP);
|
state.in.seek(minFP);
|
||||||
|
this.packedValues = new byte[bkd.maxPointsInLeafNode * bkd.packedBytesLength];
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean next() throws IOException {
|
public boolean next() throws IOException {
|
||||||
|
@ -341,18 +347,33 @@ public class BKDWriter implements Closeable {
|
||||||
docsInBlock = bkd.readDocIDs(state.in, state.in.getFilePointer(), state.scratchDocIDs);
|
docsInBlock = bkd.readDocIDs(state.in, state.in.getFilePointer(), state.scratchDocIDs);
|
||||||
assert docsInBlock > 0;
|
assert docsInBlock > 0;
|
||||||
docBlockUpto = 0;
|
docBlockUpto = 0;
|
||||||
for(int dim=0;dim<bkd.numDims;dim++) {
|
bkd.visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, docsInBlock, new IntersectVisitor() {
|
||||||
int prefix = state.in.readVInt();
|
int i = 0;
|
||||||
state.commonPrefixLengths[dim] = prefix;
|
|
||||||
if (prefix > 0) {
|
@Override
|
||||||
state.in.readBytes(state.scratchPackedValue, dim*bkd.bytesPerDim, prefix);
|
public void visit(int docID) throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
@Override
|
||||||
|
public void visit(int docID, byte[] packedValue) throws IOException {
|
||||||
|
assert docID == state.scratchDocIDs[i];
|
||||||
|
System.arraycopy(packedValue, 0, packedValues, i * bkd.packedBytesLength, bkd.packedBytesLength);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
blockID++;
|
blockID++;
|
||||||
}
|
}
|
||||||
|
|
||||||
int oldDocID = state.scratchDocIDs[docBlockUpto++];
|
final int index = docBlockUpto++;
|
||||||
|
int oldDocID = state.scratchDocIDs[index];
|
||||||
|
|
||||||
int mappedDocID;
|
int mappedDocID;
|
||||||
if (docMap == null) {
|
if (docMap == null) {
|
||||||
|
@ -360,13 +381,11 @@ public class BKDWriter implements Closeable {
|
||||||
} else {
|
} else {
|
||||||
mappedDocID = docMap.get(oldDocID);
|
mappedDocID = docMap.get(oldDocID);
|
||||||
}
|
}
|
||||||
for(int dim=0;dim<bkd.numDims;dim++) {
|
|
||||||
int prefix = state.commonPrefixLengths[dim];
|
|
||||||
state.in.readBytes(state.scratchPackedValue, dim*bkd.bytesPerDim + prefix, bkd.bytesPerDim - prefix);
|
|
||||||
}
|
|
||||||
if (mappedDocID != -1) {
|
if (mappedDocID != -1) {
|
||||||
// Not deleted!
|
// Not deleted!
|
||||||
docID = mappedDocID;
|
docID = mappedDocID;
|
||||||
|
System.arraycopy(packedValues, index * bkd.packedBytesLength, state.scratchPackedValue, 0, bkd.packedBytesLength);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -518,10 +537,21 @@ public class BKDWriter implements Closeable {
|
||||||
writeLeafBlockDocs(out, leafBlockDocIDs, 0, leafCount);
|
writeLeafBlockDocs(out, leafBlockDocIDs, 0, leafCount);
|
||||||
writeCommonPrefixes(out, commonPrefixLengths, firstPackedValue);
|
writeCommonPrefixes(out, commonPrefixLengths, firstPackedValue);
|
||||||
|
|
||||||
// Write the full values:
|
final IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
|
||||||
for (int i=0;i<leafCount;i++) {
|
final BytesRef scratch = new BytesRef();
|
||||||
writeLeafBlockPackedValue(out, commonPrefixLengths, leafBlockPackedValues[i], 0);
|
|
||||||
}
|
{
|
||||||
|
scratch.length = packedBytesLength;
|
||||||
|
scratch.offset = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BytesRef apply(int i) {
|
||||||
|
scratch.bytes = leafBlockPackedValues[i];
|
||||||
|
return scratch;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
writeLeafBlockPackedValues(out, commonPrefixLengths, leafCount, 0, packedValues);
|
||||||
|
|
||||||
leafCount = 0;
|
leafCount = 0;
|
||||||
}
|
}
|
||||||
|
@ -896,13 +926,57 @@ public class BKDWriter implements Closeable {
|
||||||
DocIdsWriter.writeDocIds(docIDs, start, count, out);
|
DocIdsWriter.writeDocIds(docIDs, start, count, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes, int offset) throws IOException {
|
protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
|
||||||
for(int dim=0;dim<numDims;dim++) {
|
int prefixLenSum = Arrays.stream(commonPrefixLengths).sum();
|
||||||
int prefix = commonPrefixLengths[dim];
|
if (prefixLenSum == packedBytesLength) {
|
||||||
out.writeBytes(bytes, offset+dim*bytesPerDim+prefix, bytesPerDim-prefix);
|
// all values in this block are equal
|
||||||
|
out.writeByte((byte) -1);
|
||||||
|
} else {
|
||||||
|
assert commonPrefixLengths[sortedDim] < bytesPerDim;
|
||||||
|
out.writeByte((byte) sortedDim);
|
||||||
|
int compressedByteOffset = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim];
|
||||||
|
commonPrefixLengths[sortedDim]++;
|
||||||
|
for (int i = 0; i < count; ) {
|
||||||
|
// do run-length compression on the byte at compressedByteOffset
|
||||||
|
int runLen = runLen(packedValues, i, Math.min(i + 0xff, count), compressedByteOffset);
|
||||||
|
assert runLen <= 0xff;
|
||||||
|
BytesRef first = packedValues.apply(i);
|
||||||
|
byte prefixByte = first.bytes[first.offset + compressedByteOffset];
|
||||||
|
out.writeByte(prefixByte);
|
||||||
|
out.writeByte((byte) runLen);
|
||||||
|
writeLeafBlockPackedValuesRange(out, commonPrefixLengths, i, i + runLen, packedValues);
|
||||||
|
i += runLen;
|
||||||
|
assert i <= count;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void writeLeafBlockPackedValuesRange(IndexOutput out, int[] commonPrefixLengths, int start, int end, IntFunction<BytesRef> packedValues) throws IOException {
|
||||||
|
for (int i = start; i < end; ++i) {
|
||||||
|
BytesRef ref = packedValues.apply(i);
|
||||||
|
assert ref.length == packedBytesLength;
|
||||||
|
|
||||||
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
|
int prefix = commonPrefixLengths[dim];
|
||||||
|
out.writeBytes(ref.bytes, ref.offset + dim*bytesPerDim + prefix, bytesPerDim-prefix);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int runLen(IntFunction<BytesRef> packedValues, int start, int end, int byteOffset) {
|
||||||
|
BytesRef first = packedValues.apply(start);
|
||||||
|
byte b = first.bytes[first.offset + byteOffset];
|
||||||
|
for (int i = start + 1; i < end; ++i) {
|
||||||
|
BytesRef ref = packedValues.apply(i);
|
||||||
|
byte b2 = ref.bytes[ref.offset + byteOffset];
|
||||||
|
assert Byte.toUnsignedInt(b2) >= Byte.toUnsignedInt(b);
|
||||||
|
if (b != b2) {
|
||||||
|
return i - start;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return end - start;
|
||||||
|
}
|
||||||
|
|
||||||
protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException {
|
protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException {
|
||||||
for(int dim=0;dim<numDims;dim++) {
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
out.writeVInt(commonPrefixes[dim]);
|
out.writeVInt(commonPrefixes[dim]);
|
||||||
|
@ -1058,6 +1132,11 @@ public class BKDWriter implements Closeable {
|
||||||
if (nodeID >= leafNodeOffset) {
|
if (nodeID >= leafNodeOffset) {
|
||||||
|
|
||||||
// Leaf node: write block
|
// Leaf node: write block
|
||||||
|
// We can write the block in any order so by default we write it sorted by the dimension that has the
|
||||||
|
// least number of unique bytes at commonPrefixLengths[dim], which makes compression more efficient
|
||||||
|
int sortedDim = 0;
|
||||||
|
int sortedDimCardinality = Integer.MAX_VALUE;
|
||||||
|
|
||||||
for (int dim=0;dim<numDims;dim++) {
|
for (int dim=0;dim<numDims;dim++) {
|
||||||
if (slices[dim].writer instanceof HeapPointWriter == false) {
|
if (slices[dim].writer instanceof HeapPointWriter == false) {
|
||||||
// Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started
|
// Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started
|
||||||
|
@ -1081,9 +1160,29 @@ public class BKDWriter implements Closeable {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int prefix = commonPrefixLengths[dim];
|
||||||
|
if (prefix < bytesPerDim) {
|
||||||
|
int cardinality = 1;
|
||||||
|
byte previous = scratch1[offset + prefix];
|
||||||
|
for (long i = 1; i < source.count; ++i) {
|
||||||
|
heapSource.readPackedValue(Math.toIntExact(source.start + i), scratch2);
|
||||||
|
byte b = scratch2[offset + prefix];
|
||||||
|
assert Byte.toUnsignedInt(previous) <= Byte.toUnsignedInt(b);
|
||||||
|
if (b != previous) {
|
||||||
|
cardinality++;
|
||||||
|
previous = b;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert cardinality <= 256;
|
||||||
|
if (cardinality < sortedDimCardinality) {
|
||||||
|
sortedDim = dim;
|
||||||
|
sortedDimCardinality = cardinality;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
PathSlice source = slices[0];
|
PathSlice source = slices[sortedDim];
|
||||||
|
|
||||||
// We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point:
|
// We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point:
|
||||||
HeapPointWriter heapSource = (HeapPointWriter) source.writer;
|
HeapPointWriter heapSource = (HeapPointWriter) source.writer;
|
||||||
|
@ -1105,15 +1204,21 @@ public class BKDWriter implements Closeable {
|
||||||
writeCommonPrefixes(out, commonPrefixLengths, scratch1);
|
writeCommonPrefixes(out, commonPrefixLengths, scratch1);
|
||||||
|
|
||||||
// Write the full values:
|
// Write the full values:
|
||||||
byte[] lastPackedValue = new byte[bytesPerDim];
|
IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
|
||||||
for (int i=0;i<count;i++) {
|
final BytesRef scratch = new BytesRef();
|
||||||
heapSource.getPackedValueSlice(Math.toIntExact(source.start + i), scratchBytesRef);
|
|
||||||
assert numDims != 1 || valueInOrder(i, lastPackedValue, scratchBytesRef.bytes, scratchBytesRef.offset);
|
|
||||||
|
|
||||||
// Make sure this value does in fact fall within this leaf cell:
|
{
|
||||||
assert valueInBounds(scratchBytesRef, minPackedValue, maxPackedValue);
|
scratch.length = packedBytesLength;
|
||||||
writeLeafBlockPackedValue(out, commonPrefixLengths, scratchBytesRef.bytes, scratchBytesRef.offset);
|
}
|
||||||
}
|
|
||||||
|
@Override
|
||||||
|
public BytesRef apply(int i) {
|
||||||
|
heapSource.getPackedValueSlice(Math.toIntExact(source.start + i), scratch);
|
||||||
|
return scratch;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
assert valuesInOrderAndBounds(count, minPackedValue, maxPackedValue, packedValues);
|
||||||
|
writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// Inner node: partition/recurse
|
// Inner node: partition/recurse
|
||||||
|
@ -1215,6 +1320,20 @@ public class BKDWriter implements Closeable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// only called from assert
|
||||||
|
private boolean valuesInOrderAndBounds(int count, byte[] minPackedValue, byte[] maxPackedValue, IntFunction<BytesRef> values) throws IOException {
|
||||||
|
byte[] lastPackedValue = new byte[bytesPerDim];
|
||||||
|
for (int i=0;i<count;i++) {
|
||||||
|
BytesRef packedValue = values.apply(i);
|
||||||
|
assert packedValue.length == packedBytesLength;
|
||||||
|
assert numDims != 1 || valueInOrder(i, lastPackedValue, packedValue.bytes, packedValue.offset);
|
||||||
|
|
||||||
|
// Make sure this value does in fact fall within this leaf cell:
|
||||||
|
assert valueInBounds(packedValue, minPackedValue, maxPackedValue);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
// only called from assert
|
// only called from assert
|
||||||
private boolean valueInOrder(long ord, byte[] lastPackedValue, byte[] packedValue, int packedValueOffset) {
|
private boolean valueInOrder(long ord, byte[] lastPackedValue, byte[] packedValue, int packedValueOffset) {
|
||||||
if (ord > 0 && StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, packedValueOffset) > 0) {
|
if (ord > 0 && StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, packedValueOffset) > 0) {
|
||||||
|
|
|
@ -507,6 +507,35 @@ public class TestBKD extends LuceneTestCase {
|
||||||
verify(docValues, null, numDims, numBytesPerDim);
|
verify(docValues, null, numDims, numBytesPerDim);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// this should trigger run-length compression with lengths that are greater than 255
|
||||||
|
public void testOneDimTwoValues() throws Exception {
|
||||||
|
int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
|
||||||
|
int numDims = TestUtil.nextInt(random(), 1, 5);
|
||||||
|
|
||||||
|
int numDocs = atLeast(1000);
|
||||||
|
int theDim = random().nextInt(numDims);
|
||||||
|
byte[] value1 = new byte[numBytesPerDim];
|
||||||
|
random().nextBytes(value1);
|
||||||
|
byte[] value2 = new byte[numBytesPerDim];
|
||||||
|
random().nextBytes(value2);
|
||||||
|
byte[][][] docValues = new byte[numDocs][][];
|
||||||
|
|
||||||
|
for(int docID=0;docID<numDocs;docID++) {
|
||||||
|
byte[][] values = new byte[numDims][];
|
||||||
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
|
if (dim == theDim) {
|
||||||
|
values[dim] = random().nextBoolean() ? value1 : value2;
|
||||||
|
} else {
|
||||||
|
values[dim] = new byte[numBytesPerDim];
|
||||||
|
random().nextBytes(values[dim]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
docValues[docID] = values;
|
||||||
|
}
|
||||||
|
|
||||||
|
verify(docValues, null, numDims, numBytesPerDim);
|
||||||
|
}
|
||||||
|
|
||||||
public void testMultiValued() throws Exception {
|
public void testMultiValued() throws Exception {
|
||||||
int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
|
int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
|
||||||
int numDims = TestUtil.nextInt(random(), 1, 5);
|
int numDims = TestUtil.nextInt(random(), 1, 5);
|
||||||
|
|
|
@ -327,6 +327,35 @@ public abstract class BasePointsFormatTestCase extends BaseIndexFileFormatTestCa
|
||||||
verify(docValues, null, numDims, numBytesPerDim);
|
verify(docValues, null, numDims, numBytesPerDim);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// this should trigger run-length compression with lengths that are greater than 255
|
||||||
|
public void testOneDimTwoValues() throws Exception {
|
||||||
|
int numBytesPerDim = TestUtil.nextInt(random(), 2, PointValues.MAX_NUM_BYTES);
|
||||||
|
int numDims = TestUtil.nextInt(random(), 1, PointValues.MAX_DIMENSIONS);
|
||||||
|
|
||||||
|
int numDocs = atLeast(1000);
|
||||||
|
int theDim = random().nextInt(numDims);
|
||||||
|
byte[] value1 = new byte[numBytesPerDim];
|
||||||
|
random().nextBytes(value1);
|
||||||
|
byte[] value2 = new byte[numBytesPerDim];
|
||||||
|
random().nextBytes(value2);
|
||||||
|
byte[][][] docValues = new byte[numDocs][][];
|
||||||
|
|
||||||
|
for(int docID=0;docID<numDocs;docID++) {
|
||||||
|
byte[][] values = new byte[numDims][];
|
||||||
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
|
if (dim == theDim) {
|
||||||
|
values[dim] = random().nextBoolean() ? value1 : value2;
|
||||||
|
} else {
|
||||||
|
values[dim] = new byte[numBytesPerDim];
|
||||||
|
random().nextBytes(values[dim]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
docValues[docID] = values;
|
||||||
|
}
|
||||||
|
|
||||||
|
verify(docValues, null, numDims, numBytesPerDim);
|
||||||
|
}
|
||||||
|
|
||||||
// Tests on N-dimensional points where each dimension is a BigInteger
|
// Tests on N-dimensional points where each dimension is a BigInteger
|
||||||
public void testBigIntNDims() throws Exception {
|
public void testBigIntNDims() throws Exception {
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue