mirror of https://github.com/apache/lucene.git
LUCENE-8879: Improve BKDRadixSelector tests
This change adds explicit test for the sorting capabilities.
This commit is contained in:
parent
9fd51bab09
commit
afb4b928bc
|
@ -121,7 +121,9 @@ Other
|
|||
* LUCENE-8778: Define analyzer SPI names as static final fields and document the names in Javadocs.
|
||||
(Tomoko Uchida, Uwe Schindler)
|
||||
|
||||
* LUCENE-8838: Remove support for Steiner points on Tessellator. (Ignacio Vera)
|
||||
* LUCENE-8838: Remove support for Steiner points on Tessellator. (Ignacio Vera)
|
||||
|
||||
* LUCENE-8879: Improve BKDRadixSelector tests. (Ignacio Vera)
|
||||
|
||||
======================= Lucene 8.1.1 =======================
|
||||
(No Changes)
|
||||
|
|
|
@ -49,7 +49,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
|
|||
points.append(value, 3);
|
||||
points.close();
|
||||
PointWriter copy = copyPoints(dir,points, packedLength);
|
||||
verify(dir, copy, dimensions, 0, values, middle, packedLength, bytesPerDimensions, 0);
|
||||
verify(dir, copy, dimensions, dimensions, 0, values, middle, packedLength, bytesPerDimensions, 0);
|
||||
dir.close();
|
||||
}
|
||||
|
||||
|
@ -80,9 +80,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
|
|||
}
|
||||
int partitionPoint = TestUtil.nextInt(random(), start + 1, end - 1);
|
||||
int sortedOnHeap = random().nextInt(5000);
|
||||
int dimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
|
||||
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
|
||||
int packedLength = dimensions * bytesPerDimensions;
|
||||
int packedLength = dataDimensions * bytesPerDimensions;
|
||||
PointWriter points = getRandomPointWriter(dir, values, packedLength);
|
||||
byte[] value = new byte[packedLength];
|
||||
for (int i =0; i < values; i++) {
|
||||
|
@ -90,7 +91,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
|
|||
points.append(value, i);
|
||||
}
|
||||
points.close();
|
||||
verify(dir, points, dimensions, start, end, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
|
||||
verify(dir, points, dataDimensions, indexDimensions, start, end, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
|
||||
dir.close();
|
||||
}
|
||||
|
||||
|
@ -113,7 +114,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
points.close();
|
||||
verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
|
||||
verify(dir, points, dimensions, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
|
||||
dir.close();
|
||||
}
|
||||
|
||||
|
@ -122,9 +123,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
|
|||
Directory dir = getDirectory(values);
|
||||
int partitionPoint = random().nextInt(values);
|
||||
int sortedOnHeap = random().nextInt(5000);
|
||||
int dimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
|
||||
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
|
||||
int packedLength = dimensions * bytesPerDimensions;
|
||||
int packedLength = dataDimensions * bytesPerDimensions;
|
||||
PointWriter points = getRandomPointWriter(dir, values, packedLength);
|
||||
byte[] value = new byte[packedLength];
|
||||
random().nextBytes(value);
|
||||
|
@ -136,7 +138,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
points.close();
|
||||
verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
|
||||
verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
|
||||
dir.close();
|
||||
}
|
||||
|
||||
|
@ -145,9 +147,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
|
|||
Directory dir = getDirectory(values);
|
||||
int partitionPoint = random().nextInt(values);
|
||||
int sortedOnHeap = random().nextInt(5000);
|
||||
int dimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
|
||||
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
|
||||
int packedLength = dimensions * bytesPerDimensions;
|
||||
int packedLength = dataDimensions * bytesPerDimensions;
|
||||
PointWriter points = getRandomPointWriter(dir, values, packedLength);
|
||||
byte[] value = new byte[packedLength];
|
||||
random().nextBytes(value);
|
||||
|
@ -155,7 +158,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
|
|||
points.append(value, 0);
|
||||
}
|
||||
points.close();
|
||||
verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
|
||||
verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
|
||||
dir.close();
|
||||
}
|
||||
|
||||
|
@ -164,9 +167,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
|
|||
Directory dir = getDirectory(values);
|
||||
int partitionPoint = random().nextInt(values);
|
||||
int sortedOnHeap = random().nextInt(5000);
|
||||
int dimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
|
||||
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
|
||||
int packedLength = dimensions * bytesPerDimensions;
|
||||
int packedLength = dataDimensions * bytesPerDimensions;
|
||||
PointWriter points = getRandomPointWriter(dir, values, packedLength);
|
||||
int numberValues = random().nextInt(8) + 2;
|
||||
byte[][] differentValues = new byte[numberValues][packedLength];
|
||||
|
@ -177,14 +181,37 @@ public class TestBKDRadixSelector extends LuceneTestCase {
|
|||
points.append(differentValues[random().nextInt(numberValues)], i);
|
||||
}
|
||||
points.close();
|
||||
verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
|
||||
verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
|
||||
dir.close();
|
||||
}
|
||||
|
||||
private void verify(Directory dir, PointWriter points, int dimensions, long start, long end, long middle, int packedLength, int bytesPerDimensions, int sortedOnHeap) throws IOException{
|
||||
BKDRadixSelector radixSelector = new BKDRadixSelector(dimensions, bytesPerDimensions, sortedOnHeap, dir, "test");
|
||||
//we check for each dimension
|
||||
for (int splitDim =0; splitDim < dimensions; splitDim++) {
|
||||
public void testRandomDataDimDiffValues() throws IOException {
|
||||
int values = atLeast(15000);
|
||||
Directory dir = getDirectory(values);
|
||||
int partitionPoint = random().nextInt(values);
|
||||
int sortedOnHeap = random().nextInt(5000);
|
||||
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
|
||||
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
|
||||
int packedLength = dataDimensions * bytesPerDimensions;
|
||||
PointWriter points = getRandomPointWriter(dir, values, packedLength);
|
||||
byte[] value = new byte[packedLength];
|
||||
byte[] dataValue = new byte[(dataDimensions - indexDimensions) * bytesPerDimensions];
|
||||
random().nextBytes(value);
|
||||
for (int i =0; i < values; i++) {
|
||||
random().nextBytes(dataValue);
|
||||
System.arraycopy(dataValue, 0, value, indexDimensions * bytesPerDimensions, (dataDimensions - indexDimensions) * bytesPerDimensions);
|
||||
points.append(value, i);
|
||||
}
|
||||
points.close();
|
||||
verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
|
||||
dir.close();
|
||||
}
|
||||
|
||||
private void verify(Directory dir, PointWriter points, int dataDimensions, int indexDimensions, long start, long end, long middle, int packedLength, int bytesPerDimensions, int sortedOnHeap) throws IOException{
|
||||
BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDimensions, sortedOnHeap, dir, "test");
|
||||
//we only split by indexed dimension so we check for each only those dimension
|
||||
for (int splitDim = 0; splitDim < indexDimensions; splitDim++) {
|
||||
//We need to make a copy of the data as it is deleted in the process
|
||||
BKDRadixSelector.PathSlice inputSlice = new BKDRadixSelector.PathSlice(copyPoints(dir, points, packedLength), 0, points.count());
|
||||
int commonPrefixLengthInput = getRandomCommonPrefix(inputSlice, bytesPerDimensions, splitDim);
|
||||
|
@ -209,7 +236,6 @@ public class TestBKDRadixSelector extends LuceneTestCase {
|
|||
points.destroy();
|
||||
}
|
||||
|
||||
|
||||
private PointWriter copyPoints(Directory dir, PointWriter points, int packedLength) throws IOException {
|
||||
try (PointWriter copy = getRandomPointWriter(dir, points.count(), packedLength);
|
||||
PointReader reader = points.getReader(0, points.count())) {
|
||||
|
@ -319,5 +345,4 @@ public class TestBKDRadixSelector extends LuceneTestCase {
|
|||
}
|
||||
return docID;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,163 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.util.bkd;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
public class TestBKDRadixSort extends LuceneTestCase {
|
||||
|
||||
public void testRandom() throws IOException {
|
||||
int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
|
||||
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
|
||||
int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
|
||||
int packedBytesLength = dataDimensions * bytesPerDim;
|
||||
HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
|
||||
byte[] value = new byte[packedBytesLength];
|
||||
for (int i = 0; i < numPoints; i++) {
|
||||
random().nextBytes(value);
|
||||
points.append(value, i);
|
||||
}
|
||||
verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
|
||||
}
|
||||
|
||||
public void testRandomAllEquals() throws IOException {
|
||||
int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
|
||||
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
|
||||
int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
|
||||
int packedBytesLength = dataDimensions * bytesPerDim;
|
||||
HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
|
||||
byte[] value = new byte[packedBytesLength];
|
||||
random().nextBytes(value);
|
||||
for (int i = 0; i < numPoints; i++) {
|
||||
points.append(value, random().nextInt(numPoints));
|
||||
}
|
||||
verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
|
||||
}
|
||||
|
||||
public void testRandomLastByteTwoValues() throws IOException {
|
||||
int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
|
||||
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
|
||||
int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
|
||||
int packedBytesLength = dataDimensions * bytesPerDim;
|
||||
HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
|
||||
byte[] value = new byte[packedBytesLength];
|
||||
random().nextBytes(value);
|
||||
for (int i = 0; i < numPoints; i++) {
|
||||
if (random().nextBoolean()) {
|
||||
points.append(value, 1);
|
||||
} else {
|
||||
points.append(value, 2);
|
||||
}
|
||||
}
|
||||
verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
|
||||
}
|
||||
|
||||
public void testRandomFewDifferentValues() throws IOException {
|
||||
int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
|
||||
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
|
||||
int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
|
||||
int packedBytesLength = dataDimensions * bytesPerDim;
|
||||
HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
|
||||
int numberValues = random().nextInt(8) + 2;
|
||||
byte[][] differentValues = new byte[numberValues][packedBytesLength];
|
||||
for (int i = 0; i < numberValues; i++) {
|
||||
random().nextBytes(differentValues[i]);
|
||||
}
|
||||
for (int i = 0; i < numPoints; i++) {
|
||||
points.append(differentValues[random().nextInt(numberValues)], i);
|
||||
}
|
||||
verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
|
||||
}
|
||||
|
||||
public void testRandomDataDimDifferent() throws IOException {
|
||||
int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
|
||||
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
|
||||
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
|
||||
int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
|
||||
int packedBytesLength = dataDimensions * bytesPerDim;
|
||||
HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
|
||||
byte[] value = new byte[packedBytesLength];
|
||||
int totalDataDimension = dataDimensions - indexDimensions;
|
||||
byte[] dataDimensionValues = new byte[totalDataDimension * bytesPerDim];
|
||||
random().nextBytes(value);
|
||||
for (int i = 0; i < numPoints; i++) {
|
||||
random().nextBytes(dataDimensionValues);
|
||||
System.arraycopy(dataDimensionValues, 0, value, indexDimensions * bytesPerDim, totalDataDimension * bytesPerDim);
|
||||
points.append(value, random().nextInt(numPoints));
|
||||
}
|
||||
verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
|
||||
}
|
||||
|
||||
private void verifySort(HeapPointWriter points, int dataDimensions, int indexDimensions, int start, int end, int bytesPerDim) throws IOException{
|
||||
int packedBytesLength = dataDimensions * bytesPerDim;
|
||||
Directory dir = newDirectory();
|
||||
BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDim, 1000, dir, "test");
|
||||
// we check for each dimension
|
||||
for (int splitDim = 0; splitDim < dataDimensions; splitDim++) {
|
||||
radixSelector.heapRadixSort(points, start, end, splitDim, getRandomCommonPrefix(points, start, end, bytesPerDim, splitDim));
|
||||
byte[] previous = new byte[bytesPerDim * dataDimensions];
|
||||
int previousDocId = -1;
|
||||
Arrays.fill(previous, (byte) 0);
|
||||
int dimOffset = splitDim * bytesPerDim;
|
||||
for (int j = start; j < end; j++) {
|
||||
PointValue pointValue = points.getPackedValueSlice(j);
|
||||
BytesRef value = pointValue.packedValue();
|
||||
int cmp = Arrays.compareUnsigned(value.bytes, value.offset + dimOffset, value.offset + dimOffset + bytesPerDim, previous, dimOffset, dimOffset + bytesPerDim);
|
||||
assertTrue(cmp >= 0);
|
||||
if (cmp == 0) {
|
||||
assertTrue(pointValue.docID() >= previousDocId);
|
||||
}
|
||||
System.arraycopy(value.bytes, value.offset, previous, 0, packedBytesLength);
|
||||
previousDocId = pointValue.docID();
|
||||
}
|
||||
}
|
||||
dir.close();
|
||||
}
|
||||
|
||||
/** returns a common prefix length equal or lower than the current one */
|
||||
private int getRandomCommonPrefix(HeapPointWriter points, int start, int end, int bytesPerDimension, int sortDim) {
|
||||
int commonPrefixLength = bytesPerDimension;
|
||||
PointValue value = points.getPackedValueSlice(start);
|
||||
BytesRef bytesRef = value.packedValue();
|
||||
byte[] firstValue = new byte[bytesPerDimension];
|
||||
int offset = sortDim * bytesPerDimension;
|
||||
System.arraycopy(bytesRef.bytes, bytesRef.offset + offset, firstValue, 0, bytesPerDimension);
|
||||
for (int i = start + 1; i < end; i++) {
|
||||
value = points.getPackedValueSlice(i);
|
||||
bytesRef = value.packedValue();
|
||||
int diff = Arrays.mismatch(bytesRef.bytes, bytesRef.offset + offset, bytesRef.offset + offset + bytesPerDimension, firstValue, 0, bytesPerDimension);
|
||||
if (diff != -1 && commonPrefixLength > diff) {
|
||||
if (diff == 0) {
|
||||
return diff;
|
||||
}
|
||||
commonPrefixLength = diff;
|
||||
}
|
||||
}
|
||||
return (random().nextBoolean()) ? commonPrefixLength : random().nextInt(commonPrefixLength);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue