LUCENE-8879: Improve BKDRadixSelector tests

This change adds explicit test for the sorting capabilities.
This commit is contained in:
Ignacio Vera 2019-06-26 09:45:44 +02:00 committed by GitHub
parent 5bf023cf19
commit 36eaf75b1f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 211 additions and 21 deletions

View File

@ -147,6 +147,8 @@ Other
* LUCENE-8838: Remove support for Steiner points on Tessellator. (Ignacio Vera)
* LUCENE-8879: Improve BKDRadixSelector tests. (Ignacio Vera)
======================= Lucene 8.1.1 =======================
(No Changes)

View File

@ -48,7 +48,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
points.append(value, 3);
points.close();
PointWriter copy = copyPoints(dir,points, packedLength);
verify(dir, copy, dimensions, 0, values, middle, packedLength, bytesPerDimensions, 0);
verify(dir, copy, dimensions, dimensions, 0, values, middle, packedLength, bytesPerDimensions, 0);
dir.close();
}
@ -79,9 +79,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
}
int partitionPoint = TestUtil.nextInt(random(), start + 1, end - 1);
int sortedOnHeap = random().nextInt(5000);
int dimensions = TestUtil.nextInt(random(), 1, 8);
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
int packedLength = dimensions * bytesPerDimensions;
int packedLength = dataDimensions * bytesPerDimensions;
PointWriter points = getRandomPointWriter(dir, values, packedLength);
byte[] value = new byte[packedLength];
for (int i =0; i < values; i++) {
@ -89,7 +90,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
points.append(value, i);
}
points.close();
verify(dir, points, dimensions, start, end, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
verify(dir, points, dataDimensions, indexDimensions, start, end, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
dir.close();
}
@ -112,7 +113,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
}
}
points.close();
verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
verify(dir, points, dimensions, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
dir.close();
}
@ -121,9 +122,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
Directory dir = getDirectory(values);
int partitionPoint = random().nextInt(values);
int sortedOnHeap = random().nextInt(5000);
int dimensions = TestUtil.nextInt(random(), 1, 8);
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
int packedLength = dimensions * bytesPerDimensions;
int packedLength = dataDimensions * bytesPerDimensions;
PointWriter points = getRandomPointWriter(dir, values, packedLength);
byte[] value = new byte[packedLength];
random().nextBytes(value);
@ -135,7 +137,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
}
}
points.close();
verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
dir.close();
}
@ -144,9 +146,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
Directory dir = getDirectory(values);
int partitionPoint = random().nextInt(values);
int sortedOnHeap = random().nextInt(5000);
int dimensions = TestUtil.nextInt(random(), 1, 8);
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
int packedLength = dimensions * bytesPerDimensions;
int packedLength = dataDimensions * bytesPerDimensions;
PointWriter points = getRandomPointWriter(dir, values, packedLength);
byte[] value = new byte[packedLength];
random().nextBytes(value);
@ -154,7 +157,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
points.append(value, 0);
}
points.close();
verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
dir.close();
}
@ -163,9 +166,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
Directory dir = getDirectory(values);
int partitionPoint = random().nextInt(values);
int sortedOnHeap = random().nextInt(5000);
int dimensions = TestUtil.nextInt(random(), 1, 8);
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
int packedLength = dimensions * bytesPerDimensions;
int packedLength = dataDimensions * bytesPerDimensions;
PointWriter points = getRandomPointWriter(dir, values, packedLength);
int numberValues = random().nextInt(8) + 2;
byte[][] differentValues = new byte[numberValues][packedLength];
@ -176,14 +180,37 @@ public class TestBKDRadixSelector extends LuceneTestCase {
points.append(differentValues[random().nextInt(numberValues)], i);
}
points.close();
verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
dir.close();
}
private void verify(Directory dir, PointWriter points, int dimensions, long start, long end, long middle, int packedLength, int bytesPerDimensions, int sortedOnHeap) throws IOException{
BKDRadixSelector radixSelector = new BKDRadixSelector(dimensions, bytesPerDimensions, sortedOnHeap, dir, "test");
//we check for each dimension
for (int splitDim =0; splitDim < dimensions; splitDim++) {
public void testRandomDataDimDiffValues() throws IOException {
int values = atLeast(15000);
Directory dir = getDirectory(values);
int partitionPoint = random().nextInt(values);
int sortedOnHeap = random().nextInt(5000);
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
int packedLength = dataDimensions * bytesPerDimensions;
PointWriter points = getRandomPointWriter(dir, values, packedLength);
byte[] value = new byte[packedLength];
byte[] dataValue = new byte[(dataDimensions - indexDimensions) * bytesPerDimensions];
random().nextBytes(value);
for (int i =0; i < values; i++) {
random().nextBytes(dataValue);
System.arraycopy(dataValue, 0, value, indexDimensions * bytesPerDimensions, (dataDimensions - indexDimensions) * bytesPerDimensions);
points.append(value, i);
}
points.close();
verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
dir.close();
}
private void verify(Directory dir, PointWriter points, int dataDimensions, int indexDimensions, long start, long end, long middle, int packedLength, int bytesPerDimensions, int sortedOnHeap) throws IOException{
BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDimensions, sortedOnHeap, dir, "test");
//we only split by indexed dimension so we check for each only those dimension
for (int splitDim = 0; splitDim < indexDimensions; splitDim++) {
//We need to make a copy of the data as it is deleted in the process
BKDRadixSelector.PathSlice inputSlice = new BKDRadixSelector.PathSlice(copyPoints(dir, points, packedLength), 0, points.count());
int commonPrefixLengthInput = getRandomCommonPrefix(inputSlice, bytesPerDimensions, splitDim);
@ -208,7 +235,6 @@ public class TestBKDRadixSelector extends LuceneTestCase {
points.destroy();
}
private PointWriter copyPoints(Directory dir, PointWriter points, int packedLength) throws IOException {
try (PointWriter copy = getRandomPointWriter(dir, points.count(), packedLength);
PointReader reader = points.getReader(0, points.count())) {
@ -318,5 +344,4 @@ public class TestBKDRadixSelector extends LuceneTestCase {
}
return docID;
}
}

View File

@ -0,0 +1,163 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.bkd;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestBKDRadixSort extends LuceneTestCase {
public void testRandom() throws IOException {
int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
int packedBytesLength = dataDimensions * bytesPerDim;
HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
byte[] value = new byte[packedBytesLength];
for (int i = 0; i < numPoints; i++) {
random().nextBytes(value);
points.append(value, i);
}
verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
}
public void testRandomAllEquals() throws IOException {
int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
int packedBytesLength = dataDimensions * bytesPerDim;
HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
byte[] value = new byte[packedBytesLength];
random().nextBytes(value);
for (int i = 0; i < numPoints; i++) {
points.append(value, random().nextInt(numPoints));
}
verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
}
public void testRandomLastByteTwoValues() throws IOException {
int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
int packedBytesLength = dataDimensions * bytesPerDim;
HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
byte[] value = new byte[packedBytesLength];
random().nextBytes(value);
for (int i = 0; i < numPoints; i++) {
if (random().nextBoolean()) {
points.append(value, 1);
} else {
points.append(value, 2);
}
}
verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
}
public void testRandomFewDifferentValues() throws IOException {
int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
int packedBytesLength = dataDimensions * bytesPerDim;
HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
int numberValues = random().nextInt(8) + 2;
byte[][] differentValues = new byte[numberValues][packedBytesLength];
for (int i = 0; i < numberValues; i++) {
random().nextBytes(differentValues[i]);
}
for (int i = 0; i < numPoints; i++) {
points.append(differentValues[random().nextInt(numberValues)], i);
}
verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
}
public void testRandomDataDimDifferent() throws IOException {
int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
int indexDimensions = TestUtil.nextInt(random(), 1, 8);
int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
int packedBytesLength = dataDimensions * bytesPerDim;
HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
byte[] value = new byte[packedBytesLength];
int totalDataDimension = dataDimensions - indexDimensions;
byte[] dataDimensionValues = new byte[totalDataDimension * bytesPerDim];
random().nextBytes(value);
for (int i = 0; i < numPoints; i++) {
random().nextBytes(dataDimensionValues);
System.arraycopy(dataDimensionValues, 0, value, indexDimensions * bytesPerDim, totalDataDimension * bytesPerDim);
points.append(value, random().nextInt(numPoints));
}
verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
}
private void verifySort(HeapPointWriter points, int dataDimensions, int indexDimensions, int start, int end, int bytesPerDim) throws IOException{
int packedBytesLength = dataDimensions * bytesPerDim;
Directory dir = newDirectory();
BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDim, 1000, dir, "test");
// we check for each dimension
for (int splitDim = 0; splitDim < dataDimensions; splitDim++) {
radixSelector.heapRadixSort(points, start, end, splitDim, getRandomCommonPrefix(points, start, end, bytesPerDim, splitDim));
byte[] previous = new byte[bytesPerDim * dataDimensions];
int previousDocId = -1;
Arrays.fill(previous, (byte) 0);
int dimOffset = splitDim * bytesPerDim;
for (int j = start; j < end; j++) {
PointValue pointValue = points.getPackedValueSlice(j);
BytesRef value = pointValue.packedValue();
int cmp = Arrays.compareUnsigned(value.bytes, value.offset + dimOffset, value.offset + dimOffset + bytesPerDim, previous, dimOffset, dimOffset + bytesPerDim);
assertTrue(cmp >= 0);
if (cmp == 0) {
assertTrue(pointValue.docID() >= previousDocId);
}
System.arraycopy(value.bytes, value.offset, previous, 0, packedBytesLength);
previousDocId = pointValue.docID();
}
}
dir.close();
}
/** returns a common prefix length equal or lower than the current one */
private int getRandomCommonPrefix(HeapPointWriter points, int start, int end, int bytesPerDimension, int sortDim) {
int commonPrefixLength = bytesPerDimension;
PointValue value = points.getPackedValueSlice(start);
BytesRef bytesRef = value.packedValue();
byte[] firstValue = new byte[bytesPerDimension];
int offset = sortDim * bytesPerDimension;
System.arraycopy(bytesRef.bytes, bytesRef.offset + offset, firstValue, 0, bytesPerDimension);
for (int i = start + 1; i < end; i++) {
value = points.getPackedValueSlice(i);
bytesRef = value.packedValue();
int diff = Arrays.mismatch(bytesRef.bytes, bytesRef.offset + offset, bytesRef.offset + offset + bytesPerDimension, firstValue, 0, bytesPerDimension);
if (diff != -1 && commonPrefixLength > diff) {
if (diff == 0) {
return diff;
}
commonPrefixLength = diff;
}
}
return (random().nextBoolean()) ? commonPrefixLength : random().nextInt(commonPrefixLength);
}
}