diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index bbe2ccc2716..6652c7c0cad 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -121,7 +121,9 @@ Other * LUCENE-8778: Define analyzer SPI names as static final fields and document the names in Javadocs. (Tomoko Uchida, Uwe Schindler) -* LUCENE-8838: Remove support for Steiner points on Tessellator. (Ignacio Vera) +* LUCENE-8838: Remove support for Steiner points on Tessellator. (Ignacio Vera) + +* LUCENE-8879: Improve BKDRadixSelector tests. (Ignacio Vera) ======================= Lucene 8.1.1 ======================= (No Changes) diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java index c2908258d51..456667339a7 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java @@ -49,7 +49,7 @@ public class TestBKDRadixSelector extends LuceneTestCase { points.append(value, 3); points.close(); PointWriter copy = copyPoints(dir,points, packedLength); - verify(dir, copy, dimensions, 0, values, middle, packedLength, bytesPerDimensions, 0); + verify(dir, copy, dimensions, dimensions, 0, values, middle, packedLength, bytesPerDimensions, 0); dir.close(); } @@ -80,9 +80,10 @@ public class TestBKDRadixSelector extends LuceneTestCase { } int partitionPoint = TestUtil.nextInt(random(), start + 1, end - 1); int sortedOnHeap = random().nextInt(5000); - int dimensions = TestUtil.nextInt(random(), 1, 8); + int indexDimensions = TestUtil.nextInt(random(), 1, 8); + int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8); int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30); - int packedLength = dimensions * bytesPerDimensions; + int packedLength = dataDimensions * bytesPerDimensions; PointWriter points = getRandomPointWriter(dir, values, packedLength); byte[] value = new byte[packedLength]; for (int i =0; i < values; i++) { @@ -90,7 +91,7 @@ public class TestBKDRadixSelector extends LuceneTestCase { points.append(value, i); } points.close(); - verify(dir, points, dimensions, start, end, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap); + verify(dir, points, dataDimensions, indexDimensions, start, end, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap); dir.close(); } @@ -113,7 +114,7 @@ public class TestBKDRadixSelector extends LuceneTestCase { } } points.close(); - verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap); + verify(dir, points, dimensions, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap); dir.close(); } @@ -122,9 +123,10 @@ public class TestBKDRadixSelector extends LuceneTestCase { Directory dir = getDirectory(values); int partitionPoint = random().nextInt(values); int sortedOnHeap = random().nextInt(5000); - int dimensions = TestUtil.nextInt(random(), 1, 8); + int indexDimensions = TestUtil.nextInt(random(), 1, 8); + int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8); int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30); - int packedLength = dimensions * bytesPerDimensions; + int packedLength = dataDimensions * bytesPerDimensions; PointWriter points = getRandomPointWriter(dir, values, packedLength); byte[] value = new byte[packedLength]; random().nextBytes(value); @@ -136,7 +138,7 @@ public class TestBKDRadixSelector extends LuceneTestCase { } } points.close(); - verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap); + verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap); dir.close(); } @@ -145,9 +147,10 @@ public class TestBKDRadixSelector extends LuceneTestCase { Directory dir = getDirectory(values); int partitionPoint = random().nextInt(values); int sortedOnHeap = random().nextInt(5000); - int dimensions = TestUtil.nextInt(random(), 1, 8); + int indexDimensions = TestUtil.nextInt(random(), 1, 8); + int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8); int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30); - int packedLength = dimensions * bytesPerDimensions; + int packedLength = dataDimensions * bytesPerDimensions; PointWriter points = getRandomPointWriter(dir, values, packedLength); byte[] value = new byte[packedLength]; random().nextBytes(value); @@ -155,7 +158,7 @@ public class TestBKDRadixSelector extends LuceneTestCase { points.append(value, 0); } points.close(); - verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap); + verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap); dir.close(); } @@ -164,9 +167,10 @@ public class TestBKDRadixSelector extends LuceneTestCase { Directory dir = getDirectory(values); int partitionPoint = random().nextInt(values); int sortedOnHeap = random().nextInt(5000); - int dimensions = TestUtil.nextInt(random(), 1, 8); + int indexDimensions = TestUtil.nextInt(random(), 1, 8); + int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8); int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30); - int packedLength = dimensions * bytesPerDimensions; + int packedLength = dataDimensions * bytesPerDimensions; PointWriter points = getRandomPointWriter(dir, values, packedLength); int numberValues = random().nextInt(8) + 2; byte[][] differentValues = new byte[numberValues][packedLength]; @@ -177,14 +181,37 @@ public class TestBKDRadixSelector extends LuceneTestCase { points.append(differentValues[random().nextInt(numberValues)], i); } points.close(); - verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap); + verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap); dir.close(); } - private void verify(Directory dir, PointWriter points, int dimensions, long start, long end, long middle, int packedLength, int bytesPerDimensions, int sortedOnHeap) throws IOException{ - BKDRadixSelector radixSelector = new BKDRadixSelector(dimensions, bytesPerDimensions, sortedOnHeap, dir, "test"); - //we check for each dimension - for (int splitDim =0; splitDim < dimensions; splitDim++) { + public void testRandomDataDimDiffValues() throws IOException { + int values = atLeast(15000); + Directory dir = getDirectory(values); + int partitionPoint = random().nextInt(values); + int sortedOnHeap = random().nextInt(5000); + int indexDimensions = TestUtil.nextInt(random(), 1, 8); + int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8); + int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30); + int packedLength = dataDimensions * bytesPerDimensions; + PointWriter points = getRandomPointWriter(dir, values, packedLength); + byte[] value = new byte[packedLength]; + byte[] dataValue = new byte[(dataDimensions - indexDimensions) * bytesPerDimensions]; + random().nextBytes(value); + for (int i =0; i < values; i++) { + random().nextBytes(dataValue); + System.arraycopy(dataValue, 0, value, indexDimensions * bytesPerDimensions, (dataDimensions - indexDimensions) * bytesPerDimensions); + points.append(value, i); + } + points.close(); + verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap); + dir.close(); + } + + private void verify(Directory dir, PointWriter points, int dataDimensions, int indexDimensions, long start, long end, long middle, int packedLength, int bytesPerDimensions, int sortedOnHeap) throws IOException{ + BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDimensions, sortedOnHeap, dir, "test"); + //we only split by indexed dimension so we check for each only those dimension + for (int splitDim = 0; splitDim < indexDimensions; splitDim++) { //We need to make a copy of the data as it is deleted in the process BKDRadixSelector.PathSlice inputSlice = new BKDRadixSelector.PathSlice(copyPoints(dir, points, packedLength), 0, points.count()); int commonPrefixLengthInput = getRandomCommonPrefix(inputSlice, bytesPerDimensions, splitDim); @@ -209,7 +236,6 @@ public class TestBKDRadixSelector extends LuceneTestCase { points.destroy(); } - private PointWriter copyPoints(Directory dir, PointWriter points, int packedLength) throws IOException { try (PointWriter copy = getRandomPointWriter(dir, points.count(), packedLength); PointReader reader = points.getReader(0, points.count())) { @@ -319,5 +345,4 @@ public class TestBKDRadixSelector extends LuceneTestCase { } return docID; } - } diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java new file mode 100644 index 00000000000..a7cebd024af --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.util.bkd; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +public class TestBKDRadixSort extends LuceneTestCase { + + public void testRandom() throws IOException { + int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE); + int indexDimensions = TestUtil.nextInt(random(), 1, 8); + int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8); + int bytesPerDim = TestUtil.nextInt(random(), 2, 30); + int packedBytesLength = dataDimensions * bytesPerDim; + HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength); + byte[] value = new byte[packedBytesLength]; + for (int i = 0; i < numPoints; i++) { + random().nextBytes(value); + points.append(value, i); + } + verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim); + } + + public void testRandomAllEquals() throws IOException { + int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE); + int indexDimensions = TestUtil.nextInt(random(), 1, 8); + int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8); + int bytesPerDim = TestUtil.nextInt(random(), 2, 30); + int packedBytesLength = dataDimensions * bytesPerDim; + HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength); + byte[] value = new byte[packedBytesLength]; + random().nextBytes(value); + for (int i = 0; i < numPoints; i++) { + points.append(value, random().nextInt(numPoints)); + } + verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim); + } + + public void testRandomLastByteTwoValues() throws IOException { + int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE); + int indexDimensions = TestUtil.nextInt(random(), 1, 8); + int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8); + int bytesPerDim = TestUtil.nextInt(random(), 2, 30); + int packedBytesLength = dataDimensions * bytesPerDim; + HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength); + byte[] value = new byte[packedBytesLength]; + random().nextBytes(value); + for (int i = 0; i < numPoints; i++) { + if (random().nextBoolean()) { + points.append(value, 1); + } else { + points.append(value, 2); + } + } + verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim); + } + + public void testRandomFewDifferentValues() throws IOException { + int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE); + int indexDimensions = TestUtil.nextInt(random(), 1, 8); + int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8); + int bytesPerDim = TestUtil.nextInt(random(), 2, 30); + int packedBytesLength = dataDimensions * bytesPerDim; + HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength); + int numberValues = random().nextInt(8) + 2; + byte[][] differentValues = new byte[numberValues][packedBytesLength]; + for (int i = 0; i < numberValues; i++) { + random().nextBytes(differentValues[i]); + } + for (int i = 0; i < numPoints; i++) { + points.append(differentValues[random().nextInt(numberValues)], i); + } + verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim); + } + + public void testRandomDataDimDifferent() throws IOException { + int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE); + int indexDimensions = TestUtil.nextInt(random(), 1, 8); + int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8); + int bytesPerDim = TestUtil.nextInt(random(), 2, 30); + int packedBytesLength = dataDimensions * bytesPerDim; + HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength); + byte[] value = new byte[packedBytesLength]; + int totalDataDimension = dataDimensions - indexDimensions; + byte[] dataDimensionValues = new byte[totalDataDimension * bytesPerDim]; + random().nextBytes(value); + for (int i = 0; i < numPoints; i++) { + random().nextBytes(dataDimensionValues); + System.arraycopy(dataDimensionValues, 0, value, indexDimensions * bytesPerDim, totalDataDimension * bytesPerDim); + points.append(value, random().nextInt(numPoints)); + } + verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim); + } + + private void verifySort(HeapPointWriter points, int dataDimensions, int indexDimensions, int start, int end, int bytesPerDim) throws IOException{ + int packedBytesLength = dataDimensions * bytesPerDim; + Directory dir = newDirectory(); + BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDim, 1000, dir, "test"); + // we check for each dimension + for (int splitDim = 0; splitDim < dataDimensions; splitDim++) { + radixSelector.heapRadixSort(points, start, end, splitDim, getRandomCommonPrefix(points, start, end, bytesPerDim, splitDim)); + byte[] previous = new byte[bytesPerDim * dataDimensions]; + int previousDocId = -1; + Arrays.fill(previous, (byte) 0); + int dimOffset = splitDim * bytesPerDim; + for (int j = start; j < end; j++) { + PointValue pointValue = points.getPackedValueSlice(j); + BytesRef value = pointValue.packedValue(); + int cmp = Arrays.compareUnsigned(value.bytes, value.offset + dimOffset, value.offset + dimOffset + bytesPerDim, previous, dimOffset, dimOffset + bytesPerDim); + assertTrue(cmp >= 0); + if (cmp == 0) { + assertTrue(pointValue.docID() >= previousDocId); + } + System.arraycopy(value.bytes, value.offset, previous, 0, packedBytesLength); + previousDocId = pointValue.docID(); + } + } + dir.close(); + } + + /** returns a common prefix length equal or lower than the current one */ + private int getRandomCommonPrefix(HeapPointWriter points, int start, int end, int bytesPerDimension, int sortDim) { + int commonPrefixLength = bytesPerDimension; + PointValue value = points.getPackedValueSlice(start); + BytesRef bytesRef = value.packedValue(); + byte[] firstValue = new byte[bytesPerDimension]; + int offset = sortDim * bytesPerDimension; + System.arraycopy(bytesRef.bytes, bytesRef.offset + offset, firstValue, 0, bytesPerDimension); + for (int i = start + 1; i < end; i++) { + value = points.getPackedValueSlice(i); + bytesRef = value.packedValue(); + int diff = Arrays.mismatch(bytesRef.bytes, bytesRef.offset + offset, bytesRef.offset + offset + bytesPerDimension, firstValue, 0, bytesPerDimension); + if (diff != -1 && commonPrefixLength > diff) { + if (diff == 0) { + return diff; + } + commonPrefixLength = diff; + } + } + return (random().nextBoolean()) ? commonPrefixLength : random().nextInt(commonPrefixLength); + } +}