LUCENE-9932: Performance improvement for BKD index building (#91)

This commit is contained in:
neoReMinD 2021-05-14 15:33:43 +08:00 committed by GitHub
parent f215a55bc9
commit fd4b3c81d5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 434 additions and 59 deletions

View File

@ -368,6 +368,8 @@ Improvements
Optimizations
---------------------
* LUCENE-9932: Performance improvement for BKD index building (neoremind)
* LUCENE-9827: Speed up merging of stored fields and term vectors for smaller segments.
(Daniel Mitterdorfer, Dimitrios Liapis, Adrien Grand, Robert Muir)

View File

@ -41,4 +41,10 @@ public abstract class MutablePointValues extends PointValues {
/** Swap the i-th and j-th values. */
public abstract void swap(int i, int j);

/** Save the i-th value into the j-th position in temporary storage. */
public abstract void save(int i, int j);

/**
 * Restore values in positions {@code [i, j)} ({@code i} inclusive, {@code j} exclusive) from
 * temporary storage back into original storage.
 */
public abstract void restore(int i, int j);
}

View File

@ -89,6 +89,7 @@ class PointValuesWriter {
PointValues points =
new MutablePointValues() {
final int[] ords = new int[numPoints];
int[] temp;
{
for (int i = 0; i < numPoints; ++i) {
@ -172,6 +173,21 @@ class PointValuesWriter {
final long offset = (long) packedBytesLength * ords[i] + k;
return bytes.readByte(offset);
}
@Override
public void save(int i, int j) {
  // Lazily allocate the scratch ord buffer the first time a value is saved.
  int[] scratch = temp;
  if (scratch == null) {
    scratch = new int[ords.length];
    temp = scratch;
  }
  // Stash the ord of slot i into slot j of the scratch buffer.
  scratch[j] = ords[i];
}
@Override
public void restore(int i, int j) {
  // Nothing buffered: save() was never invoked, so there is nothing to move back.
  if (temp == null) {
    return;
  }
  // Copy the buffered ords for positions [i, j) back into place.
  System.arraycopy(temp, i, ords, i, j - i);
}
};
final PointValues values;
@ -291,5 +307,15 @@ class PointValuesWriter {
// Delegate swapping of the i-th and j-th values to the wrapped MutablePointValues.
public void swap(int i, int j) {
in.swap(i, j);
}

// Delegate saving the i-th value into the j-th temporary slot to the wrapped instance.
@Override
public void save(int i, int j) {
in.save(i, j);
}

// Delegate restoring positions [i, j) from temporary storage to the wrapped instance.
@Override
public void restore(int i, int j) {
in.restore(i, j);
}
}
}

View File

@ -33,7 +33,7 @@ public abstract class MSBRadixSorter extends Sorter {
// locality)
private static final int LEVEL_THRESHOLD = 8;
// size of histograms: 256 + 1 to indicate that the string is finished
private static final int HISTOGRAM_SIZE = 257;
protected static final int HISTOGRAM_SIZE = 257;
// buckets below this size will be sorted with introsort
private static final int LENGTH_THRESHOLD = 100;
@ -42,7 +42,7 @@ public abstract class MSBRadixSorter extends Sorter {
private final int[] endOffsets = new int[HISTOGRAM_SIZE];
private final int[] commonPrefix;
private final int maxLength;
protected final int maxLength;
/**
* Sole constructor.
@ -128,7 +128,7 @@ public abstract class MSBRadixSorter extends Sorter {
sort(from, to, 0, 0);
}
private void sort(int from, int to, int k, int l) {
protected void sort(int from, int to, int k, int l) {
if (to - from <= LENGTH_THRESHOLD || l >= LEVEL_THRESHOLD) {
introSort(from, to, k);
} else {
@ -202,7 +202,7 @@ public abstract class MSBRadixSorter extends Sorter {
}
/** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */
private int getBucket(int i, int k) {
protected int getBucket(int i, int k) {
return byteAt(i, k) + 1;
}
@ -284,7 +284,7 @@ public abstract class MSBRadixSorter extends Sorter {
* @param startOffsets start offsets per bucket
* @param endOffsets end offsets per bucket
*/
private void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) {
protected void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) {
// reorder in place, like the dutch flag problem
for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
final int limit = endOffsets[i];

View File

@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
/**
* Stable radix sorter for variable-length strings.
*
* @lucene.internal
*/
/**
 * Stable radix sorter for variable-length strings.
 *
 * <p>Unlike {@code MSBRadixSorter}, equal values keep their relative order: the in-place
 * Dutch-flag reorder is replaced by a counting-sort style pass that buffers elements through
 * {@link #save(int, int)} / {@link #restore(int, int)}, and the fallback sorter is a stable
 * {@link InPlaceMergeSorter}.
 *
 * @lucene.internal
 */
public abstract class StableMSBRadixSorter extends MSBRadixSorter {

  /** Snapshot of per-bucket start offsets, taken before reorder() consumes startOffsets. */
  private final int[] fixedStartOffsets;

  public StableMSBRadixSorter(int maxLength) {
    super(maxLength);
    fixedStartOffsets = new int[HISTOGRAM_SIZE];
  }

  /** Save the i-th value into the j-th position in temporary storage. */
  protected abstract void save(int i, int j);

  /**
   * Restore values in positions {@code [i, j)} ({@code j} exclusive) from temporary storage back
   * into original storage.
   */
  protected abstract void restore(int i, int j);

  @Override
  protected Sorter getFallbackSorter(int k) {
    // InPlaceMergeSorter is stable, which is required to preserve the order of equal values.
    return new InPlaceMergeSorter() {
      @Override
      protected void swap(int i, int j) {
        StableMSBRadixSorter.this.swap(i, j);
      }

      @Override
      protected int compare(int i, int j) {
        // Compare byte-by-byte starting at position k. byteAt returns -1 once a value is
        // exhausted, so a shorter value sorts before any longer value it is a prefix of.
        int pos = k;
        while (pos < maxLength) {
          final int left = byteAt(i, pos);
          final int right = byteAt(j, pos);
          if (left != right) {
            return left - right;
          }
          if (left == -1) {
            break; // both values ended at the same position: equal
          }
          ++pos;
        }
        return 0;
      }
    };
  }

  /**
   * Reorder elements in stable way, since Dutch sort does not guarantee ordering for same values.
   *
   * <p>When this method returns, startOffsets and endOffsets are equal.
   */
  @Override
  protected void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) {
    // Remember where each bucket starts before startOffsets is advanced below.
    System.arraycopy(startOffsets, 0, fixedStartOffsets, 0, startOffsets.length);
    for (int bucket = 0; bucket < HISTOGRAM_SIZE; ++bucket) {
      final int end = endOffsets[bucket];
      // Scanning the range left-to-right keeps equal values in their original relative order.
      for (int slot = fixedStartOffsets[bucket]; slot < end; ++slot) {
        final int dest = startOffsets[getBucket(from + slot, k)]++;
        save(from + slot, from + dest);
      }
    }
    // Move everything back from temporary storage in a single pass.
    restore(from, to);
  }
}

View File

@ -21,9 +21,9 @@ import org.apache.lucene.codecs.MutablePointValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntroSelector;
import org.apache.lucene.util.IntroSorter;
import org.apache.lucene.util.MSBRadixSorter;
import org.apache.lucene.util.RadixSelector;
import org.apache.lucene.util.Selector;
import org.apache.lucene.util.StableMSBRadixSorter;
import org.apache.lucene.util.packed.PackedInts;
/**
@ -38,14 +38,39 @@ public final class MutablePointsReaderUtils {
/** Sort the given {@link MutablePointValues} based on its packed value then doc ID. */
public static void sort(
BKDConfig config, int maxDoc, MutablePointValues reader, int from, int to) {
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
new MSBRadixSorter(config.packedBytesLength + (bitsPerDocId + 7) / 8) {
boolean sortedByDocID = true;
int prevDoc = 0;
for (int i = from; i < to; ++i) {
int doc = reader.getDocID(i);
if (doc < prevDoc) {
sortedByDocID = false;
break;
}
prevDoc = doc;
}
// No need to tie break on doc IDs if already sorted by doc ID, since we use a stable sort.
// This should be a common situation as IndexWriter accumulates data in doc ID order when
// index sorting is not enabled.
final int bitsPerDocId = sortedByDocID ? 0 : PackedInts.bitsRequired(maxDoc - 1);
new StableMSBRadixSorter(config.packedBytesLength + (bitsPerDocId + 7) / 8) {
@Override
protected void swap(int i, int j) {
// Delegate element movement to the reader, which owns the actual storage.
reader.swap(i, j);
}

@Override
protected void save(int i, int j) {
// Buffer the i-th value into slot j of the reader's temporary storage.
reader.save(i, j);
}

@Override
protected void restore(int i, int j) {
// Move buffered values for positions [i, j) back into the reader's primary storage.
reader.restore(i, j);
}
@Override
protected int byteAt(int i, int k) {
if (k < config.packedBytesLength) {
@ -55,46 +80,6 @@ public final class MutablePointsReaderUtils {
return (reader.getDocID(i) >>> Math.max(0, shift)) & 0xff;
}
}
@Override
protected org.apache.lucene.util.Sorter getFallbackSorter(int k) {
return new IntroSorter() {
final BytesRef pivot = new BytesRef();
final BytesRef scratch = new BytesRef();
int pivotDoc;
@Override
protected void swap(int i, int j) {
reader.swap(i, j);
}
@Override
protected void setPivot(int i) {
reader.getValue(i, pivot);
pivotDoc = reader.getDocID(i);
}
@Override
protected int comparePivot(int j) {
if (k < config.packedBytesLength) {
reader.getValue(j, scratch);
int cmp =
Arrays.compareUnsigned(
pivot.bytes,
pivot.offset + k,
pivot.offset + k + config.packedBytesLength - k,
scratch.bytes,
scratch.offset + k,
scratch.offset + k + config.packedBytesLength - k);
if (cmp != 0) {
return cmp;
}
}
return pivotDoc - reader.getDocID(j);
}
};
}
}.sort(from, to);
}

View File

@ -0,0 +1,211 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
/** Tests for {@code StableMSBRadixSorter}: verifies correct ordering and stability. */
public class TestStableMSBRadixSorter extends LuceneTestCase {

// Sorts the first len entries of refs with StableMSBRadixSorter and compares against
// Arrays#sort (stable for objects), including instance-level identity checks.
private void test(BytesRef[] refs, int len) {
BytesRef[] expected = ArrayUtil.copyOfSubArray(refs, 0, len);
Arrays.sort(expected);
// Compute the true maximum length of the values under test.
int maxLength = 0;
for (int i = 0; i < len; ++i) {
BytesRef ref = refs[i];
maxLength = Math.max(maxLength, ref.length);
}
// Randomly loosen the bound so the sorter is also exercised with a non-tight maxLength.
switch (random().nextInt(3)) {
case 0:
maxLength += TestUtil.nextInt(random(), 1, 5);
break;
case 1:
maxLength = Integer.MAX_VALUE;
break;
default:
// leave unchanged
break;
}
final int finalMaxLength = maxLength;
new StableMSBRadixSorter(maxLength) {
// Scratch buffer backing save()/restore(); allocated lazily on first save.
private BytesRef[] temp;

@Override
protected int byteAt(int i, int k) {
// The sorter must never read past the maxLength it was constructed with.
assertTrue(k < finalMaxLength);
BytesRef ref = refs[i];
if (ref.length <= k) {
// -1 signals end-of-string, matching the byteAt contract used by the sorter.
return -1;
}
return ref.bytes[ref.offset + k] & 0xff;
}

@Override
protected void swap(int i, int j) {
BytesRef tmp = refs[i];
refs[i] = refs[j];
refs[j] = tmp;
}

@Override
protected void save(int i, int j) {
if (temp == null) {
temp = new BytesRef[refs.length];
}
temp[j] = refs[i];
}

@Override
protected void restore(int i, int j) {
if (temp != null) {
// Copy positions [i, j) from the scratch buffer back into refs.
System.arraycopy(temp, i, refs, i, j - i);
}
}
}.sort(0, len);
BytesRef[] actual = ArrayUtil.copyOfSubArray(refs, 0, len);
assertArrayEquals(expected, actual);
// Verify that the arrays are not only equal after sorting with Arrays#sort and
// StableMSBRadixSorter
// but also that they have the very same instance at every index.
// This is different from MSBRadixSorter which does not guarantee ordering of the same value.
assertEquals(expected.length, actual.length);
for (int i = 0; i < expected.length; i++) {
assertSame(expected[i].bytes, actual[i].bytes);
}
}

public void testEmpty() {
// Array may have up to 4 (ignored) slots, but len is 0: nothing is sorted.
test(new BytesRef[random().nextInt(5)], 0);
}

public void testOneValue() {
BytesRef bytes = new BytesRef(TestUtil.randomSimpleString(random()));
test(new BytesRef[] {bytes}, 1);
}

public void testTwoValues() {
BytesRef bytes1 = new BytesRef(TestUtil.randomSimpleString(random()));
BytesRef bytes2 = new BytesRef(TestUtil.randomSimpleString(random()));
test(new BytesRef[] {bytes1, bytes2}, 2);
}

// Builds random values sharing a common prefix of commonPrefixLen bytes with up to
// (maxLen - 1) random suffix bytes. Trailing array slots beyond len stay null on purpose:
// only the first len entries participate in the sort.
private void testRandom(int commonPrefixLen, int maxLen) {
byte[] commonPrefix = new byte[commonPrefixLen];
random().nextBytes(commonPrefix);
final int len = random().nextInt(100000);
BytesRef[] bytes = new BytesRef[len + random().nextInt(50)];
for (int i = 0; i < len; ++i) {
byte[] b = new byte[commonPrefixLen + random().nextInt(maxLen)];
random().nextBytes(b);
System.arraycopy(commonPrefix, 0, b, 0, commonPrefixLen);
bytes[i] = new BytesRef(b);
}
test(bytes, len);
}

public void testRandom() {
for (int iter = 0; iter < 10; ++iter) {
testRandom(0, 10);
}
}

public void testRandomWithLotsOfDuplicates() {
// Very short values (0 or 1 suffix bytes) yield many duplicates, stressing stability.
for (int iter = 0; iter < 10; ++iter) {
testRandom(0, 2);
}
}

public void testRandomWithSharedPrefix() {
for (int iter = 0; iter < 10; ++iter) {
testRandom(TestUtil.nextInt(random(), 1, 30), 10);
}
}

public void testRandomWithSharedPrefixAndLotsOfDuplicates() {
for (int iter = 0; iter < 10; ++iter) {
testRandom(TestUtil.nextInt(random(), 1, 30), 2);
}
}

// Builds strings by concatenating 1-5 fragments drawn from a small randomly-biased fragment
// alphabet, which produces long shared prefixes and duplicate values.
public void testRandom2() {
// how large our alphabet is
int letterCount = TestUtil.nextInt(random(), 2, 10);
// how many substring fragments to use
int substringCount = TestUtil.nextInt(random(), 2, 10);
Set<BytesRef> substringsSet = new HashSet<>();
// how many strings to make
int stringCount = atLeast(10000);
// System.out.println("letterCount=" + letterCount + " substringCount=" + substringCount + "
// stringCount=" + stringCount);
while (substringsSet.size() < substringCount) {
int length = TestUtil.nextInt(random(), 2, 10);
byte[] bytes = new byte[length];
for (int i = 0; i < length; i++) {
bytes[i] = (byte) random().nextInt(letterCount);
}
BytesRef br = new BytesRef(bytes);
substringsSet.add(br);
// System.out.println("add substring count=" + substringsSet.size() + ": " + br);
}
BytesRef[] substrings = substringsSet.toArray(new BytesRef[substringsSet.size()]);
double[] chance = new double[substrings.length];
double sum = 0.0;
for (int i = 0; i < substrings.length; i++) {
chance[i] = random().nextDouble();
sum += chance[i];
}
// give each substring a random chance of occurring:
double accum = 0.0;
for (int i = 0; i < substrings.length; i++) {
accum += chance[i] / sum;
chance[i] = accum;
}
Set<BytesRef> stringsSet = new HashSet<>();
int iters = 0;
// Cap iterations: duplicates collapse in the set, so stringCount may never be reached.
while (stringsSet.size() < stringCount && iters < stringCount * 5) {
int count = TestUtil.nextInt(random(), 1, 5);
BytesRefBuilder b = new BytesRefBuilder();
for (int i = 0; i < count; i++) {
double v = random().nextDouble();
accum = 0.0;
for (int j = 0; j < substrings.length; j++) {
accum += chance[j];
if (accum >= v) {
b.append(substrings[j]);
break;
}
}
}
BytesRef br = b.toBytesRef();
stringsSet.add(br);
// System.out.println("add string count=" + stringsSet.size() + ": " + br);
iters++;
}
test(stringsSet.toArray(new BytesRef[stringsSet.size()]), stringsSet.size());
}
}

View File

@ -1696,6 +1696,18 @@ public class TestBKD extends LuceneTestCase {
@Override
public byte getByteAt(int i, int k) {
// NOTE: allocates a fresh BytesRef per call; acceptable for this test stub.
BytesRef b = new BytesRef();
getValue(i, b);
return b.bytes[b.offset + k];
}

// The stable-sort hooks below are intentionally unsupported in this stub: calling
// them fails loudly instead of silently misbehaving.
@Override
public void save(int i, int j) {
throw new UnsupportedOperationException();
}

@Override
public void restore(int i, int j) {
throw new UnsupportedOperationException();
}
};
@ -1839,6 +1851,16 @@ public class TestBKD extends LuceneTestCase {
public int getDocCount() {
return 11;
}
// The stable-sort hooks are intentionally unsupported in this stub: calling them
// fails loudly instead of silently misbehaving.
@Override
public void save(int i, int j) {
throw new UnsupportedOperationException();
}

@Override
public void restore(int i, int j) {
throw new UnsupportedOperationException();
}
};
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
IllegalStateException ex =

View File

@ -28,16 +28,22 @@ import org.apache.lucene.util.TestUtil;
public class TestMutablePointsReaderUtils extends LuceneTestCase {
public void testSort() {
for (int iter = 0; iter < 5; ++iter) {
doTestSort();
for (int iter = 0; iter < 10; ++iter) {
doTestSort(false);
}
}
private void doTestSort() {
public void testSortWithIncrementalDocId() {
for (int iter = 0; iter < 10; ++iter) {
doTestSort(true);
}
}
private void doTestSort(boolean isDocIdIncremental) {
final int bytesPerDim = TestUtil.nextInt(random(), 1, 16);
final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30));
BKDConfig config = new BKDConfig(1, 1, bytesPerDim, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
Point[] points = createRandomPoints(config, maxDoc, new int[1]);
Point[] points = createRandomPoints(config, maxDoc, new int[1], isDocIdIncremental);
DummyPointsReader reader = new DummyPointsReader(points);
MutablePointsReaderUtils.sort(config, maxDoc, reader, 0, points.length);
Arrays.sort(
@ -53,7 +59,23 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
}
});
assertNotSame(points, reader.points);
assertArrayEquals(points, reader.points);
assertEquals(points.length, reader.points.length);
// Check doc IDs are in ascending order.
// If doc IDs are already increasing, StableMSBRadixSorter should keep doc ID's ordering.
// If doc IDs are not ordered, StableMSBRadixSorter should compare doc ID to guarantee the
// ordering.
Point prevPoint = null;
for (int i = 0; i < points.length; i++) {
assertEquals(points[i].packedValue, reader.points[i].packedValue);
assertSame(points[i].packedValue, reader.points[i].packedValue);
if (prevPoint != null) {
if (reader.points[i].packedValue.equals(prevPoint.packedValue)) {
assertTrue(reader.points[i].doc >= prevPoint.doc);
}
}
prevPoint = reader.points[i];
}
}
public void testSortByDim() {
@ -66,7 +88,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
BKDConfig config = createRandomConfig();
final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30));
int[] commonPrefixLengths = new int[config.numDims];
Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths);
Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths, false);
DummyPointsReader reader = new DummyPointsReader(points);
final int sortedDim = random().nextInt(config.numIndexDims);
MutablePointsReaderUtils.sortByDim(
@ -119,7 +141,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
BKDConfig config = createRandomConfig();
int[] commonPrefixLengths = new int[config.numDims];
final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30));
Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths);
Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths, false);
final int splitDim = random().nextInt(config.numIndexDims);
DummyPointsReader reader = new DummyPointsReader(points);
final int pivot = TestUtil.nextInt(random(), 0, points.length - 1);
@ -180,15 +202,17 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
}
private static Point[] createRandomPoints(
BKDConfig config, int maxDoc, int[] commonPrefixLengths) {
BKDConfig config, int maxDoc, int[] commonPrefixLengths, boolean isDocIdIncremental) {
assertTrue(commonPrefixLengths.length == config.numDims);
final int numPoints = TestUtil.nextInt(random(), 1, 100000);
Point[] points = new Point[numPoints];
if (random().nextInt(5) != 0) {
if (random().nextInt(10) != 0) {
for (int i = 0; i < numPoints; ++i) {
byte[] value = new byte[config.packedBytesLength];
random().nextBytes(value);
points[i] = new Point(value, random().nextInt(maxDoc));
points[i] =
new Point(
value, isDocIdIncremental ? Math.min(i, maxDoc - 1) : random().nextInt(maxDoc));
}
for (int i = 0; i < config.numDims; ++i) {
commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, config.bytesPerDim);
@ -218,7 +242,8 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
random().nextBytes(dataDims);
System.arraycopy(
dataDims, 0, value, config.packedIndexBytesLength, numDataDims * config.bytesPerDim);
points[i] = new Point(value, random().nextInt(maxDoc));
points[i] =
new Point(value, isDocIdIncremental ? Math.min(i, maxDoc - 1) : random().nextInt(maxDoc));
}
for (int i = 0; i < config.numIndexDims; ++i) {
commonPrefixLengths[i] = config.bytesPerDim;
@ -281,6 +306,8 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
private final Point[] points;
private Point[] temp;
DummyPointsReader(Point[] points) {
this.points = points.clone();
}
@ -352,5 +379,20 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
public int getDocCount() {
throw new UnsupportedOperationException();
}
@Override
public void save(int i, int j) {
// Lazily allocate the scratch buffer backing the save/restore contract.
if (temp == null) {
temp = new Point[points.length];
}
temp[j] = points[i];
}

@Override
public void restore(int i, int j) {
// No-op when save() was never called (nothing buffered).
if (temp != null) {
// Copy positions [i, j) from the scratch buffer back into points.
System.arraycopy(temp, i, points, i, j - i);
}
}
}
}