Make BP work on indexes that have blocks. (#13125)

The current logic for reordering splits a slice of doc IDs into a left side
and a right side, and for each document it computes the expected gain of
moving it to the other side. It then swaps pairs of documents across the two
sides as long as the combined gain of moving the left doc to the right and
the right doc to the left is positive.
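
One way such a swap pass could look, sketched with hypothetical names
(SwapSketch, swapIteration) and a stand-in gain function, not Lucene's actual
implementation:

import java.util.Arrays;
import java.util.Comparator;
import java.util.function.IntToDoubleFunction;

class SwapSketch {

  // One iteration of the swap-based approach. `gain` is a stand-in for the
  // expected gain of moving a doc to the other side; in practice it would be
  // derived from posting lists. Each side is visited in descending gain
  // order and pairs are swapped while the combined gain stays positive.
  static void swapIteration(int[] docIDs, int mid, IntToDoubleFunction gain) {
    Integer[] left = new Integer[mid];
    Integer[] right = new Integer[docIDs.length - mid];
    for (int i = 0; i < mid; ++i) {
      left[i] = docIDs[i];
    }
    for (int i = mid; i < docIDs.length; ++i) {
      right[i - mid] = docIDs[i];
    }
    Comparator<Integer> byGainDesc =
        Comparator.comparingDouble((Integer d) -> gain.applyAsDouble(d)).reversed();
    Arrays.sort(left, byGainDesc);
    Arrays.sort(right, byGainDesc);
    for (int i = 0; i < Math.min(left.length, right.length); ++i) {
      if (gain.applyAsDouble(left[i]) + gain.applyAsDouble(right[i]) <= 0) {
        break; // no more profitable swaps
      }
      Integer tmp = left[i];
      left[i] = right[i];
      right[i] = tmp;
    }
    for (int i = 0; i < mid; ++i) {
      docIDs[i] = left[i];
    }
    for (int i = mid; i < docIDs.length; ++i) {
      docIDs[i] = right[i - mid];
    }
  }
}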

This works well, but I would like to extend BP reordering to also work with
blocks, and the swapping logic is challenging to adapt because two parent
documents may have different numbers of children.

One of the follow-up papers on BP suggests different logic: compute a bias
for every document that is negative when the document is attracted to the
left side and positive otherwise. Then doc IDs only need to be partitioned
around the midpoint, e.g. with quickselect.
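
Sketched below with assumed names; the change below selects around the
midpoint with IntroSelector, while this sketch uses a full sort for brevity:

import java.util.Arrays;
import java.util.Comparator;

class BiasPartitionSketch {

  // Negative bias: attracted to the left side; positive: to the right.
  // A full sort is shown for brevity; since only the element at `mid` needs
  // to end up in its final place, a quickselect such as Lucene's
  // IntroSelector avoids fully sorting both sides.
  static void partitionByBias(Integer[] docIDs, float[] bias, int mid) {
    Arrays.sort(docIDs, Comparator.comparingDouble(d -> bias[d]));
    // docIDs[0..mid) is now the left partition, docIDs[mid..) the right one.
  }
}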

A benefit of this change is that it will make it easier to generalize BP
reordering to indexes that have blocks, e.g. by using a stable sort on biases.
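
A sketch of that generalization, again with assumed names: give every
document the cumulative bias of its block, then sort with a doc-ID tie-break
so blocks stay contiguous (this mirrors what the change below does with
IntroSorter):

import java.util.Arrays;
import java.util.Comparator;

class BlockSortSketch {

  // blockBias[d] is assumed to hold the cumulative bias of d's whole block.
  // Because all docs of a block share the same bias and ties are broken on
  // doc ID, each block stays contiguous in its original order: children
  // first, parent document last.
  static void sortBlocksByBias(Integer[] docIDs, float[] blockBias) {
    Arrays.sort(
        docIDs,
        Comparator.<Integer>comparingDouble(d -> blockBias[d]).thenComparingInt(d -> d));
  }
}
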
Adrien Grand committed 2024-03-13 10:14:14 +01:00 (commit 3ad73336ae, parent d5ade0d30f)
3 changed files with 237 additions and 10 deletions

lucene/CHANGES.txt

@@ -190,7 +190,9 @@ API Changes
New Features
---------------------
(No changes)
* GITHUB#13125: Recursive graph bisection is now supported on indexes that have blocks, as long as
they configure a parent field via `IndexWriterConfig#setParentField`. (Adrien Grand)
Improvements
---------------------
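
The new entry references `IndexWriterConfig#setParentField`; below is a
minimal, hypothetical end-to-end sketch of how the feature would be used
(the path, the field name, and the final persistence step are illustrative
assumptions, not part of this commit):

import java.nio.file.Path;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.SlowCodecReaderWrapper;
import org.apache.lucene.misc.index.BPIndexReorderer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ReorderWithBlocks {
  public static void main(String[] args) throws Exception {
    // The parent field is what lets BP reordering preserve doc blocks.
    IndexWriterConfig config = new IndexWriterConfig().setParentField("parent");
    try (Directory dir = FSDirectory.open(Path.of("/tmp/bp-index"));
        IndexWriter writer = new IndexWriter(dir, config)) {
      // ... addDocuments(...) calls, one block per call, parent doc last ...
      writer.forceMerge(1); // reordering operates on a single segment
      try (DirectoryReader reader = DirectoryReader.open(writer)) {
        CodecReader codecReader =
            SlowCodecReaderWrapper.wrap(reader.leaves().get(0).reader());
        BPIndexReorderer reorderer = new BPIndexReorderer();
        CodecReader reordered = reorderer.reorder(codecReader, dir);
        // Persist the new order by adding `reordered` to a fresh IndexWriter
        // via addIndexes.
      }
    }
  }
}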

lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java

@@ -26,6 +26,7 @@ import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.RecursiveAction;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
@@ -46,10 +47,12 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntroSelector;
import org.apache.lucene.util.IntroSorter;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.packed.PackedInts;
@@ -253,16 +256,19 @@ public final class BPIndexReorderer {
private final IntsRef docIDs;
private final float[] biases;
private final CloseableThreadLocal<PerThreadState> threadLocal;
private final BitSet parents;
IndexReorderingTask(
IntsRef docIDs,
float[] biases,
CloseableThreadLocal<PerThreadState> threadLocal,
BitSet parents,
int depth) {
super(depth);
this.docIDs = docIDs;
this.biases = biases;
this.threadLocal = threadLocal;
this.parents = parents;
}
private static void computeDocFreqs(IntsRef docs, ForwardIndex forwardIndex, int[] docFreqs) {
@@ -292,6 +298,7 @@ public final class BPIndexReorderer {
} else {
assert sorted(docIDs);
}
assert assertParentStructure();
int halfLength = docIDs.length / 2;
if (halfLength < minPartitionSize) {
@@ -315,7 +322,14 @@
try {
moved =
shuffle(
forwardIndex, docIDs, right.offset, leftDocFreqs, rightDocFreqs, biases, iter);
forwardIndex,
docIDs,
right.offset,
leftDocFreqs,
rightDocFreqs,
biases,
parents,
iter);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
@@ -324,11 +338,33 @@
}
}
if (parents != null) {
// Make sure we split just after a parent doc
int lastLeftDocID = docIDs.ints[right.offset - 1];
int split = right.offset + parents.nextSetBit(lastLeftDocID) - lastLeftDocID;
if (split == docIDs.offset + docIDs.length) {
// No good split on the right side, look on the left side then.
split = right.offset - (lastLeftDocID - parents.prevSetBit(lastLeftDocID));
if (split == docIDs.offset) {
// No good split on the left side either: this slice has a single parent document, no
// reordering is possible. Stop recursing.
return;
}
}
assert parents.get(docIDs.ints[split - 1]);
left = new IntsRef(docIDs.ints, docIDs.offset, split - docIDs.offset);
right = new IntsRef(docIDs.ints, split, docIDs.offset + docIDs.length - split);
}
// It is fine for all tasks to share the same docs / biases array since they all work on
// different slices of the array at a given point in time.
IndexReorderingTask leftTask = new IndexReorderingTask(left, biases, threadLocal, depth + 1);
IndexReorderingTask leftTask =
new IndexReorderingTask(left, biases, threadLocal, parents, depth + 1);
IndexReorderingTask rightTask =
new IndexReorderingTask(right, biases, threadLocal, depth + 1);
new IndexReorderingTask(right, biases, threadLocal, parents, depth + 1);
if (shouldFork(docIDs.length, docIDs.ints.length)) {
invokeAll(leftTask, rightTask);
@@ -338,6 +374,28 @@
}
}
// used for asserts
private boolean assertParentStructure() {
if (parents == null) {
return true;
}
int i = docIDs.offset;
final int end = docIDs.offset + docIDs.length;
while (i < end) {
final int firstChild = docIDs.ints[i];
final int parent = parents.nextSetBit(firstChild);
final int numChildren = parent - firstChild;
assert i + numChildren < end;
for (int j = 1; j <= numChildren; ++j) {
assert docIDs.ints[i + j] == firstChild + j : "Parent structure has not been preserved";
}
i += numChildren + 1;
}
assert i == end : "Last doc ID must be a parent doc";
return true;
}
/**
* Shuffle doc IDs across both partitions so that each partition has lower gaps between
* consecutive postings.
@@ -349,6 +407,7 @@
int[] leftDocFreqs,
int[] rightDocFreqs,
float[] biases,
BitSet parents,
int iter)
throws IOException {
@@ -366,6 +425,23 @@
depth)
.compute();
if (parents != null) {
for (int i = docIDs.offset, end = docIDs.offset + docIDs.length; i < end; ) {
final int firstChild = docIDs.ints[i];
final int numChildren = parents.nextSetBit(firstChild) - firstChild;
assert parents.get(docIDs.ints[i + numChildren]);
double cumulativeBias = 0;
for (int j = 0; j <= numChildren; ++j) {
cumulativeBias += biases[i + j];
}
// Give all docs from the same block the same bias, which is the sum of biases of all
// documents in the block. This helps ensure that the follow-up sort() call preserves the
// block structure.
Arrays.fill(biases, i, i + numChildren + 1, (float) cumulativeBias);
i += numChildren + 1;
}
}
float maxLeftBias = Float.NEGATIVE_INFINITY;
for (int i = docIDs.offset; i < midPoint; ++i) {
maxLeftBias = Math.max(maxLeftBias, biases[i]);
@@ -383,19 +459,19 @@
return false;
}
new IntroSelector() {
class Selector extends IntroSelector {
int pivotDoc;
float pivotBias;
@Override
protected void setPivot(int i) {
public void setPivot(int i) {
pivotDoc = docIDs.ints[i];
pivotBias = biases[i];
}
@Override
protected int comparePivot(int j) {
public int comparePivot(int j) {
int cmp = Float.compare(pivotBias, biases[j]);
if (cmp == 0) {
// Tie break on the doc ID to preserve doc ID ordering as much as possible
@@ -405,7 +481,7 @@
}
@Override
protected void swap(int i, int j) {
public void swap(int i, int j) {
float tmpBias = biases[i];
biases[i] = biases[j];
biases[j] = tmpBias;
@@ -426,7 +502,32 @@
}
}
}
}.select(docIDs.offset, docIDs.offset + docIDs.length, midPoint);
}
Selector selector = new Selector();
if (parents == null) {
selector.select(docIDs.offset, docIDs.offset + docIDs.length, midPoint);
} else {
// When we have parents, we need to do a full sort to make sure we're not breaking the
// parent structure.
new IntroSorter() {
@Override
protected void setPivot(int i) {
selector.setPivot(i);
}
@Override
protected int comparePivot(int j) {
return selector.comparePivot(j);
}
@Override
protected void swap(int i, int j) {
selector.swap(i, j);
}
}.sort(docIDs.offset, docIDs.offset + docIDs.length);
}
return true;
}
@@ -809,6 +910,12 @@
sortedDocs[i] = i;
}
BitSet parents = null;
String parentField = reader.getFieldInfos().getParentField();
if (parentField != null) {
parents = BitSet.of(DocValues.getNumeric(reader, parentField), maxDoc);
}
try (CloseableThreadLocal<PerThreadState> threadLocal =
new CloseableThreadLocal<>() {
@Override
@@ -817,7 +924,8 @@
}
}) {
IntsRef docs = new IntsRef(sortedDocs, 0, sortedDocs.length);
IndexReorderingTask task = new IndexReorderingTask(docs, new float[maxDoc], threadLocal, 0);
IndexReorderingTask task =
new IndexReorderingTask(docs, new float[maxDoc], threadLocal, parents, 0);
if (forkJoinPool != null) {
forkJoinPool.execute(task);
task.join();

lucene/misc/src/test/org/apache/lucene/misc/index/TestBPIndexReorderer.java

@@ -137,6 +137,123 @@ public class TestBPIndexReorderer extends LuceneTestCase {
dir.close();
}
public void testSingleTermWithBlocks() throws IOException {
doTestSingleTermWithBlocks(null);
}
public void testSingleTermWithBlocksAndForkJoinPool() throws IOException {
int concurrency = TestUtil.nextInt(random(), 1, 8);
// The default ForkJoinPool implementation uses a thread factory that removes all permissions on
// threads, so we need to create our own to avoid tests failing with FS-based directories.
ForkJoinPool pool =
new ForkJoinPool(
concurrency, p -> new ForkJoinWorkerThread(p) {}, null, random().nextBoolean());
try {
doTestSingleTermWithBlocks(pool);
} finally {
pool.shutdown();
}
}
private void doTestSingleTermWithBlocks(ForkJoinPool pool) throws IOException {
Directory dir = newDirectory();
IndexWriter w =
new IndexWriter(
dir,
newIndexWriterConfig()
.setParentField("parent")
.setMergePolicy(newLogMergePolicy(random().nextBoolean())));
w.addDocuments(createBlock("1", "lucene", "search", "lucene")); // 0-2
w.addDocuments(createBlock("2", "lucene")); // 3
w.addDocuments(createBlock("3", "search", "lucene", "search")); // 4-6
w.addDocuments(createBlock("4", "lucene", "lucene", "search")); // 7-9
w.addDocuments(createBlock("5", "lucene", "lucene", "lucene", "lucene")); // 10-13
w.addDocuments(createBlock("6", "search", "search", "search")); // 14-16
w.addDocuments(createBlock("7", "search", "lucene", "search")); // 17-19
w.addDocuments(createBlock("8", "search", "search")); // 20-21
w.forceMerge(1);
DirectoryReader reader = DirectoryReader.open(w);
LeafReader leafReader = getOnlyLeafReader(reader);
CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
BPIndexReorderer reorderer = new BPIndexReorderer();
reorderer.setForkJoinPool(pool);
reorderer.setMinDocFreq(2);
reorderer.setMinPartitionSize(1);
reorderer.setMaxIters(10);
CodecReader reordered = reorderer.reorder(codecReader, dir);
StoredFields storedFields = reordered.storedFields();
assertEquals("2", storedFields.document(0).get("id"));
assertEquals("1", storedFields.document(1).get("parent_id"));
assertEquals("0", storedFields.document(1).get("child_id"));
assertEquals("1", storedFields.document(2).get("parent_id"));
assertEquals("1", storedFields.document(2).get("child_id"));
assertEquals("1", storedFields.document(3).get("id"));
assertEquals("4", storedFields.document(4).get("parent_id"));
assertEquals("0", storedFields.document(4).get("child_id"));
assertEquals("4", storedFields.document(5).get("parent_id"));
assertEquals("1", storedFields.document(5).get("child_id"));
assertEquals("4", storedFields.document(6).get("id"));
assertEquals("5", storedFields.document(7).get("parent_id"));
assertEquals("0", storedFields.document(7).get("child_id"));
assertEquals("5", storedFields.document(8).get("parent_id"));
assertEquals("1", storedFields.document(8).get("child_id"));
assertEquals("5", storedFields.document(9).get("parent_id"));
assertEquals("2", storedFields.document(9).get("child_id"));
assertEquals("5", storedFields.document(10).get("id"));
assertEquals("3", storedFields.document(11).get("parent_id"));
assertEquals("0", storedFields.document(11).get("child_id"));
assertEquals("3", storedFields.document(12).get("parent_id"));
assertEquals("1", storedFields.document(12).get("child_id"));
assertEquals("3", storedFields.document(13).get("id"));
assertEquals("7", storedFields.document(14).get("parent_id"));
assertEquals("0", storedFields.document(14).get("child_id"));
assertEquals("7", storedFields.document(15).get("parent_id"));
assertEquals("1", storedFields.document(15).get("child_id"));
assertEquals("7", storedFields.document(16).get("id"));
assertEquals("8", storedFields.document(17).get("parent_id"));
assertEquals("0", storedFields.document(17).get("child_id"));
assertEquals("8", storedFields.document(18).get("id"));
assertEquals("6", storedFields.document(19).get("parent_id"));
assertEquals("0", storedFields.document(19).get("child_id"));
assertEquals("6", storedFields.document(20).get("parent_id"));
assertEquals("1", storedFields.document(20).get("child_id"));
assertEquals("6", storedFields.document(21).get("id"));
reader.close();
w.close();
dir.close();
}
private List<Document> createBlock(String parentID, String... values) {
List<Document> docs = new ArrayList<>();
for (int i = 0; i < values.length - 1; ++i) {
Document doc = new Document();
doc.add(new StoredField("parent_id", parentID));
doc.add(new StoredField("child_id", Integer.toString(i)));
doc.add(new StringField("body", values[i], Store.NO));
docs.add(doc);
}
Document parentDoc = new Document();
parentDoc.add(new StoredField("id", parentID));
parentDoc.add(new StringField("body", values[values.length - 1], Store.NO));
docs.add(parentDoc);
return docs;
}
public void testMultiTerm() throws IOException {
Directory dir = newDirectory();
IndexWriter w =