From 3ad73336ae5ce05d9c96d32e32502383a4d856b3 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 13 Mar 2024 10:14:14 +0100 Subject: [PATCH] Make BP work on indexes that have blocks. (#13125) The current logic for reordering splits a slice of doc IDs into a left side and a right side, and for each document it computes the expected gain of moving to the other side. Then it swaps documents from both sides as long as the sum of the gain of moving the left doc to the right and the right doc to the left is positive. This works well, but I would like to extend BP reordering to also work with blocks, and the swapping logic is challenging to modify as two parent documents may have different numbers of children. One of the follow-up papers on BP suggested to use a different logic, where one would compute a bias for all documents that is negative when a document is attracted to the left and positive otherwise. Then we only have to partition doc IDs around the mid point, e.g. with quickselect. A benefit of this change is that it will make it easier to generalize BP reordering to indexes that have blocks, e.g. by using a stable sort on biases. --- lucene/CHANGES.txt | 4 +- .../lucene/misc/index/BPIndexReorderer.java | 126 ++++++++++++++++-- .../misc/index/TestBPIndexReorderer.java | 117 ++++++++++++++++ 3 files changed, 237 insertions(+), 10 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index fb6a9b64bee..2d98bc55d77 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -190,7 +190,9 @@ API Changes New Features --------------------- -(No changes) + +* GITHUB#13125: Recursive graph bisection is now supported on indexes that have blocks, as long as + they configure a parent field via `IndexWriterConfig#setParentField`. (Adrien Grand) Improvements --------------------- diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java b/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java index d9c8b29caef..e34567bff48 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java @@ -26,6 +26,7 @@ import java.util.concurrent.ForkJoinPool; import java.util.concurrent.RecursiveAction; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.index.CodecReader; +import org.apache.lucene.index.DocValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexWriter; @@ -46,10 +47,12 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.store.TrackingDirectoryWrapper; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSet; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CloseableThreadLocal; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntroSelector; +import org.apache.lucene.util.IntroSorter; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.packed.PackedInts; @@ -253,16 +256,19 @@ public final class BPIndexReorderer { private final IntsRef docIDs; private final float[] biases; private final CloseableThreadLocal threadLocal; + private final BitSet parents; IndexReorderingTask( IntsRef docIDs, float[] biases, CloseableThreadLocal threadLocal, + BitSet parents, int depth) { super(depth); this.docIDs = docIDs; this.biases = biases; this.threadLocal = threadLocal; + this.parents = parents; } private static void computeDocFreqs(IntsRef docs, ForwardIndex forwardIndex, int[] docFreqs) { @@ -292,6 +298,7 @@ public final class BPIndexReorderer { } else { assert sorted(docIDs); } + assert assertParentStructure(); int halfLength = docIDs.length / 2; if (halfLength < minPartitionSize) { @@ -315,7 +322,14 @@ public final class BPIndexReorderer { try { moved = shuffle( - forwardIndex, docIDs, right.offset, leftDocFreqs, rightDocFreqs, biases, iter); + forwardIndex, + docIDs, + right.offset, + leftDocFreqs, + rightDocFreqs, + biases, + parents, + iter); } catch (IOException e) { throw new UncheckedIOException(e); } @@ -324,11 +338,33 @@ public final class BPIndexReorderer { } } + if (parents != null) { + // Make sure we split just after a parent doc + int lastLeftDocID = docIDs.ints[right.offset - 1]; + int split = right.offset + parents.nextSetBit(lastLeftDocID) - lastLeftDocID; + + if (split == docIDs.offset + docIDs.length) { + // No good split on the right side, look on the left side then. + split = right.offset - (lastLeftDocID - parents.prevSetBit(lastLeftDocID)); + if (split == docIDs.offset) { + // No good split on the left side either: this slice has a single parent document, no + // reordering is possible. Stop recursing. + return; + } + } + + assert parents.get(docIDs.ints[split - 1]); + + left = new IntsRef(docIDs.ints, docIDs.offset, split - docIDs.offset); + right = new IntsRef(docIDs.ints, split, docIDs.offset + docIDs.length - split); + } + // It is fine for all tasks to share the same docs / biases array since they all work on // different slices of the array at a given point in time. - IndexReorderingTask leftTask = new IndexReorderingTask(left, biases, threadLocal, depth + 1); + IndexReorderingTask leftTask = + new IndexReorderingTask(left, biases, threadLocal, parents, depth + 1); IndexReorderingTask rightTask = - new IndexReorderingTask(right, biases, threadLocal, depth + 1); + new IndexReorderingTask(right, biases, threadLocal, parents, depth + 1); if (shouldFork(docIDs.length, docIDs.ints.length)) { invokeAll(leftTask, rightTask); @@ -338,6 +374,28 @@ public final class BPIndexReorderer { } } + // used for asserts + private boolean assertParentStructure() { + if (parents == null) { + return true; + } + int i = docIDs.offset; + final int end = docIDs.offset + docIDs.length; + while (i < end) { + final int firstChild = docIDs.ints[i]; + final int parent = parents.nextSetBit(firstChild); + final int numChildren = parent - firstChild; + assert i + numChildren < end; + for (int j = 1; j <= numChildren; ++j) { + assert docIDs.ints[i + j] == firstChild + j : "Parent structure has not been preserved"; + } + i += numChildren + 1; + } + assert i == end : "Last doc ID must be a parent doc"; + + return true; + } + /** * Shuffle doc IDs across both partitions so that each partition has lower gaps between * consecutive postings. @@ -349,6 +407,7 @@ public final class BPIndexReorderer { int[] leftDocFreqs, int[] rightDocFreqs, float[] biases, + BitSet parents, int iter) throws IOException { @@ -366,6 +425,23 @@ public final class BPIndexReorderer { depth) .compute(); + if (parents != null) { + for (int i = docIDs.offset, end = docIDs.offset + docIDs.length; i < end; ) { + final int firstChild = docIDs.ints[i]; + final int numChildren = parents.nextSetBit(firstChild) - firstChild; + assert parents.get(docIDs.ints[i + numChildren]); + double cumulativeBias = 0; + for (int j = 0; j <= numChildren; ++j) { + cumulativeBias += biases[i + j]; + } + // Give all docs from the same block the same bias, which is the sum of biases of all + // documents in the block. This helps ensure that the follow-up sort() call preserves the + // block structure. + Arrays.fill(biases, i, i + numChildren + 1, (float) cumulativeBias); + i += numChildren + 1; + } + } + float maxLeftBias = Float.NEGATIVE_INFINITY; for (int i = docIDs.offset; i < midPoint; ++i) { maxLeftBias = Math.max(maxLeftBias, biases[i]); @@ -383,19 +459,19 @@ public final class BPIndexReorderer { return false; } - new IntroSelector() { + class Selector extends IntroSelector { int pivotDoc; float pivotBias; @Override - protected void setPivot(int i) { + public void setPivot(int i) { pivotDoc = docIDs.ints[i]; pivotBias = biases[i]; } @Override - protected int comparePivot(int j) { + public int comparePivot(int j) { int cmp = Float.compare(pivotBias, biases[j]); if (cmp == 0) { // Tie break on the doc ID to preserve doc ID ordering as much as possible @@ -405,7 +481,7 @@ public final class BPIndexReorderer { } @Override - protected void swap(int i, int j) { + public void swap(int i, int j) { float tmpBias = biases[i]; biases[i] = biases[j]; biases[j] = tmpBias; @@ -426,7 +502,32 @@ public final class BPIndexReorderer { } } } - }.select(docIDs.offset, docIDs.offset + docIDs.length, midPoint); + } + + Selector selector = new Selector(); + + if (parents == null) { + selector.select(docIDs.offset, docIDs.offset + docIDs.length, midPoint); + } else { + // When we have parents, we need to do a full sort to make sure we're not breaking the + // parent structure. + new IntroSorter() { + @Override + protected void setPivot(int i) { + selector.setPivot(i); + } + + @Override + protected int comparePivot(int j) { + return selector.comparePivot(j); + } + + @Override + protected void swap(int i, int j) { + selector.swap(i, j); + } + }.sort(docIDs.offset, docIDs.offset + docIDs.length); + } return true; } @@ -809,6 +910,12 @@ public final class BPIndexReorderer { sortedDocs[i] = i; } + BitSet parents = null; + String parentField = reader.getFieldInfos().getParentField(); + if (parentField != null) { + parents = BitSet.of(DocValues.getNumeric(reader, parentField), maxDoc); + } + try (CloseableThreadLocal threadLocal = new CloseableThreadLocal<>() { @Override @@ -817,7 +924,8 @@ public final class BPIndexReorderer { } }) { IntsRef docs = new IntsRef(sortedDocs, 0, sortedDocs.length); - IndexReorderingTask task = new IndexReorderingTask(docs, new float[maxDoc], threadLocal, 0); + IndexReorderingTask task = + new IndexReorderingTask(docs, new float[maxDoc], threadLocal, parents, 0); if (forkJoinPool != null) { forkJoinPool.execute(task); task.join(); diff --git a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBPIndexReorderer.java b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBPIndexReorderer.java index 491a96185ed..f4322ff600a 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBPIndexReorderer.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBPIndexReorderer.java @@ -137,6 +137,123 @@ public class TestBPIndexReorderer extends LuceneTestCase { dir.close(); } + public void testSingleTermWithBlocks() throws IOException { + doTestSingleTermWithBlocks(null); + } + + public void testSingleTermWithBlocksAndForkJoinPool() throws IOException { + int concurrency = TestUtil.nextInt(random(), 1, 8); + // The default ForkJoinPool implementation uses a thread factory that removes all permissions on + // threads, so we need to create our own to avoid tests failing with FS-based directories. + ForkJoinPool pool = + new ForkJoinPool( + concurrency, p -> new ForkJoinWorkerThread(p) {}, null, random().nextBoolean()); + try { + doTestSingleTermWithBlocks(pool); + } finally { + pool.shutdown(); + } + } + + private void doTestSingleTermWithBlocks(ForkJoinPool pool) throws IOException { + Directory dir = newDirectory(); + IndexWriter w = + new IndexWriter( + dir, + newIndexWriterConfig() + .setParentField("parent") + .setMergePolicy(newLogMergePolicy(random().nextBoolean()))); + + w.addDocuments(createBlock("1", "lucene", "search", "lucene")); // 0-2 + w.addDocuments(createBlock("2", "lucene")); // 3 + w.addDocuments(createBlock("3", "search", "lucene", "search")); // 4-6 + w.addDocuments(createBlock("4", "lucene", "lucene", "search")); // 7-9 + w.addDocuments(createBlock("5", "lucene", "lucene", "lucene", "lucene")); // 10-13 + w.addDocuments(createBlock("6", "search", "search", "search")); // 14-16 + w.addDocuments(createBlock("7", "search", "lucene", "search")); // 17-19 + w.addDocuments(createBlock("8", "search", "search")); // 20-21 + + w.forceMerge(1); + + DirectoryReader reader = DirectoryReader.open(w); + LeafReader leafRealer = getOnlyLeafReader(reader); + CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafRealer); + + BPIndexReorderer reorderer = new BPIndexReorderer(); + reorderer.setForkJoinPool(pool); + reorderer.setMinDocFreq(2); + reorderer.setMinPartitionSize(1); + reorderer.setMaxIters(10); + CodecReader reordered = reorderer.reorder(codecReader, dir); + StoredFields storedFields = reordered.storedFields(); + + assertEquals("2", storedFields.document(0).get("id")); + + assertEquals("1", storedFields.document(1).get("parent_id")); + assertEquals("0", storedFields.document(1).get("child_id")); + assertEquals("1", storedFields.document(2).get("parent_id")); + assertEquals("1", storedFields.document(2).get("child_id")); + assertEquals("1", storedFields.document(3).get("id")); + + assertEquals("4", storedFields.document(4).get("parent_id")); + assertEquals("0", storedFields.document(4).get("child_id")); + assertEquals("4", storedFields.document(5).get("parent_id")); + assertEquals("1", storedFields.document(5).get("child_id")); + assertEquals("4", storedFields.document(6).get("id")); + + assertEquals("5", storedFields.document(7).get("parent_id")); + assertEquals("0", storedFields.document(7).get("child_id")); + assertEquals("5", storedFields.document(8).get("parent_id")); + assertEquals("1", storedFields.document(8).get("child_id")); + assertEquals("5", storedFields.document(9).get("parent_id")); + assertEquals("2", storedFields.document(9).get("child_id")); + assertEquals("5", storedFields.document(10).get("id")); + + assertEquals("3", storedFields.document(11).get("parent_id")); + assertEquals("0", storedFields.document(11).get("child_id")); + assertEquals("3", storedFields.document(12).get("parent_id")); + assertEquals("1", storedFields.document(12).get("child_id")); + assertEquals("3", storedFields.document(13).get("id")); + + assertEquals("7", storedFields.document(14).get("parent_id")); + assertEquals("0", storedFields.document(14).get("child_id")); + assertEquals("7", storedFields.document(15).get("parent_id")); + assertEquals("1", storedFields.document(15).get("child_id")); + assertEquals("7", storedFields.document(16).get("id")); + + assertEquals("8", storedFields.document(17).get("parent_id")); + assertEquals("0", storedFields.document(17).get("child_id")); + assertEquals("8", storedFields.document(18).get("id")); + + assertEquals("6", storedFields.document(19).get("parent_id")); + assertEquals("0", storedFields.document(19).get("child_id")); + assertEquals("6", storedFields.document(20).get("parent_id")); + assertEquals("1", storedFields.document(20).get("child_id")); + assertEquals("6", storedFields.document(21).get("id")); + + reader.close(); + w.close(); + dir.close(); + } + + private List createBlock(String parentID, String... values) { + List docs = new ArrayList<>(); + for (int i = 0; i < values.length - 1; ++i) { + Document doc = new Document(); + doc.add(new StoredField("parent_id", parentID)); + doc.add(new StoredField("child_id", Integer.toString(i))); + doc.add(new StringField("body", values[i], Store.NO)); + docs.add(doc); + } + + Document parentDoc = new Document(); + parentDoc.add(new StoredField("id", parentID)); + parentDoc.add(new StringField("body", values[values.length - 1], Store.NO)); + docs.add(parentDoc); + + return docs; + } + public void testMultiTerm() throws IOException { Directory dir = newDirectory(); IndexWriter w =