Fix issues with chunked TaxonomyIndexArray (#13028)

Fix construction from index with multiple of chunk size ordinals.

Fix mutable post-refresh children array.
This commit is contained in:
Stefan Vodita 2024-01-25 07:53:14 +00:00 committed by GitHub
parent d0098f8489
commit 7552c5093f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 208 additions and 12 deletions

View File

@ -16,6 +16,7 @@
*/
package org.apache.lucene.facet.taxonomy.directory;
import com.carrotsearch.hppc.IntHashSet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
@ -80,7 +81,8 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
public TaxonomyIndexArrays(IndexReader reader) throws IOException {
int[][] parentArray = allocateChunkedArray(reader.maxDoc(), 0);
if (parentArray.length > 0) {
assert parentArray.length > 0;
if (parentArray[0].length > 0) {
initParents(parentArray, reader, 0);
parentArray[0][0] = TaxonomyReader.INVALID_ORDINAL;
}
@ -95,10 +97,10 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
// NRT reader was obtained, even though nothing was changed. this is not very likely
// to happen.
int[][] parentArray = allocateChunkedArray(reader.maxDoc(), copyFrom.parents.values.length - 1);
if (parentArray.length > 0) {
copyChunkedArray(copyFrom.parents.values, parentArray);
initParents(parentArray, reader, copyFrom.parents.length());
}
assert parentArray.length > 0;
copyChunkedArray(copyFrom.parents.values, parentArray);
initParents(parentArray, reader, copyFrom.parents.length());
parents = new ChunkedIntArray(parentArray);
if (copyFrom.initializedChildren) {
initChildrenSiblings(copyFrom);
@ -164,6 +166,8 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
siblings.set(0, TaxonomyReader.INVALID_ORDINAL);
}
int firstChunkStart = first - (first & CHUNK_MASK);
IntHashSet reallocatedChildChunks = new IntHashSet();
for (int i = first; i < length; i++) {
int parent = parents.get(i);
// The existing youngest child of the parent is the next older sibling of i.
@ -171,6 +175,15 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
// the following line is already set when we get here
siblings.set(i, children.get(parent));
// The new youngest child of the parent is i.
if (parent < firstChunkStart) {
int chunkIdx = parent >> CHUNK_SIZE_BITS;
if (reallocatedChildChunks.contains(chunkIdx) == false) {
reallocatedChildChunks.add(chunkIdx);
int[] oldChildren = children.values[chunkIdx];
children.values[chunkIdx] = new int[CHUNK_SIZE];
System.arraycopy(oldChildren, 0, children.values[chunkIdx], 0, oldChildren.length);
}
}
children.set(parent, i);
}
}

View File

@ -17,21 +17,36 @@
*/
package org.apache.lucene.facet.taxonomy.directory;
import java.io.IOException;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.util.LuceneTestCase;
public class TestTaxonomyIndexArrays extends LuceneTestCase {
private void checkIntArraysEquals(
TaxonomyIndexArrays.ChunkedIntArray expected, TaxonomyIndexArrays.ChunkedIntArray actual) {
for (int i = 0; i < expected.values.length - 1; i++) {
assertSame(expected.values[i], actual.values[i]);
}
int lastOldChunk = expected.values.length - 1;
for (int i = 0; i < expected.values[lastOldChunk].length; i++) {
assertEquals(expected.values[lastOldChunk][i], actual.values[lastOldChunk][i]);
}
}
private void checkInvariants(TaxonomyIndexArrays oldArray, TaxonomyIndexArrays newArray) {
TaxonomyIndexArrays.ChunkedIntArray oldParents = oldArray.parents();
TaxonomyIndexArrays.ChunkedIntArray newParents = newArray.parents();
for (int i = 0; i < oldParents.values.length - 1; i++) {
assertSame(oldParents.values[i], newParents.values[i]);
}
int lastOldChunk = oldParents.values.length - 1;
for (int i = 0; i < oldParents.values[lastOldChunk].length; i++) {
assertEquals(oldParents.values[lastOldChunk][i], newParents.values[lastOldChunk][i]);
}
checkIntArraysEquals(oldParents, newParents);
TaxonomyIndexArrays.ChunkedIntArray oldSiblings = oldArray.siblings();
TaxonomyIndexArrays.ChunkedIntArray newSiblings = newArray.siblings();
checkIntArraysEquals(oldSiblings, newSiblings);
}
public void testRandom() {
@ -59,4 +74,172 @@ public class TestTaxonomyIndexArrays extends LuceneTestCase {
ordinal = newOrdinal;
}
}
public void testConstructFromEmptyIndex() throws IOException {
Directory dir = newDirectory();
// Produce empty index
new IndexWriter(dir, newIndexWriterConfig(null)).close();
IndexReader reader = DirectoryReader.open(dir);
TaxonomyIndexArrays tia = new TaxonomyIndexArrays(reader);
assertEquals(0, tia.parents().length());
tia = new TaxonomyIndexArrays(reader, tia);
assertEquals(0, tia.parents().length());
reader.close();
dir.close();
}
public void testConstructFromIndex() throws IOException {
Directory dir = newDirectory();
TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir);
TaxonomyIndexArrays oldTia, newTia;
TaxonomyIndexArrays.ChunkedIntArray oldParents, newParents;
// Test 1
// Add one category. The first TIA will have a non-zero length incomplete chunk.
taxoWriter.addCategory(new FacetLabel("a"));
taxoWriter.commit();
try (IndexReader reader = DirectoryReader.open(dir)) {
oldTia = new TaxonomyIndexArrays(reader);
}
oldParents = oldTia.parents();
assertEquals(2, oldParents.length());
assertEquals(TaxonomyReader.INVALID_ORDINAL, oldParents.get(0));
assertEquals(TaxonomyReader.ROOT_ORDINAL, oldParents.get(1));
// Test 2
// Add enough categories to fill the first chunk.
for (int i = 2; i < TaxonomyIndexArrays.CHUNK_SIZE; i++) {
taxoWriter.addCategory(new FacetLabel("a", Integer.toString(i)));
}
taxoWriter.commit();
try (IndexReader reader = DirectoryReader.open(dir)) {
oldTia = new TaxonomyIndexArrays(reader);
}
oldParents = oldTia.parents();
assertEquals(TaxonomyIndexArrays.CHUNK_SIZE, oldParents.length());
assertEquals(TaxonomyReader.INVALID_ORDINAL, oldParents.get(0));
assertEquals(TaxonomyReader.ROOT_ORDINAL, oldParents.get(1));
for (int i = 2; i < oldParents.length(); i++) {
assertEquals(1, oldParents.get(i));
}
// Test 3
// Both TIAs have the same parents and siblings arrays.
oldTia.children(); // Initializes children
try (IndexReader reader = DirectoryReader.open(dir)) {
newTia = new TaxonomyIndexArrays(reader, oldTia);
}
checkInvariants(oldTia, newTia);
// Test 4
// Add one more category, which will start a new chunk on the new TIA.
taxoWriter.addCategory(new FacetLabel("a", Integer.toString(TaxonomyIndexArrays.CHUNK_SIZE)));
taxoWriter.commit();
try (IndexReader reader = DirectoryReader.open(dir)) {
newTia = new TaxonomyIndexArrays(reader, oldTia);
}
newParents = newTia.parents();
assertEquals(1 + TaxonomyIndexArrays.CHUNK_SIZE, newParents.length());
assertEquals(TaxonomyReader.INVALID_ORDINAL, newParents.get(0));
assertEquals(TaxonomyReader.ROOT_ORDINAL, newParents.get(1));
for (int i = 2; i < newParents.length(); i++) {
assertEquals(1, newParents.get(i));
}
// Test 5
// Fill the second chunk of the new TIA.
for (int i = 1; i < TaxonomyIndexArrays.CHUNK_SIZE; i++) {
taxoWriter.addCategory(
new FacetLabel("a", Integer.toString(i + TaxonomyIndexArrays.CHUNK_SIZE)));
}
taxoWriter.commit();
try (IndexReader reader = DirectoryReader.open(dir)) {
newTia = new TaxonomyIndexArrays(reader, oldTia);
}
newParents = newTia.parents();
assertEquals(2 * TaxonomyIndexArrays.CHUNK_SIZE, newParents.length());
assertEquals(TaxonomyReader.INVALID_ORDINAL, newParents.get(0));
assertEquals(TaxonomyReader.ROOT_ORDINAL, newParents.get(1));
for (int i = 2; i < newParents.length(); i++) {
assertEquals(1, newParents.get(i));
}
taxoWriter.close();
dir.close();
}
public void testRefresh() throws IOException {
Directory dir = newDirectory();
// Write two chunks worth of ordinals whose parents are ordinals 1 or 2
TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir);
taxoWriter.addCategory(new FacetLabel("a")); // ordinal 1
taxoWriter.addCategory(new FacetLabel("b")); // ordinal 2
for (int i = 0; i < 2 * TaxonomyIndexArrays.CHUNK_SIZE; i++) {
if (i % 2 == 0) {
taxoWriter.addCategory(new FacetLabel("a", Integer.toString(i)));
} else {
taxoWriter.addCategory(new FacetLabel("b", Integer.toString(i)));
}
}
taxoWriter.commit();
// Initialize old taxonomy arrays
IndexReader reader = DirectoryReader.open(dir);
TaxonomyIndexArrays oldTia = new TaxonomyIndexArrays(reader);
reader.close();
oldTia.children(); // Init the children
// Write one more batch of ordinals whose parents are ordinals 1 or 2 again.
for (int i = 2 * TaxonomyIndexArrays.CHUNK_SIZE; i < 3 * TaxonomyIndexArrays.CHUNK_SIZE; i++) {
if (i % 2 == 0) {
taxoWriter.addCategory(new FacetLabel("a", Integer.toString(i)));
} else {
taxoWriter.addCategory(new FacetLabel("b", Integer.toString(i)));
}
}
taxoWriter.close();
// Initialize new taxonomy arrays
reader = DirectoryReader.open(dir);
TaxonomyIndexArrays newTia = new TaxonomyIndexArrays(reader, oldTia);
reader.close();
// Parents and siblings are unchanged in the old range, children will have changed
checkInvariants(oldTia, newTia);
TaxonomyIndexArrays.ChunkedIntArray oldChildren = oldTia.children();
TaxonomyIndexArrays.ChunkedIntArray newChildren = newTia.children();
// The first chunk had to be reallocated to rewrite the value for ordinals 1 and 2
assertNotSame(oldChildren.values[0], newChildren.values[0]);
// The second chunk could stay the same, since none of these ordinals have children
assertSame(oldChildren.values[1], newChildren.values[1]);
// The third chunk had to be reallocated to grow
assertNotSame(oldChildren.values[2], newChildren.values[2]);
// Check contents of the first chunk
for (int i = 0; i < TaxonomyIndexArrays.CHUNK_SIZE; i++) {
if (i == 1 || i == 2) {
assertNotEquals(oldChildren.values[0][i], newChildren.values[0][i]);
} else {
assertEquals(oldChildren.values[0][i], newChildren.values[0][i]);
}
}
// Check contents of the third chunk
for (int i = 0; i < oldChildren.values[2].length; i++) {
assertEquals(oldChildren.values[2][i], newChildren.values[2][i]);
}
dir.close();
}
}