Fix: Lucene90DocValuesProducer.TermsDict.seekCeil doesn't always position bytes correctly (#12555)

This commit is contained in:
Egor Potemkin 2023-09-17 00:48:45 +01:00 committed by GitHub
parent 43c0d72b94
commit d633c9b7d4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 82 additions and 14 deletions

View File

@ -233,6 +233,9 @@ Bug Fixes
* LUCENE-12521: Sort After returning incorrect result when missing values are competitive. (Chaitanya Gohel)
* GITHUB#12555: Fix bug in TermsEnum#seekCeil on doc values terms enums
that causes IndexOutOfBoundsException. (Egor Potemkin)
Other
---------------------

View File

@ -1153,6 +1153,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
assert hi < 0 || getTermFromIndex(hi).compareTo(text) <= 0;
assert hi == ((entry.termsDictSize - 1) >> entry.termsDictIndexShift)
|| getTermFromIndex(hi + 1).compareTo(text) > 0;
assert hi < 0 ^ entry.termsDictSize > 0; // return -1 iff empty term dict
return hi;
}
@ -1169,7 +1170,9 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
private long seekBlock(BytesRef text) throws IOException {
long index = seekTermsIndex(text);
if (index == -1L) {
return -1L;
// empty terms dict
this.ord = 0;
return -2L;
}
long ordLo = index << entry.termsDictIndexShift;
@ -1193,26 +1196,30 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
assert blockHi == ((entry.termsDictSize - 1) >>> TERMS_DICT_BLOCK_LZ4_SHIFT)
|| getFirstTermFromBlock(blockHi + 1).compareTo(text) > 0;
// read the block only if term dict is not empty
assert entry.termsDictSize > 0;
// reset ord and bytes to the ceiling block even if
// text is before the first term (blockHi == -1)
final long block = Math.max(blockHi, 0);
final long blockAddress = blockAddresses.get(block);
this.ord = block << TERMS_DICT_BLOCK_LZ4_SHIFT;
bytes.seek(blockAddress);
decompressBlock();
return blockHi;
}
@Override
public SeekStatus seekCeil(BytesRef text) throws IOException {
final long block = seekBlock(text);
if (block == -1) {
// before the first term, or empty terms dict
if (entry.termsDictSize == 0) {
ord = 0;
return SeekStatus.END;
} else {
seekExact(0L);
return SeekStatus.NOT_FOUND;
}
if (block == -2) {
// empty terms dict
assert entry.termsDictSize == 0;
return SeekStatus.END;
} else if (block == -1) {
// before the first term
return SeekStatus.NOT_FOUND;
}
final long blockAddress = blockAddresses.get(block);
this.ord = block << TERMS_DICT_BLOCK_LZ4_SHIFT;
bytes.seek(blockAddress);
decompressBlock();
while (true) {
int cmp = term.compareTo(text);

View File

@ -24,6 +24,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.function.LongSupplier;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
@ -958,4 +959,61 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
reader.close();
directory.close();
}
// Regression test for a termsEnum seekCeil edge case, where inconsistent internal state led to
// IndexOutOfBoundsException.
// See https://github.com/apache/lucene/pull/12555 for details.
//
// Scenario: index more than one terms-dict block of unique sorted terms, position the enum on
// ord 0, seekCeil to a term that sorts before every indexed term, then iterate with next()
// through all remaining terms and check each one.
public void testTermsEnumConsistency() throws IOException {
int numTerms =
Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE
+ 10; // need more than one block of unique terms.
Directory directory = newDirectory();
IndexWriterConfig conf = newIndexWriterConfig();
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
Document doc = new Document();
// for simplicity, we will generate sorted list of terms which are a) unique b) all greater than
// the term that we want to use for the test
char termA = 'A';
// Maps n to a two-character string: first char is 'A' + 1 + n / 25, second is 'A' + 1 + n % 25.
// Both chars are at least 'B', so every generated term sorts after "A"; increasing n yields
// strictly increasing strings. The assert keeps both chars no greater than 'A' + 25.
Function<Integer, String> stringSupplier =
(Integer n) -> {
assert n < 25 * 25;
char[] chars = new char[] {(char) (termA + 1 + n / 25), (char) (termA + 1 + n % 25)};
return new String(chars);
};
// The same Document and field instances are reused for every addDocument call; only the
// field's bytes value is changed between calls.
SortedDocValuesField field =
new SortedDocValuesField("field", new BytesRef(stringSupplier.apply(0)));
doc.add(field);
iwriter.addDocument(doc);
for (int i = 1; i < numTerms; i++) {
field.setBytesValue(new BytesRef(stringSupplier.apply(i)));
iwriter.addDocument(doc);
}
// merging to one segment to make sure we have more than one block (TERMS_DICT_BLOCK_LZ4_SIZE)
// in a segment, to trigger next block decompression.
iwriter.forceMerge(1);
iwriter.close();
IndexReader reader = DirectoryReader.open(directory);
LeafReader leafReader = getOnlyLeafReader(reader);
SortedDocValues values = leafReader.getSortedDocValues("field");
TermsEnum termsEnum = values.termsEnum();
// Position terms enum at 0
termsEnum.seekExact(0L);
assertEquals(0, termsEnum.ord());
// seekCeil to a term which doesn't exist in the index
assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("A")));
// ... and before any other term in the index
// The enum is expected to end up on the first term: ord 0 and the term generated for n == 0.
assertEquals(0, termsEnum.ord());
assertEquals(new BytesRef(stringSupplier.apply(0)), termsEnum.term());
// read more than one block of terms to trigger next block decompression
for (int i = 1; i < numTerms; i++) {
assertEquals(new BytesRef(stringSupplier.apply(i)), termsEnum.next());
}
// After the last generated term, next() must report exhaustion by returning null.
assertNull(termsEnum.next());
reader.close();
directory.close();
}
}