mirror of https://github.com/apache/lucene.git
Fix: Lucene90DocValuesProducer.TermsDict.seekCeil doesn't always position bytes correctly (#12555)
This commit is contained in:
parent
43c0d72b94
commit
d633c9b7d4
|
@ -233,6 +233,9 @@ Bug Fixes
|
|||
|
||||
* LUCENE-12521: Sort After returning incorrect results when missing values are competitive. (Chaitanya Gohel)
|
||||
|
||||
* GITHUB#12555: Fix bug in TermsEnum#seekCeil on doc values terms enums
|
||||
that causes IndexOutOfBoundsException. (Egor Potemkin)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -1153,6 +1153,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
assert hi < 0 || getTermFromIndex(hi).compareTo(text) <= 0;
|
||||
assert hi == ((entry.termsDictSize - 1) >> entry.termsDictIndexShift)
|
||||
|| getTermFromIndex(hi + 1).compareTo(text) > 0;
|
||||
assert hi < 0 ^ entry.termsDictSize > 0; // return -1 iff empty term dict
|
||||
|
||||
return hi;
|
||||
}
|
||||
|
@ -1169,7 +1170,9 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
private long seekBlock(BytesRef text) throws IOException {
|
||||
long index = seekTermsIndex(text);
|
||||
if (index == -1L) {
|
||||
return -1L;
|
||||
// empty terms dict
|
||||
this.ord = 0;
|
||||
return -2L;
|
||||
}
|
||||
|
||||
long ordLo = index << entry.termsDictIndexShift;
|
||||
|
@ -1193,26 +1196,30 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
assert blockHi == ((entry.termsDictSize - 1) >>> TERMS_DICT_BLOCK_LZ4_SHIFT)
|
||||
|| getFirstTermFromBlock(blockHi + 1).compareTo(text) > 0;
|
||||
|
||||
// read the block only if term dict is not empty
|
||||
assert entry.termsDictSize > 0;
|
||||
// reset ord and bytes to the ceiling block even if
|
||||
// text is before the first term (blockHi == -1)
|
||||
final long block = Math.max(blockHi, 0);
|
||||
final long blockAddress = blockAddresses.get(block);
|
||||
this.ord = block << TERMS_DICT_BLOCK_LZ4_SHIFT;
|
||||
bytes.seek(blockAddress);
|
||||
decompressBlock();
|
||||
|
||||
return blockHi;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seekCeil(BytesRef text) throws IOException {
|
||||
final long block = seekBlock(text);
|
||||
if (block == -1) {
|
||||
// before the first term, or empty terms dict
|
||||
if (entry.termsDictSize == 0) {
|
||||
ord = 0;
|
||||
return SeekStatus.END;
|
||||
} else {
|
||||
seekExact(0L);
|
||||
return SeekStatus.NOT_FOUND;
|
||||
}
|
||||
if (block == -2) {
|
||||
// empty terms dict
|
||||
assert entry.termsDictSize == 0;
|
||||
return SeekStatus.END;
|
||||
} else if (block == -1) {
|
||||
// before the first term
|
||||
return SeekStatus.NOT_FOUND;
|
||||
}
|
||||
final long blockAddress = blockAddresses.get(block);
|
||||
this.ord = block << TERMS_DICT_BLOCK_LZ4_SHIFT;
|
||||
bytes.seek(blockAddress);
|
||||
decompressBlock();
|
||||
|
||||
while (true) {
|
||||
int cmp = term.compareTo(text);
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.util.HashSet;
|
|||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.LongSupplier;
|
||||
import java.util.function.Supplier;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
@ -958,4 +959,61 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
|
|||
reader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
||||
// Testing termsEnum seekCeil edge case, where inconsistent internal state led to
|
||||
// IndexOutOfBoundsException
|
||||
// see https://github.com/apache/lucene/pull/12555 for details
|
||||
public void testTermsEnumConsistency() throws IOException {
|
||||
int numTerms =
|
||||
Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE
|
||||
+ 10; // need more than one block of unique terms.
|
||||
Directory directory = newDirectory();
|
||||
IndexWriterConfig conf = newIndexWriterConfig();
|
||||
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
|
||||
Document doc = new Document();
|
||||
|
||||
// for simplicity, we generate a sorted list of terms which are a) unique and b) all greater than
|
||||
// the term that we want to use for the test
|
||||
char termA = 'A';
|
||||
Function<Integer, String> stringSupplier =
|
||||
(Integer n) -> {
|
||||
assert n < 25 * 25;
|
||||
char[] chars = new char[] {(char) (termA + 1 + n / 25), (char) (termA + 1 + n % 25)};
|
||||
return new String(chars);
|
||||
};
|
||||
SortedDocValuesField field =
|
||||
new SortedDocValuesField("field", new BytesRef(stringSupplier.apply(0)));
|
||||
doc.add(field);
|
||||
iwriter.addDocument(doc);
|
||||
for (int i = 1; i < numTerms; i++) {
|
||||
field.setBytesValue(new BytesRef(stringSupplier.apply(i)));
|
||||
iwriter.addDocument(doc);
|
||||
}
|
||||
// merging to one segment to make sure we have more than one block (TERMS_DICT_BLOCK_LZ4_SIZE)
|
||||
// in a segment, to trigger next block decompression.
|
||||
iwriter.forceMerge(1);
|
||||
iwriter.close();
|
||||
|
||||
IndexReader reader = DirectoryReader.open(directory);
|
||||
LeafReader leafReader = getOnlyLeafReader(reader);
|
||||
SortedDocValues values = leafReader.getSortedDocValues("field");
|
||||
TermsEnum termsEnum = values.termsEnum();
|
||||
|
||||
// Position terms enum at 0
|
||||
termsEnum.seekExact(0L);
|
||||
assertEquals(0, termsEnum.ord());
|
||||
// seekCeil to a term which doesn't exist in the index
|
||||
assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("A")));
|
||||
// ... and before any other term in the index
|
||||
assertEquals(0, termsEnum.ord());
|
||||
|
||||
assertEquals(new BytesRef(stringSupplier.apply(0)), termsEnum.term());
|
||||
// read more than one block of terms to trigger next block decompression
|
||||
for (int i = 1; i < numTerms; i++) {
|
||||
assertEquals(new BytesRef(stringSupplier.apply(i)), termsEnum.next());
|
||||
}
|
||||
assertNull(termsEnum.next());
|
||||
reader.close();
|
||||
directory.close();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue