LUCENE-10623: Error implementation of docValueCount for SortingSortedSetDocValues (#967)

This commit is contained in:
Lu Xugang 2022-06-28 11:49:47 +08:00 committed by GitHub
parent 7b58088bd5
commit d8fb47b674
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 77 additions and 23 deletions

View File

@ -804,12 +804,10 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
public int nextDoc() throws IOException { public int nextDoc() throws IOException {
int doc = values.nextDoc(); int doc = values.nextDoc();
if (doc != NO_MORE_DOCS) { if (doc != NO_MORE_DOCS) {
docValueCount = 0; docValueCount = values.docValueCount();
for (long ord = values.nextOrd(); ords = ArrayUtil.grow(ords, docValueCount);
ord != SortedSetDocValues.NO_MORE_ORDS; for (int j = 0; j < docValueCount; j++) {
ord = values.nextOrd()) { ords[j] = values.nextOrd();
ords = ArrayUtil.grow(ords, docValueCount + 1);
ords[docValueCount++] = ord;
} }
i = 0; i = 0;
} }

View File

@ -32,6 +32,7 @@ import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
import org.apache.lucene.util.Counter; import org.apache.lucene.util.Counter;
import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues; import org.apache.lucene.util.packed.PackedLongValues;
@ -228,7 +229,8 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
state.segmentInfo.maxDoc(), state.segmentInfo.maxDoc(),
sortMap, sortMap,
getValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField), getValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField),
PackedInts.FASTEST); PackedInts.FASTEST,
PackedInts.bitsRequired(maxCount));
} else { } else {
docOrds = null; docOrds = null;
} }
@ -350,6 +352,8 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
private final DocOrds ords; private final DocOrds ords;
private int docID = -1; private int docID = -1;
private long ordUpto; private long ordUpto;
private long limit;
private int count;
SortingSortedSetDocValues(SortedSetDocValues in, DocOrds ords) { SortingSortedSetDocValues(SortedSetDocValues in, DocOrds ords) {
this.in = in; this.in = in;
@ -369,7 +373,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
return docID = NO_MORE_DOCS; return docID = NO_MORE_DOCS;
} }
} while (ords.offsets[docID] <= 0); } while (ords.offsets[docID] <= 0);
ordUpto = ords.offsets[docID] - 1; initCount();
return docID; return docID;
} }
@ -382,23 +386,23 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
public boolean advanceExact(int target) throws IOException { public boolean advanceExact(int target) throws IOException {
// needed in IndexSorter#StringSorter // needed in IndexSorter#StringSorter
docID = target; docID = target;
ordUpto = ords.offsets[docID] - 1; initCount();
return ords.offsets[docID] > 0; return ords.offsets[docID] > 0;
} }
@Override @Override
public long nextOrd() { public long nextOrd() {
long ord = ords.ords.get(ordUpto++); if (limit == ordUpto) {
if (ord == 0) {
return NO_MORE_ORDS; return NO_MORE_ORDS;
} else { } else {
return ord - 1; return ords.ords.get(ordUpto++);
} }
} }
@Override @Override
public int docValueCount() { public int docValueCount() {
return (int) ords.ords.size(); assert docID >= 0;
return count;
} }
@Override @Override
@ -415,34 +419,45 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
public long getValueCount() { public long getValueCount() {
return in.getValueCount(); return in.getValueCount();
} }
/**
 * Positions the per-document ord cursor for the current {@code docID}.
 *
 * <p>{@code ords.offsets} stores start positions shifted by one (a stored value of 0 is treated
 * elsewhere in this class as "no values"), so one is subtracted to obtain the index of the
 * document's first ord. {@code limit} is the exclusive end index that {@code nextOrd()} compares
 * {@code ordUpto} against.
 */
private void initCount() {
  assert docID >= 0;
  final long firstOrdIndex = ords.offsets[docID] - 1;
  final int valuesForDoc = (int) ords.docValueCounts.get(docID);
  ordUpto = firstOrdIndex;
  count = valuesForDoc;
  limit = firstOrdIndex + valuesForDoc;
}
} }
static final class DocOrds { static final class DocOrds {
final long[] offsets; final long[] offsets;
final PackedLongValues ords; final PackedLongValues ords;
final GrowableWriter docValueCounts;
public static final int START_BITS_PER_VALUE = 2;
DocOrds( DocOrds(
int maxDoc, int maxDoc,
Sorter.DocMap sortMap, Sorter.DocMap sortMap,
SortedSetDocValues oldValues, SortedSetDocValues oldValues,
float acceptableOverheadRatio) float acceptableOverheadRatio,
int bitsPerValue)
throws IOException { throws IOException {
offsets = new long[maxDoc]; offsets = new long[maxDoc];
PackedLongValues.Builder builder = PackedLongValues.packedBuilder(acceptableOverheadRatio); PackedLongValues.Builder builder = PackedLongValues.packedBuilder(acceptableOverheadRatio);
long ordOffset = 1; // 0 marks docs with no values docValueCounts = new GrowableWriter(bitsPerValue, maxDoc, acceptableOverheadRatio);
long ordOffset = 1;
int docID; int docID;
while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) { while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) {
int newDocID = sortMap.oldToNew(docID); int newDocID = sortMap.oldToNew(docID);
long startOffset = ordOffset; long startOffset = ordOffset;
long ord; long ord;
while ((ord = oldValues.nextOrd()) != NO_MORE_ORDS) { while ((ord = oldValues.nextOrd()) != NO_MORE_ORDS) {
builder.add(ord + 1); builder.add(ord);
ordOffset++; ordOffset++;
} }
docValueCounts.set(newDocID, ordOffset - startOffset);
if (startOffset != ordOffset) { // do we have any values? if (startOffset != ordOffset) { // do we have any values?
offsets[newDocID] = startOffset; offsets[newDocID] = startOffset;
builder.add(0); // 0 ord marks next value
ordOffset++;
} }
} }
ords = builder.build(); ords = builder.build();

View File

@ -483,7 +483,11 @@ public final class SortingCodecReader extends FilterCodecReader {
field.name, field.name,
() -> () ->
new SortedSetDocValuesWriter.DocOrds( new SortedSetDocValuesWriter.DocOrds(
maxDoc(), docMap, oldDocValues, PackedInts.FAST))); maxDoc(),
docMap,
oldDocValues,
PackedInts.FAST,
SortedSetDocValuesWriter.DocOrds.START_BITS_PER_VALUE)));
} }
@Override @Override

View File

@ -43,6 +43,7 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Sort; import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField; import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.search.SortedSetSortField; import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
@ -55,6 +56,45 @@ import org.apache.lucene.util.IOUtils;
public class TestSortingCodecReader extends LuceneTestCase { public class TestSortingCodecReader extends LuceneTestCase {
/**
 * Checks that a sorted-set doc values iterator obtained through {@link SortingCodecReader}
 * reports the per-document value count (not the total number of stored ords) and that
 * {@code nextOrd()} yields exactly that many ords before returning {@code NO_MORE_ORDS}.
 *
 * <p>Two documents are indexed: the first with the single value "b", the second with the values
 * "a" and "b". Sorting on the MIN value of "foo" places the two-valued document first.
 */
public void testSortOnAddIndicesOrd() throws IOException {
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  // try-with-resources so the writer and directory are released even when an assertion fails;
  // resources close in reverse order, i.e. the writer before its directory.
  try (Directory tmpDir = newDirectory();
      IndexWriter w = new IndexWriter(tmpDir, iwc)) {
    Document doc = new Document();
    doc.add(new SortedSetDocValuesField("foo", new BytesRef("b")));
    w.addDocument(doc);
    // The same Document instance is reused, so the second document carries b, a, b, b,
    // which de-duplicates to the two distinct values {a, b}.
    doc.add(new SortedSetDocValuesField("foo", new BytesRef("a")));
    doc.add(new SortedSetDocValuesField("foo", new BytesRef("b")));
    doc.add(new SortedSetDocValuesField("foo", new BytesRef("b")));
    w.addDocument(doc);
    w.commit();

    Sort indexSort = new Sort(new SortedSetSortField("foo", false, SortedSetSelector.Type.MIN));
    try (DirectoryReader reader = DirectoryReader.open(tmpDir)) {
      for (LeafReaderContext ctx : reader.leaves()) {
        CodecReader wrap =
            SortingCodecReader.wrap(SlowCodecReaderWrapper.wrap(ctx.reader()), indexSort);
        assertTrue(wrap.toString(), wrap.toString().startsWith("SortingCodecReader("));
        SortingCodecReader sortingCodecReader = (SortingCodecReader) wrap;
        SortedSetDocValues sortedSetDocValues =
            sortingCodecReader
                .getDocValuesReader()
                .getSortedSet(ctx.reader().getFieldInfos().fieldInfo("foo"));

        // First document in sorted order: values {a, b} -> ords 0 and 1.
        assertEquals(0, sortedSetDocValues.nextDoc());
        assertEquals(2, sortedSetDocValues.docValueCount());
        assertEquals(0, sortedSetDocValues.nextOrd());
        assertEquals(1, sortedSetDocValues.nextOrd());
        assertEquals(SortedSetDocValues.NO_MORE_ORDS, sortedSetDocValues.nextOrd());

        // Second document in sorted order: value {b} -> ord 1.
        assertEquals(1, sortedSetDocValues.nextDoc());
        assertEquals(1, sortedSetDocValues.docValueCount());
        assertEquals(1, sortedSetDocValues.nextOrd());
        assertEquals(SortedSetDocValues.NO_MORE_ORDS, sortedSetDocValues.nextOrd());

        assertEquals(DocIdSetIterator.NO_MORE_DOCS, sortedSetDocValues.nextDoc());
      }
    }
  }
}
public void testSortOnAddIndicesInt() throws IOException { public void testSortOnAddIndicesInt() throws IOException {
Directory tmpDir = newDirectory(); Directory tmpDir = newDirectory();
Directory dir = newDirectory(); Directory dir = newDirectory();

View File

@ -189,11 +189,8 @@ public class AssertingDocValuesFormat extends DocValuesFormat {
} }
long lastOrd = -1; long lastOrd = -1;
while (true) { for (int i = 0; i < values.docValueCount(); i++) {
long ord = values.nextOrd(); long ord = values.nextOrd();
if (ord == SortedSetDocValues.NO_MORE_ORDS) {
break;
}
assert ord >= 0 && ord < valueCount assert ord >= 0 && ord < valueCount
: "ord=" + ord + " is not in bounds 0 .." + (valueCount - 1); : "ord=" + ord + " is not in bounds 0 .." + (valueCount - 1);
assert ord > lastOrd : "ord=" + ord + ",lastOrd=" + lastOrd; assert ord > lastOrd : "ord=" + ord + ",lastOrd=" + lastOrd;