diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java index 819c34e097e..494d0edc3bc 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java @@ -804,12 +804,10 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer { public int nextDoc() throws IOException { int doc = values.nextDoc(); if (doc != NO_MORE_DOCS) { - docValueCount = 0; - for (long ord = values.nextOrd(); - ord != SortedSetDocValues.NO_MORE_ORDS; - ord = values.nextOrd()) { - ords = ArrayUtil.grow(ords, docValueCount + 1); - ords[docValueCount++] = ord; + docValueCount = values.docValueCount(); + ords = ArrayUtil.grow(ords, docValueCount); + for (int j = 0; j < docValueCount; j++) { + ords[j] = values.nextOrd(); } i = 0; } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java index 6ebe69c62b3..fedc0bf6b3b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java @@ -32,6 +32,7 @@ import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; import org.apache.lucene.util.Counter; import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.GrowableWriter; import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedLongValues; @@ -228,7 +229,8 @@ class SortedSetDocValuesWriter extends DocValuesWriter { state.segmentInfo.maxDoc(), sortMap, getValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField), - PackedInts.FASTEST); + PackedInts.FASTEST, + PackedInts.bitsRequired(maxCount)); } else { docOrds = null; } @@ -350,6 +352,8 @@ class SortedSetDocValuesWriter extends DocValuesWriter { private final DocOrds ords; private int docID = -1; private long ordUpto; + private long limit; + private int count; SortingSortedSetDocValues(SortedSetDocValues in, DocOrds ords) { this.in = in; @@ -369,7 +373,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter { return docID = NO_MORE_DOCS; } } while (ords.offsets[docID] <= 0); - ordUpto = ords.offsets[docID] - 1; + initCount(); return docID; } @@ -382,23 +386,23 @@ class SortedSetDocValuesWriter extends DocValuesWriter { public boolean advanceExact(int target) throws IOException { // needed in IndexSorter#StringSorter docID = target; - ordUpto = ords.offsets[docID] - 1; + initCount(); return ords.offsets[docID] > 0; } @Override public long nextOrd() { - long ord = ords.ords.get(ordUpto++); - if (ord == 0) { + if (limit == ordUpto) { return NO_MORE_ORDS; } else { - return ord - 1; + return ords.ords.get(ordUpto++); } } @Override public int docValueCount() { - return (int) ords.ords.size(); + assert docID >= 0; + return count; } @Override @@ -415,34 +419,45 @@ class SortedSetDocValuesWriter extends DocValuesWriter { public long getValueCount() { return in.getValueCount(); } + + private void initCount() { + assert docID >= 0; + ordUpto = ords.offsets[docID] - 1; + count = (int) ords.docValueCounts.get(docID); + limit = ordUpto + count; + } } static final class DocOrds { final long[] offsets; final PackedLongValues ords; + final GrowableWriter docValueCounts; + + public static final int START_BITS_PER_VALUE = 2; DocOrds( int maxDoc, Sorter.DocMap sortMap, SortedSetDocValues oldValues, - float acceptableOverheadRatio) + float acceptableOverheadRatio, + int bitsPerValue) throws IOException { offsets = new long[maxDoc]; PackedLongValues.Builder builder = PackedLongValues.packedBuilder(acceptableOverheadRatio); - long ordOffset = 1; // 0 marks docs with no values + docValueCounts = new GrowableWriter(bitsPerValue, maxDoc, acceptableOverheadRatio); + long ordOffset = 1; int docID; while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) { int newDocID = sortMap.oldToNew(docID); long startOffset = ordOffset; long ord; while ((ord = oldValues.nextOrd()) != NO_MORE_ORDS) { - builder.add(ord + 1); + builder.add(ord); ordOffset++; } + docValueCounts.set(newDocID, ordOffset - startOffset); if (startOffset != ordOffset) { // do we have any values? offsets[newDocID] = startOffset; - builder.add(0); // 0 ord marks next value - ordOffset++; } } ords = builder.build(); diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index 0c0f6c84896..f877ce58059 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -483,7 +483,11 @@ public final class SortingCodecReader extends FilterCodecReader { field.name, () -> new SortedSetDocValuesWriter.DocOrds( - maxDoc(), docMap, oldDocValues, PackedInts.FAST))); + maxDoc(), + docMap, + oldDocValues, + PackedInts.FAST, + SortedSetDocValuesWriter.DocOrds.START_BITS_PER_VALUE))); } @Override diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java index 42a898d87b4..db9c7cf1f51 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java @@ -43,6 +43,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortedNumericSortField; +import org.apache.lucene.search.SortedSetSelector; import org.apache.lucene.search.SortedSetSortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; @@ -55,6 +56,45 @@ import org.apache.lucene.util.IOUtils; public class TestSortingCodecReader extends LuceneTestCase { + public void testSortOnAddIndicesOrd() throws IOException { + Directory tmpDir = newDirectory(); + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + IndexWriter w = new IndexWriter(tmpDir, iwc); + + Document doc; + doc = new Document(); + doc.add(new SortedSetDocValuesField("foo", new BytesRef("b"))); + w.addDocument(doc); + + doc.add(new SortedSetDocValuesField("foo", new BytesRef("a"))); + doc.add(new SortedSetDocValuesField("foo", new BytesRef("b"))); + doc.add(new SortedSetDocValuesField("foo", new BytesRef("b"))); + w.addDocument(doc); + + w.commit(); + + Sort indexSort = new Sort(new SortedSetSortField("foo", false, SortedSetSelector.Type.MIN)); + try (DirectoryReader reader = DirectoryReader.open(tmpDir)) { + for (LeafReaderContext ctx : reader.leaves()) { + CodecReader wrap = + SortingCodecReader.wrap(SlowCodecReaderWrapper.wrap(ctx.reader()), indexSort); + assertTrue(wrap.toString(), wrap.toString().startsWith("SortingCodecReader(")); + SortingCodecReader sortingCodecReader = (SortingCodecReader) wrap; + SortedSetDocValues sortedSetDocValues = + sortingCodecReader + .getDocValuesReader() + .getSortedSet(ctx.reader().getFieldInfos().fieldInfo("foo")); + sortedSetDocValues.nextDoc(); + assertEquals(sortedSetDocValues.docValueCount(), 2); + sortedSetDocValues.nextDoc(); + assertEquals(sortedSetDocValues.docValueCount(), 1); + assertEquals(sortedSetDocValues.nextDoc(), DocIdSetIterator.NO_MORE_DOCS); + } + } + IOUtils.close(w, dir, tmpDir); + } + public void testSortOnAddIndicesInt() throws IOException { Directory tmpDir = newDirectory(); Directory dir = newDirectory(); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java index a32f2ac57e8..f33521fc10a 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java @@ -189,11 +189,8 @@ public class AssertingDocValuesFormat extends DocValuesFormat { } long lastOrd = -1; - while (true) { + for (int i = 0; i < values.docValueCount(); i++) { long ord = values.nextOrd(); - if (ord == SortedSetDocValues.NO_MORE_ORDS) { - break; - } assert ord >= 0 && ord < valueCount : "ord=" + ord + " is not in bounds 0 .." + (valueCount - 1); assert ord > lastOrd : "ord=" + ord + ",lastOrd=" + lastOrd;