From c82b99464dae6379380f214f250592a450bbe23b Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 22 Sep 2020 15:10:53 +0200 Subject: [PATCH] LUCENE-9539: Use more compact datastructures for sorting doc-values (#1908) This change cuts over from object based data-structures to primitive / compressed data-structures. --- lucene/CHANGES.txt | 3 + .../lucene/index/BinaryDocValuesWriter.java | 64 +++++------- .../index/SortedNumericDocValuesWriter.java | 82 +++++++++------- .../index/SortedSetDocValuesWriter.java | 98 +++++++++---------- .../lucene/index/SortingCodecReader.java | 16 +-- 5 files changed, 135 insertions(+), 128 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a4278ab517d..7389abfac86 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -238,6 +238,9 @@ Improvements * LUCENE-9523: In query shapes over shape fields, skip points while traversing the BKD tree when the relationship with the document is already known. (Ignacio Vera) +* LUCENE-9539: Use more compact datastructures to represent sorted doc-values in memory when + sorting a segment before flush and in SortingCodecReader. (Simon Willnauer) + Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java index 775f32508a0..785adff5d3c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java @@ -24,11 +24,10 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.BitSet; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefArray; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.Counter; -import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.PagedBytes; import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedLongValues; @@ -58,7 +57,7 @@ class BinaryDocValuesWriter extends DocValuesWriter { private PackedLongValues finalLengths; - public BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { + BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { this.fieldInfo = fieldInfo; this.bytes = new PagedBytes(BLOCK_BITS); this.bytesOut = bytes.getDataOutput(); @@ -100,21 +99,6 @@ class BinaryDocValuesWriter extends DocValuesWriter { bytesUsed = newBytesUsed; } - static CachedBinaryDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException { - FixedBitSet docsWithField = new FixedBitSet(maxDoc); - BytesRef[] values = new BytesRef[maxDoc]; - while (true) { - int docID = oldValues.nextDoc(); - if (docID == NO_MORE_DOCS) { - break; - } - int newDocID = sortMap.oldToNew(docID); - docsWithField.set(newDocID); - values[newDocID] = BytesRef.deepCopyOf(oldValues.binaryValue()); - } - return new CachedBinaryDVs(values, docsWithField); - } - @Override BinaryDocValues getDocValues() { if (finalLengths == null) { @@ -131,7 +115,7 @@ class BinaryDocValuesWriter extends DocValuesWriter { } final CachedBinaryDVs sorted; if (sortMap != null) { - sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, + sorted = new CachedBinaryDVs(state.segmentInfo.maxDoc(), sortMap, new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator())); } else { sorted = null; @@ -206,8 +190,8 @@ class BinaryDocValuesWriter extends DocValuesWriter { static class SortingBinaryDocValues extends BinaryDocValues { private final CachedBinaryDVs dvs; + private final BytesRefBuilder spare = new BytesRefBuilder(); private int docID = -1; - private long cost = -1; SortingBinaryDocValues(CachedBinaryDVs dvs) { this.dvs = dvs; @@ -215,11 +199,12 @@ class BinaryDocValuesWriter extends DocValuesWriter { @Override public int nextDoc() { - if (docID+1 == dvs.docsWithField.length()) { - docID = NO_MORE_DOCS; - } else { - docID = dvs.docsWithField.nextSetBit(docID+1); - } + do { + docID++; + if (docID == dvs.offsets.length) { + return docID = NO_MORE_DOCS; + } + } while (dvs.offsets[docID] <= 0); return docID; } @@ -240,26 +225,29 @@ class BinaryDocValuesWriter extends DocValuesWriter { @Override public BytesRef binaryValue() { - return dvs.values[docID]; + dvs.values.get(spare, dvs.offsets[docID]-1); + return spare.get(); } @Override public long cost() { - if (cost == -1) { - cost = dvs.docsWithField.cardinality(); - } - return cost; + return dvs.values.size(); } } - static class CachedBinaryDVs { - // TODO: at least cutover to BytesRefArray here: - private final BytesRef[] values; - private final BitSet docsWithField; - - CachedBinaryDVs(BytesRef[] values, BitSet docsWithField) { - this.values = values; - this.docsWithField = docsWithField; + static final class CachedBinaryDVs { + final int[] offsets; + final BytesRefArray values; + CachedBinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException { + offsets = new int[maxDoc]; + values = new BytesRefArray(Counter.newCounter()); + int offset = 1; // 0 means no values for this document + int docID; + while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) { + int newDocID = sortMap.oldToNew(docID); + values.append(oldValues.binaryValue()); + offsets[newDocID] = offset++; + } } } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java index 0acf93b2014..e75627805b4 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java @@ -39,13 +39,13 @@ class SortedNumericDocValuesWriter extends DocValuesWriter= values.offsets.length) { + return docID = NO_MORE_DOCS; } - if (values[docID] != null) { - break; - } - // skip missing docs - } - upto = 0; + } while (values.offsets[docID] <= 0); + upto = values.offsets[docID]; + numValues = Math.toIntExact(values.values.get(upto-1)); + limit = upto + numValues; return docID; } @@ -264,16 +273,23 @@ class SortedNumericDocValuesWriter extends DocValuesWriter 0) { + numValues = Math.toIntExact(values.values.get(upto-1)); + limit = upto + numValues; + return true; + } else { + limit = upto; + } + return false; } @Override public long nextValue() { - if (upto == values[docID].length) { + if (upto == limit) { throw new AssertionError(); } else { - return values[docID][upto++]; + return values.values.get(upto++); } } @@ -284,7 +300,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter { private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts' private final FieldInfo fieldInfo; private int currentDoc = -1; - private int currentValues[] = new int[8]; + private int[] currentValues = new int[8]; private int currentUpto; private int maxCount; @@ -55,7 +55,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter { private int[] finalOrdMap; - public SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { + SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { this.fieldInfo = fieldInfo; this.iwBytesUsed = iwBytesUsed; hash = new BytesRefHash( @@ -139,28 +139,6 @@ class SortedSetDocValuesWriter extends DocValuesWriter { bytesUsed = newBytesUsed; } - static long[][] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedSetDocValues oldValues) throws IOException { - long[][] ords = new long[maxDoc][]; - int docID; - while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) { - int newDocID = sortMap.oldToNew(docID); - long[] docOrds = new long[1]; - int upto = 0; - while (true) { - long ord = oldValues.nextOrd(); - if (ord == NO_MORE_ORDS) { - break; - } - if (upto == docOrds.length) { - docOrds = ArrayUtil.grow(docOrds); - } - docOrds[upto++] = ord; - } - ords[newDocID] = ArrayUtil.copyOfSubArray(docOrds, 0, upto); - } - return ords; - } - @Override SortedSetDocValues getDocValues() { if (finalOrds == null) { @@ -203,12 +181,12 @@ class SortedSetDocValuesWriter extends DocValuesWriter { ordMap = finalOrdMap; } - final long[][] sorted; + final DocOrds docOrds; if (sortMap != null) { - sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, - new BufferedSortedSetDocValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator())); + docOrds = new DocOrds(state.segmentInfo.maxDoc(), sortMap, + new BufferedSortedSetDocValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator()), PackedInts.FASTEST); } else { - sorted = null; + docOrds = null; } dvConsumer.addSortedSetField(fieldInfo, new EmptyDocValuesProducer() { @@ -219,10 +197,10 @@ class SortedSetDocValuesWriter extends DocValuesWriter { } final SortedSetDocValues buf = new BufferedSortedSetDocValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator()); - if (sorted == null) { + if (docOrds == null) { return buf; } else { - return new SortingSortedSetDocValues(buf, sorted); + return new SortingSortedSetDocValues(buf, docOrds); } } }); @@ -236,7 +214,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter { final PackedLongValues.Iterator ordsIter; final PackedLongValues.Iterator ordCountsIter; final DocIdSetIterator docsWithField; - final int currentDoc[]; + final int[] currentDoc; private int ordCount; private int ordUpto; @@ -311,11 +289,11 @@ class SortedSetDocValuesWriter extends DocValuesWriter { static class SortingSortedSetDocValues extends SortedSetDocValues { private final SortedSetDocValues in; - private final long[][] ords; + private final DocOrds ords; private int docID = -1; - private int ordUpto; + private long ordUpto; - SortingSortedSetDocValues(SortedSetDocValues in, long[][] ords) { + SortingSortedSetDocValues(SortedSetDocValues in, DocOrds ords) { this.in = in; this.ords = ords; } @@ -327,18 +305,13 @@ class SortedSetDocValuesWriter extends DocValuesWriter { @Override public int nextDoc() { - while (true) { + do { docID++; - if (docID == ords.length) { - docID = NO_MORE_DOCS; - break; + if (docID == ords.offsets.length) { + return docID = NO_MORE_DOCS; } - if (ords[docID] != null) { - break; - } - // skip missing docs - } - ordUpto = 0; + } while (ords.offsets[docID] <= 0); + ordUpto = ords.offsets[docID]-1; return docID; } @@ -351,16 +324,16 @@ class SortedSetDocValuesWriter extends DocValuesWriter { public boolean advanceExact(int target) throws IOException { // needed in IndexSorter#StringSorter docID = target; - ordUpto = 0; - return ords[docID] != null; + ordUpto = ords.offsets[docID]-1; + return ords.offsets[docID] > 0; } - @Override public long nextOrd() { - if (ordUpto == ords[docID].length) { + long ord = ords.ords.get(ordUpto++); + if (ord == 0) { return NO_MORE_ORDS; } else { - return ords[docID][ordUpto++]; + return ord - 1 ; } } @@ -379,4 +352,31 @@ class SortedSetDocValuesWriter extends DocValuesWriter { return in.getValueCount(); } } + + static final class DocOrds { + final long[] offsets; + final PackedLongValues ords; + + DocOrds(int maxDoc, Sorter.DocMap sortMap, SortedSetDocValues oldValues, float acceptableOverheadRatio) throws IOException { + offsets = new long[maxDoc]; + PackedLongValues.Builder builder = PackedLongValues.packedBuilder(acceptableOverheadRatio); + long ordOffset = 1; // 0 marks docs with no values + int docID; + while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) { + int newDocID = sortMap.oldToNew(docID); + long startOffset = ordOffset; + long ord; + while ((ord = oldValues.nextOrd()) != NO_MORE_ORDS) { + builder.add(ord + 1); + ordOffset++; + } + if (startOffset != ordOffset) { // do we have any values? + offsets[newDocID] = startOffset; + builder.add(0); // 0 ord marks next value + ordOffset++; + } + } + ords = builder.build(); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index cc7739c3407..8eb08c596ab 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -32,6 +32,7 @@ import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.search.Sort; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.packed.PackedInts; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; @@ -51,10 +52,9 @@ public final class SortingCodecReader extends FilterCodecReader { private final Map cachedSortedDVs = new HashMap<>(); - // TODO: pack long[][] into an int[] (offset) and long[] instead: - private final Map cachedSortedSetDVs = new HashMap<>(); + private final Map cachedSortedSetDVs = new HashMap<>(); - private final Map cachedSortedNumericDVs = new HashMap<>(); + private final Map cachedSortedNumericDVs = new HashMap<>(); private static class SortingBits implements Bits { @@ -381,7 +381,7 @@ public final class SortingCodecReader extends FilterCodecReader { synchronized (cachedBinaryDVs) { dvs = cachedBinaryDVs.get(field); if (dvs == null) { - dvs = BinaryDocValuesWriter.sortDocValues(maxDoc(), docMap, oldDocValues); + dvs = new BinaryDocValuesWriter.CachedBinaryDVs(maxDoc(), docMap, oldDocValues); cachedBinaryDVs.put(field.name, dvs); } } @@ -412,11 +412,11 @@ public final class SortingCodecReader extends FilterCodecReader { @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { final SortedNumericDocValues oldDocValues = delegate.getSortedNumeric(field); - long[][] values; + SortedNumericDocValuesWriter.LongValues values; synchronized (cachedSortedNumericDVs) { values = cachedSortedNumericDVs.get(field); if (values == null) { - values = SortedNumericDocValuesWriter.sortDocValues(maxDoc(), docMap, oldDocValues); + values = new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST); cachedSortedNumericDVs.put(field.name, values); } } @@ -427,11 +427,11 @@ public final class SortingCodecReader extends FilterCodecReader { @Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { SortedSetDocValues oldDocValues = delegate.getSortedSet(field); - long[][] ords; + SortedSetDocValuesWriter.DocOrds ords; synchronized (cachedSortedSetDVs) { ords = cachedSortedSetDVs.get(field); if (ords == null) { - ords = SortedSetDocValuesWriter.sortDocValues(maxDoc(), docMap, oldDocValues); + ords = new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FASTEST); cachedSortedSetDVs.put(field.name, ords); } }