From 17c285d61743da0c06735e06235b20bd5aac4e14 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Wed, 23 Sep 2020 14:21:28 +0200 Subject: [PATCH] LUCENE-9539: Remove caches from SortingCodecReader (#1909) SortingCodecReader keeps all docvalues in memory that are loaded from this reader. Yet, this reader should only be used for merging which happens sequentially. This makes caching docvalues unnecessary. Co-authored-by: Jim Ferenczi --- lucene/CHANGES.txt | 7 + .../lucene/index/BinaryDocValuesWriter.java | 12 +- .../apache/lucene/index/NormValuesWriter.java | 2 +- .../lucene/index/NumericDocValuesWriter.java | 14 +- .../lucene/index/SortingCodecReader.java | 166 +++++++++--------- .../lucene/index/TestSortingCodecReader.java | 8 +- 6 files changed, 107 insertions(+), 102 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7389abfac86..8594ea0cdc6 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -274,6 +274,13 @@ Documentation * LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Haoyu Zhai) +Changes in Runtime Behavior +--------------------- + +* LUCENE-9539: SortingCodecReader now doesn't cache doc values fields anymore. Previously, SortingCodecReader + used to cache all doc values fields after they were loaded into memory. This reader should only be used + to sort segments after the fact using IndexWriter#addIndices. (Simon Willnauer) + Other --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java index 785adff5d3c..fe9cfaf24bd 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java @@ -113,9 +113,9 @@ class BinaryDocValuesWriter extends DocValuesWriter { if (finalLengths == null) { finalLengths = this.lengths.build(); } - final CachedBinaryDVs sorted; + final BinaryDVs sorted; if (sortMap != null) { - sorted = new CachedBinaryDVs(state.segmentInfo.maxDoc(), sortMap, + sorted = new BinaryDVs(state.segmentInfo.maxDoc(), sortMap, new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator())); } else { sorted = null; @@ -189,11 +189,11 @@ class BinaryDocValuesWriter extends DocValuesWriter { } static class SortingBinaryDocValues extends BinaryDocValues { - private final CachedBinaryDVs dvs; + private final BinaryDVs dvs; private final BytesRefBuilder spare = new BytesRefBuilder(); private int docID = -1; - SortingBinaryDocValues(CachedBinaryDVs dvs) { + SortingBinaryDocValues(BinaryDVs dvs) { this.dvs = dvs; } @@ -235,10 +235,10 @@ class BinaryDocValuesWriter extends DocValuesWriter { } } - static final class CachedBinaryDVs { + static final class BinaryDVs { final int[] offsets; final BytesRefArray values; - CachedBinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException { + BinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException { offsets = new int[maxDoc]; values = new BytesRefArray(Counter.newCounter()); int offset = 1; // 0 means no values for this document diff --git a/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java index 29643524c00..1039880284e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java @@ -70,7 +70,7 @@ class NormValuesWriter { public void flush(SegmentWriteState state, Sorter.DocMap sortMap, NormsConsumer normsConsumer) throws IOException { final PackedLongValues values = pending.build(); - final NumericDocValuesWriter.CachedNumericDVs sorted; + final NumericDocValuesWriter.NumericDVs sorted; if (sortMap != null) { sorted = NumericDocValuesWriter.sortDocValues(state.segmentInfo.maxDoc(), sortMap, new BufferedNorms(values, docsWithField.iterator())); diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java index a696bcaf185..fcd05274b5b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java @@ -77,7 +77,7 @@ class NumericDocValuesWriter extends DocValuesWriter { return new BufferedNumericDocValues(finalValues, docsWithField.iterator()); } - static CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException { + static NumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException { FixedBitSet docsWithField = new FixedBitSet(maxDoc); long[] values = new long[maxDoc]; while (true) { @@ -89,7 +89,7 @@ class NumericDocValuesWriter extends DocValuesWriter { docsWithField.set(newDocID); values[newDocID] = oldDocValues.longValue(); } - return new CachedNumericDVs(values, docsWithField); + return new NumericDVs(values, docsWithField); } @Override @@ -97,7 +97,7 @@ class NumericDocValuesWriter extends DocValuesWriter { if (finalValues == null) { finalValues = pending.build(); } - final CachedNumericDVs sorted; + final NumericDVs sorted; if (sortMap != null) { NumericDocValues oldValues = new BufferedNumericDocValues(finalValues, docsWithField.iterator()); sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, oldValues); @@ -169,11 +169,11 @@ class NumericDocValuesWriter extends DocValuesWriter { static class SortingNumericDocValues extends NumericDocValues { - private final CachedNumericDVs dvs; + private final NumericDVs dvs; private int docID = -1; private long cost = -1; - SortingNumericDocValues(CachedNumericDVs dvs) { + SortingNumericDocValues(NumericDVs dvs) { this.dvs = dvs; } @@ -218,11 +218,11 @@ class NumericDocValuesWriter extends DocValuesWriter { } } - static class CachedNumericDVs { + static class NumericDVs { private final long[] values; private final BitSet docsWithField; - CachedNumericDVs(long[] values, BitSet docsWithField) { + NumericDVs(long[] values, BitSet docsWithField) { this.values = values; this.docsWithField = docsWithField; } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index 8eb08c596ab..f5831fd932d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -30,8 +30,10 @@ import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOSupplier; import org.apache.lucene.util.packed.PackedInts; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; @@ -41,21 +43,13 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; * {@link Sort}. This can be used to re-sort and index after it's been created by wrapping all * readers of the index with this reader and adding it to a fresh IndexWriter via * {@link IndexWriter#addIndexes(CodecReader...)}. + * NOTE: This reader should only be used for merging. Pulling fields from this reader might be very costly and memory + * intensive. * * @lucene.experimental */ public final class SortingCodecReader extends FilterCodecReader { - private final Map cachedNumericDVs = new HashMap<>(); - - private final Map cachedBinaryDVs = new HashMap<>(); - - private final Map cachedSortedDVs = new HashMap<>(); - - private final Map cachedSortedSetDVs = new HashMap<>(); - - private final Map cachedSortedNumericDVs = new HashMap<>(); - private static class SortingBits implements Bits { private final Bits in; @@ -148,10 +142,6 @@ public final class SortingCodecReader extends FilterCodecReader { } } - - - - /** Return a sorted view of reader according to the order * defined by sort. If the reader is already sorted, this * method might return the reader as-is. */ @@ -313,15 +303,13 @@ public final class SortingCodecReader extends FilterCodecReader { }; } - private final Map cachedNorms = new HashMap<>(); - @Override public NormsProducer getNormsReader() { final NormsProducer delegate = in.getNormsReader(); return new NormsProducer() { @Override public NumericDocValues getNorms(FieldInfo field) throws IOException { - return produceNumericDocValues(field, delegate.getNorms(field), cachedNorms); + return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateNorms(field.name, () -> getNumericDocValues(delegate.getNorms(field)))); } @Override @@ -341,101 +329,48 @@ public final class SortingCodecReader extends FilterCodecReader { }; } - private NumericDocValues produceNumericDocValues(FieldInfo field, NumericDocValues oldNorms, - Map cachedNorms) throws IOException { - NumericDocValuesWriter.CachedNumericDVs norms; - synchronized (cachedNorms) { - norms = cachedNorms.get(field); - if (norms == null) { - FixedBitSet docsWithField = new FixedBitSet(maxDoc()); - long[] values = new long[maxDoc()]; - while (true) { - int docID = oldNorms.nextDoc(); - if (docID == NO_MORE_DOCS) { - break; - } - int newDocID = docMap.oldToNew(docID); - docsWithField.set(newDocID); - values[newDocID] = oldNorms.longValue(); - } - norms = new NumericDocValuesWriter.CachedNumericDVs(values, docsWithField); - cachedNorms.put(field.name, norms); - } - } - return new NumericDocValuesWriter.SortingNumericDocValues(norms); - } - @Override public DocValuesProducer getDocValuesReader() { final DocValuesProducer delegate = in.getDocValuesReader(); return new DocValuesProducer() { @Override public NumericDocValues getNumeric(FieldInfo field) throws IOException { - return produceNumericDocValues(field,delegate.getNumeric(field), cachedNumericDVs); + return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateDV(field.name, () -> getNumericDocValues(delegate.getNumeric(field)))); } @Override public BinaryDocValues getBinary(FieldInfo field) throws IOException { - final BinaryDocValues oldDocValues = delegate.getBinary(field); - BinaryDocValuesWriter.CachedBinaryDVs dvs; - synchronized (cachedBinaryDVs) { - dvs = cachedBinaryDVs.get(field); - if (dvs == null) { - dvs = new BinaryDocValuesWriter.CachedBinaryDVs(maxDoc(), docMap, oldDocValues); - cachedBinaryDVs.put(field.name, dvs); - } - } - return new BinaryDocValuesWriter.SortingBinaryDocValues(dvs); + return new BinaryDocValuesWriter.SortingBinaryDocValues(getOrCreateDV(field.name, + () -> new BinaryDocValuesWriter.BinaryDVs(maxDoc(), docMap, delegate.getBinary(field)))); } @Override public SortedDocValues getSorted(FieldInfo field) throws IOException { SortedDocValues oldDocValues = delegate.getSorted(field); - int[] ords; - synchronized (cachedSortedDVs) { - ords = cachedSortedDVs.get(field); - if (ords == null) { - ords = new int[maxDoc()]; - Arrays.fill(ords, -1); - int docID; - while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) { - int newDocID = docMap.oldToNew(docID); - ords[newDocID] = oldDocValues.ordValue(); - } - cachedSortedDVs.put(field.name, ords); + return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, getOrCreateDV(field.name, () -> { + int[] ords = new int[maxDoc()]; + Arrays.fill(ords, -1); + int docID; + while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) { + int newDocID = docMap.oldToNew(docID); + ords[newDocID] = oldDocValues.ordValue(); } - } - - return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, ords); + return ords; + })); } @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { final SortedNumericDocValues oldDocValues = delegate.getSortedNumeric(field); - SortedNumericDocValuesWriter.LongValues values; - synchronized (cachedSortedNumericDVs) { - values = cachedSortedNumericDVs.get(field); - if (values == null) { - values = new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST); - cachedSortedNumericDVs.put(field.name, values); - } - } - - return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, values); + return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, getOrCreateDV(field.name, () -> + new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST))); } @Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { SortedSetDocValues oldDocValues = delegate.getSortedSet(field); - SortedSetDocValuesWriter.DocOrds ords; - synchronized (cachedSortedSetDVs) { - ords = cachedSortedSetDVs.get(field); - if (ords == null) { - ords = new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FASTEST); - cachedSortedSetDVs.put(field.name, ords); - } - } - return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, ords); + return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, getOrCreateDV(field.name, () -> + new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FAST))); } @Override @@ -455,6 +390,18 @@ public final class SortingCodecReader extends FilterCodecReader { }; } + private NumericDocValuesWriter.NumericDVs getNumericDocValues(NumericDocValues oldNumerics) throws IOException { + FixedBitSet docsWithField = new FixedBitSet(maxDoc()); + long[] values = new long[maxDoc()]; + int docID; + while ((docID = oldNumerics.nextDoc()) != NO_MORE_DOCS) { + int newDocID = docMap.oldToNew(docID); + docsWithField.set(newDocID); + values[newDocID] = oldNumerics.longValue(); + } + return new NumericDocValuesWriter.NumericDVs(values, docsWithField); + } + @Override public TermVectorsReader getTermVectorsReader() { return newTermVectorsReader(in.getTermVectorsReader()); @@ -510,4 +457,51 @@ public final class SortingCodecReader extends FilterCodecReader { return metaData; } + // we try to cache the last used DV or Norms instance since during merge + // this instance is used more than once. We could in addition to this single instance + // also cache the fields that are used for sorting since we do the work twice for these fields + private String cachedField; + private Object cachedObject; + private boolean cacheIsNorms; + + private T getOrCreateNorms(String field, IOSupplier supplier) throws IOException { + return getOrCreate(field, true, supplier); + } + + @SuppressWarnings("unchecked") + private synchronized T getOrCreate(String field, boolean norms, IOSupplier supplier) throws IOException { + if ((field.equals(cachedField) && cacheIsNorms == norms) == false) { + assert assertCreatedOnlyOnce(field, norms); + cachedObject = supplier.get(); + cachedField = field; + cacheIsNorms = norms; + } + assert cachedObject != null; + return (T) cachedObject; + } + + private final Map cacheStats = new HashMap<>(); // only with assertions enabled + private boolean assertCreatedOnlyOnce(String field, boolean norms) { + assert Thread.holdsLock(this); + // this is mainly there to make sure we change anything in the way we merge we realize it early + Integer timesCached = cacheStats.compute(field + "N:" + norms, (s, i) -> i == null ? 1 : i.intValue() + 1); + if (timesCached > 1) { + assert norms == false :"[" + field + "] norms must not be cached twice"; + boolean isSortField = false; + for (SortField sf : metaData.getSort().getSort()) { + if (field.equals(sf.getField())) { + isSortField = true; + break; + } + } + assert timesCached == 2 : "[" + field + "] must not be cached more than twice but was cached: " + + timesCached + " times isSortField: " + isSortField; + assert isSortField : "only sort fields should be cached twice but [" + field + "] is not a sort field"; + } + return true; + } + + private T getOrCreateDV(String field, IOSupplier supplier) throws IOException { + return getOrCreate(field, false, supplier); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java index c1ac197f571..fb540481f3b 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java @@ -115,14 +115,18 @@ public class TestSortingCodecReader extends LuceneTestCase { for (int i = 0; i < numDocs; i++) { int docId = docIds.get(i); Document doc = new Document(); - doc.add(new NumericDocValuesField("foo", random().nextInt(20))); doc.add(new StringField("id", Integer.toString(docId), Field.Store.YES)); doc.add(new LongPoint("id", docId)); - doc.add(new TextField("text_field", RandomStrings.randomRealisticUnicodeOfLength(random(), 25), Field.Store.YES)); + String s = RandomStrings.randomRealisticUnicodeOfLength(random(), 25); + doc.add(new TextField("text_field", s, Field.Store.YES)); + doc.add(new BinaryDocValuesField("text_field", new BytesRef(s))); + doc.add(new TextField("another_text_field", s, Field.Store.YES)); + doc.add(new BinaryDocValuesField("another_text_field", new BytesRef(s))); doc.add(new SortedNumericDocValuesField("sorted_numeric_dv", docId)); doc.add(new SortedDocValuesField("binary_sorted_dv", new BytesRef(Integer.toString(docId)))); doc.add(new BinaryDocValuesField("binary_dv", new BytesRef(Integer.toString(docId)))); doc.add(new SortedSetDocValuesField("sorted_set_dv", new BytesRef(Integer.toString(docId)))); + doc.add(new NumericDocValuesField("foo", random().nextInt(20))); FieldType ft = new FieldType(StringField.TYPE_NOT_STORED); ft.setStoreTermVectors(true);