mirror of https://github.com/apache/lucene.git
LUCENE-9539: Remove caches from SortingCodecReader (#1909)
SortingCodecReader keeps all docvalues in memory that are loaded from this reader. Yet, this reader should only be used for merging which happens sequentially. This makes caching docvalues unnecessary. Co-authored-by: Jim Ferenczi <jim.ferenczi@elastic.co>
This commit is contained in:
parent
fd0c08615d
commit
17c285d617
|
@ -274,6 +274,13 @@ Documentation
|
|||
|
||||
* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Haoyu Zhai)
|
||||
|
||||
Changes in Runtime Behavior
|
||||
---------------------
|
||||
|
||||
* LUCENE-9539: SortingCodecReader now doesn't cache doc values fields anymore. Previously, SortingCodecReader
|
||||
used to cache all doc values fields after they were loaded into memory. This reader should only be used
|
||||
to sort segments after the fact using IndexWriter#addIndices. (Simon Willnauer)
|
||||
|
||||
|
||||
Other
|
||||
---------------------
|
||||
|
|
|
@ -113,9 +113,9 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
|
|||
if (finalLengths == null) {
|
||||
finalLengths = this.lengths.build();
|
||||
}
|
||||
final CachedBinaryDVs sorted;
|
||||
final BinaryDVs sorted;
|
||||
if (sortMap != null) {
|
||||
sorted = new CachedBinaryDVs(state.segmentInfo.maxDoc(), sortMap,
|
||||
sorted = new BinaryDVs(state.segmentInfo.maxDoc(), sortMap,
|
||||
new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator()));
|
||||
} else {
|
||||
sorted = null;
|
||||
|
@ -189,11 +189,11 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
|
|||
}
|
||||
|
||||
static class SortingBinaryDocValues extends BinaryDocValues {
|
||||
private final CachedBinaryDVs dvs;
|
||||
private final BinaryDVs dvs;
|
||||
private final BytesRefBuilder spare = new BytesRefBuilder();
|
||||
private int docID = -1;
|
||||
|
||||
SortingBinaryDocValues(CachedBinaryDVs dvs) {
|
||||
SortingBinaryDocValues(BinaryDVs dvs) {
|
||||
this.dvs = dvs;
|
||||
}
|
||||
|
||||
|
@ -235,10 +235,10 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
|
|||
}
|
||||
}
|
||||
|
||||
static final class CachedBinaryDVs {
|
||||
static final class BinaryDVs {
|
||||
final int[] offsets;
|
||||
final BytesRefArray values;
|
||||
CachedBinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
|
||||
BinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
|
||||
offsets = new int[maxDoc];
|
||||
values = new BytesRefArray(Counter.newCounter());
|
||||
int offset = 1; // 0 means no values for this document
|
||||
|
|
|
@ -70,7 +70,7 @@ class NormValuesWriter {
|
|||
|
||||
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, NormsConsumer normsConsumer) throws IOException {
|
||||
final PackedLongValues values = pending.build();
|
||||
final NumericDocValuesWriter.CachedNumericDVs sorted;
|
||||
final NumericDocValuesWriter.NumericDVs sorted;
|
||||
if (sortMap != null) {
|
||||
sorted = NumericDocValuesWriter.sortDocValues(state.segmentInfo.maxDoc(), sortMap,
|
||||
new BufferedNorms(values, docsWithField.iterator()));
|
||||
|
|
|
@ -77,7 +77,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
|
|||
return new BufferedNumericDocValues(finalValues, docsWithField.iterator());
|
||||
}
|
||||
|
||||
static CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
|
||||
static NumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
|
||||
FixedBitSet docsWithField = new FixedBitSet(maxDoc);
|
||||
long[] values = new long[maxDoc];
|
||||
while (true) {
|
||||
|
@ -89,7 +89,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
|
|||
docsWithField.set(newDocID);
|
||||
values[newDocID] = oldDocValues.longValue();
|
||||
}
|
||||
return new CachedNumericDVs(values, docsWithField);
|
||||
return new NumericDVs(values, docsWithField);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -97,7 +97,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
|
|||
if (finalValues == null) {
|
||||
finalValues = pending.build();
|
||||
}
|
||||
final CachedNumericDVs sorted;
|
||||
final NumericDVs sorted;
|
||||
if (sortMap != null) {
|
||||
NumericDocValues oldValues = new BufferedNumericDocValues(finalValues, docsWithField.iterator());
|
||||
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, oldValues);
|
||||
|
@ -169,11 +169,11 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
|
|||
|
||||
static class SortingNumericDocValues extends NumericDocValues {
|
||||
|
||||
private final CachedNumericDVs dvs;
|
||||
private final NumericDVs dvs;
|
||||
private int docID = -1;
|
||||
private long cost = -1;
|
||||
|
||||
SortingNumericDocValues(CachedNumericDVs dvs) {
|
||||
SortingNumericDocValues(NumericDVs dvs) {
|
||||
this.dvs = dvs;
|
||||
}
|
||||
|
||||
|
@ -218,11 +218,11 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
|
|||
}
|
||||
}
|
||||
|
||||
static class CachedNumericDVs {
|
||||
static class NumericDVs {
|
||||
private final long[] values;
|
||||
private final BitSet docsWithField;
|
||||
|
||||
CachedNumericDVs(long[] values, BitSet docsWithField) {
|
||||
NumericDVs(long[] values, BitSet docsWithField) {
|
||||
this.values = values;
|
||||
this.docsWithField = docsWithField;
|
||||
}
|
||||
|
|
|
@ -30,8 +30,10 @@ import org.apache.lucene.codecs.PointsReader;
|
|||
import org.apache.lucene.codecs.StoredFieldsReader;
|
||||
import org.apache.lucene.codecs.TermVectorsReader;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.IOSupplier;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
||||
|
@ -41,21 +43,13 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
|||
* {@link Sort}. This can be used to re-sort and index after it's been created by wrapping all
|
||||
* readers of the index with this reader and adding it to a fresh IndexWriter via
|
||||
* {@link IndexWriter#addIndexes(CodecReader...)}.
|
||||
* NOTE: This reader should only be used for merging. Pulling fields from this reader might be very costly and memory
|
||||
* intensive.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class SortingCodecReader extends FilterCodecReader {
|
||||
|
||||
private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNumericDVs = new HashMap<>();
|
||||
|
||||
private final Map<String, BinaryDocValuesWriter.CachedBinaryDVs> cachedBinaryDVs = new HashMap<>();
|
||||
|
||||
private final Map<String, int[]> cachedSortedDVs = new HashMap<>();
|
||||
|
||||
private final Map<String, SortedSetDocValuesWriter.DocOrds> cachedSortedSetDVs = new HashMap<>();
|
||||
|
||||
private final Map<String, SortedNumericDocValuesWriter.LongValues> cachedSortedNumericDVs = new HashMap<>();
|
||||
|
||||
private static class SortingBits implements Bits {
|
||||
|
||||
private final Bits in;
|
||||
|
@ -148,10 +142,6 @@ public final class SortingCodecReader extends FilterCodecReader {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/** Return a sorted view of <code>reader</code> according to the order
|
||||
* defined by <code>sort</code>. If the reader is already sorted, this
|
||||
* method might return the reader as-is. */
|
||||
|
@ -313,15 +303,13 @@ public final class SortingCodecReader extends FilterCodecReader {
|
|||
};
|
||||
}
|
||||
|
||||
private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms = new HashMap<>();
|
||||
|
||||
@Override
|
||||
public NormsProducer getNormsReader() {
|
||||
final NormsProducer delegate = in.getNormsReader();
|
||||
return new NormsProducer() {
|
||||
@Override
|
||||
public NumericDocValues getNorms(FieldInfo field) throws IOException {
|
||||
return produceNumericDocValues(field, delegate.getNorms(field), cachedNorms);
|
||||
return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateNorms(field.name, () -> getNumericDocValues(delegate.getNorms(field))));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -341,101 +329,48 @@ public final class SortingCodecReader extends FilterCodecReader {
|
|||
};
|
||||
}
|
||||
|
||||
private NumericDocValues produceNumericDocValues(FieldInfo field, NumericDocValues oldNorms,
|
||||
Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms) throws IOException {
|
||||
NumericDocValuesWriter.CachedNumericDVs norms;
|
||||
synchronized (cachedNorms) {
|
||||
norms = cachedNorms.get(field);
|
||||
if (norms == null) {
|
||||
FixedBitSet docsWithField = new FixedBitSet(maxDoc());
|
||||
long[] values = new long[maxDoc()];
|
||||
while (true) {
|
||||
int docID = oldNorms.nextDoc();
|
||||
if (docID == NO_MORE_DOCS) {
|
||||
break;
|
||||
}
|
||||
int newDocID = docMap.oldToNew(docID);
|
||||
docsWithField.set(newDocID);
|
||||
values[newDocID] = oldNorms.longValue();
|
||||
}
|
||||
norms = new NumericDocValuesWriter.CachedNumericDVs(values, docsWithField);
|
||||
cachedNorms.put(field.name, norms);
|
||||
}
|
||||
}
|
||||
return new NumericDocValuesWriter.SortingNumericDocValues(norms);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocValuesProducer getDocValuesReader() {
|
||||
final DocValuesProducer delegate = in.getDocValuesReader();
|
||||
return new DocValuesProducer() {
|
||||
@Override
|
||||
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
|
||||
return produceNumericDocValues(field,delegate.getNumeric(field), cachedNumericDVs);
|
||||
return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateDV(field.name, () -> getNumericDocValues(delegate.getNumeric(field))));
|
||||
}
|
||||
|
||||
@Override
|
||||
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
|
||||
final BinaryDocValues oldDocValues = delegate.getBinary(field);
|
||||
BinaryDocValuesWriter.CachedBinaryDVs dvs;
|
||||
synchronized (cachedBinaryDVs) {
|
||||
dvs = cachedBinaryDVs.get(field);
|
||||
if (dvs == null) {
|
||||
dvs = new BinaryDocValuesWriter.CachedBinaryDVs(maxDoc(), docMap, oldDocValues);
|
||||
cachedBinaryDVs.put(field.name, dvs);
|
||||
}
|
||||
}
|
||||
return new BinaryDocValuesWriter.SortingBinaryDocValues(dvs);
|
||||
return new BinaryDocValuesWriter.SortingBinaryDocValues(getOrCreateDV(field.name,
|
||||
() -> new BinaryDocValuesWriter.BinaryDVs(maxDoc(), docMap, delegate.getBinary(field))));
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedDocValues getSorted(FieldInfo field) throws IOException {
|
||||
SortedDocValues oldDocValues = delegate.getSorted(field);
|
||||
int[] ords;
|
||||
synchronized (cachedSortedDVs) {
|
||||
ords = cachedSortedDVs.get(field);
|
||||
if (ords == null) {
|
||||
ords = new int[maxDoc()];
|
||||
Arrays.fill(ords, -1);
|
||||
int docID;
|
||||
while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) {
|
||||
int newDocID = docMap.oldToNew(docID);
|
||||
ords[newDocID] = oldDocValues.ordValue();
|
||||
}
|
||||
cachedSortedDVs.put(field.name, ords);
|
||||
return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, getOrCreateDV(field.name, () -> {
|
||||
int[] ords = new int[maxDoc()];
|
||||
Arrays.fill(ords, -1);
|
||||
int docID;
|
||||
while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) {
|
||||
int newDocID = docMap.oldToNew(docID);
|
||||
ords[newDocID] = oldDocValues.ordValue();
|
||||
}
|
||||
}
|
||||
|
||||
return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, ords);
|
||||
return ords;
|
||||
}));
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
|
||||
final SortedNumericDocValues oldDocValues = delegate.getSortedNumeric(field);
|
||||
SortedNumericDocValuesWriter.LongValues values;
|
||||
synchronized (cachedSortedNumericDVs) {
|
||||
values = cachedSortedNumericDVs.get(field);
|
||||
if (values == null) {
|
||||
values = new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST);
|
||||
cachedSortedNumericDVs.put(field.name, values);
|
||||
}
|
||||
}
|
||||
|
||||
return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, values);
|
||||
return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, getOrCreateDV(field.name, () ->
|
||||
new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
|
||||
SortedSetDocValues oldDocValues = delegate.getSortedSet(field);
|
||||
SortedSetDocValuesWriter.DocOrds ords;
|
||||
synchronized (cachedSortedSetDVs) {
|
||||
ords = cachedSortedSetDVs.get(field);
|
||||
if (ords == null) {
|
||||
ords = new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FASTEST);
|
||||
cachedSortedSetDVs.put(field.name, ords);
|
||||
}
|
||||
}
|
||||
return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, ords);
|
||||
return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, getOrCreateDV(field.name, () ->
|
||||
new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FAST)));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -455,6 +390,18 @@ public final class SortingCodecReader extends FilterCodecReader {
|
|||
};
|
||||
}
|
||||
|
||||
private NumericDocValuesWriter.NumericDVs getNumericDocValues(NumericDocValues oldNumerics) throws IOException {
|
||||
FixedBitSet docsWithField = new FixedBitSet(maxDoc());
|
||||
long[] values = new long[maxDoc()];
|
||||
int docID;
|
||||
while ((docID = oldNumerics.nextDoc()) != NO_MORE_DOCS) {
|
||||
int newDocID = docMap.oldToNew(docID);
|
||||
docsWithField.set(newDocID);
|
||||
values[newDocID] = oldNumerics.longValue();
|
||||
}
|
||||
return new NumericDocValuesWriter.NumericDVs(values, docsWithField);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermVectorsReader getTermVectorsReader() {
|
||||
return newTermVectorsReader(in.getTermVectorsReader());
|
||||
|
@ -510,4 +457,51 @@ public final class SortingCodecReader extends FilterCodecReader {
|
|||
return metaData;
|
||||
}
|
||||
|
||||
// we try to cache the last used DV or Norms instance since during merge
|
||||
// this instance is used more than once. We could in addition to this single instance
|
||||
// also cache the fields that are used for sorting since we do the work twice for these fields
|
||||
private String cachedField;
|
||||
private Object cachedObject;
|
||||
private boolean cacheIsNorms;
|
||||
|
||||
private <T> T getOrCreateNorms(String field, IOSupplier<T> supplier) throws IOException {
|
||||
return getOrCreate(field, true, supplier);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private synchronized <T> T getOrCreate(String field, boolean norms, IOSupplier<T> supplier) throws IOException {
|
||||
if ((field.equals(cachedField) && cacheIsNorms == norms) == false) {
|
||||
assert assertCreatedOnlyOnce(field, norms);
|
||||
cachedObject = supplier.get();
|
||||
cachedField = field;
|
||||
cacheIsNorms = norms;
|
||||
}
|
||||
assert cachedObject != null;
|
||||
return (T) cachedObject;
|
||||
}
|
||||
|
||||
private final Map<String, Integer> cacheStats = new HashMap<>(); // only with assertions enabled
|
||||
private boolean assertCreatedOnlyOnce(String field, boolean norms) {
|
||||
assert Thread.holdsLock(this);
|
||||
// this is mainly there to make sure we change anything in the way we merge we realize it early
|
||||
Integer timesCached = cacheStats.compute(field + "N:" + norms, (s, i) -> i == null ? 1 : i.intValue() + 1);
|
||||
if (timesCached > 1) {
|
||||
assert norms == false :"[" + field + "] norms must not be cached twice";
|
||||
boolean isSortField = false;
|
||||
for (SortField sf : metaData.getSort().getSort()) {
|
||||
if (field.equals(sf.getField())) {
|
||||
isSortField = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert timesCached == 2 : "[" + field + "] must not be cached more than twice but was cached: "
|
||||
+ timesCached + " times isSortField: " + isSortField;
|
||||
assert isSortField : "only sort fields should be cached twice but [" + field + "] is not a sort field";
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private <T> T getOrCreateDV(String field, IOSupplier<T> supplier) throws IOException {
|
||||
return getOrCreate(field, false, supplier);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -115,14 +115,18 @@ public class TestSortingCodecReader extends LuceneTestCase {
|
|||
for (int i = 0; i < numDocs; i++) {
|
||||
int docId = docIds.get(i);
|
||||
Document doc = new Document();
|
||||
doc.add(new NumericDocValuesField("foo", random().nextInt(20)));
|
||||
doc.add(new StringField("id", Integer.toString(docId), Field.Store.YES));
|
||||
doc.add(new LongPoint("id", docId));
|
||||
doc.add(new TextField("text_field", RandomStrings.randomRealisticUnicodeOfLength(random(), 25), Field.Store.YES));
|
||||
String s = RandomStrings.randomRealisticUnicodeOfLength(random(), 25);
|
||||
doc.add(new TextField("text_field", s, Field.Store.YES));
|
||||
doc.add(new BinaryDocValuesField("text_field", new BytesRef(s)));
|
||||
doc.add(new TextField("another_text_field", s, Field.Store.YES));
|
||||
doc.add(new BinaryDocValuesField("another_text_field", new BytesRef(s)));
|
||||
doc.add(new SortedNumericDocValuesField("sorted_numeric_dv", docId));
|
||||
doc.add(new SortedDocValuesField("binary_sorted_dv", new BytesRef(Integer.toString(docId))));
|
||||
doc.add(new BinaryDocValuesField("binary_dv", new BytesRef(Integer.toString(docId))));
|
||||
doc.add(new SortedSetDocValuesField("sorted_set_dv", new BytesRef(Integer.toString(docId))));
|
||||
doc.add(new NumericDocValuesField("foo", random().nextInt(20)));
|
||||
|
||||
FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
|
||||
ft.setStoreTermVectors(true);
|
||||
|
|
Loading…
Reference in New Issue