LUCENE-7545: Dense norms/doc-values should not consume memory for the IW buffer.

This commit is contained in:
Adrien Grand 2016-11-10 14:04:15 +01:00
parent 2902727a15
commit c415bc8d1d
8 changed files with 178 additions and 54 deletions

View File

@ -24,11 +24,9 @@ import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
@ -48,7 +46,7 @@ class BinaryDocValuesWriter extends DocValuesWriter {
private final Counter iwBytesUsed;
private final PackedLongValues.Builder lengths;
private FixedBitSet docsWithField;
private DocsWithFieldSet docsWithField;
private final FieldInfo fieldInfo;
private long bytesUsed;
private int lastDocID = -1;
@ -60,7 +58,7 @@ class BinaryDocValuesWriter extends DocValuesWriter {
this.bytesOut = bytes.getDataOutput();
this.lengths = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
this.iwBytesUsed = iwBytesUsed;
this.docsWithField = new FixedBitSet(64);
this.docsWithField = new DocsWithFieldSet();
this.bytesUsed = lengths.ramBytesUsed() + docsWithField.ramBytesUsed();
iwBytesUsed.addAndGet(bytesUsed);
}
@ -84,8 +82,7 @@ class BinaryDocValuesWriter extends DocValuesWriter {
// Should never happen!
throw new RuntimeException(ioe);
}
docsWithField = FixedBitSet.ensureCapacity(docsWithField, docID);
docsWithField.set(docID);
docsWithField.add(docID);
updateBytesUsed();
lastDocID = docID;
@ -112,7 +109,7 @@ class BinaryDocValuesWriter extends DocValuesWriter {
if (fieldInfoIn != fieldInfo) {
throw new IllegalArgumentException("wrong fieldInfo");
}
return new BufferedBinaryDocValues(lengths, maxLength, bytes.getDataInput(), docsWithField);
return new BufferedBinaryDocValues(lengths, maxLength, bytes.getDataInput(), docsWithField.iterator());
}
});
}
@ -124,12 +121,12 @@ class BinaryDocValuesWriter extends DocValuesWriter {
final DocIdSetIterator docsWithField;
final DataInput bytesIterator;
BufferedBinaryDocValues(PackedLongValues lengths, int maxLength, DataInput bytesIterator, FixedBitSet docsWithFields) {
BufferedBinaryDocValues(PackedLongValues lengths, int maxLength, DataInput bytesIterator, DocIdSetIterator docsWithFields) {
this.value = new BytesRefBuilder();
this.value.grow(maxLength);
this.lengthsIterator = lengths.iterator();
this.bytesIterator = bytesIterator;
this.docsWithField = new BitSetIterator(docsWithFields, lengths.size());
this.docsWithField = docsWithFields;
}
@Override

View File

@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RamUsageEstimator;
/** Accumulator for documents that have a value for a field. This is optimized
* for the case that all documents have a value. */
final class DocsWithFieldSet extends DocIdSet {
private static long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(DocsWithFieldSet.class);
private FixedBitSet set;
private int cost = 0;
private int lastDocId = -1;
void add(int docID) {
if (docID <= lastDocId) {
throw new IllegalArgumentException("Out of order doc ids: last=" + lastDocId + ", next=" + docID);
}
if (set != null) {
set = FixedBitSet.ensureCapacity(set, docID);
set.set(docID);
} else if (docID != cost) {
// migrate to a sparse encoding using a bit set
set = new FixedBitSet(docID + 1);
set.set(0, cost);
set.set(docID);
}
lastDocId = docID;
cost++;
}
@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED + (set == null ? 0 : set.ramBytesUsed());
}
@Override
public DocIdSetIterator iterator() {
return set != null ? new BitSetIterator(set, cost) : DocIdSetIterator.all(cost);
}
}

View File

@ -22,9 +22,7 @@ import java.io.IOException;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
@ -32,7 +30,7 @@ import org.apache.lucene.util.packed.PackedLongValues;
* segment flushes. */
class NormValuesWriter {
private FixedBitSet docsWithField;
private DocsWithFieldSet docsWithField;
private PackedLongValues.Builder pending;
private final Counter iwBytesUsed;
private long bytesUsed;
@ -40,7 +38,7 @@ class NormValuesWriter {
private int lastDocID = -1;
public NormValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
docsWithField = new FixedBitSet(64);
docsWithField = new DocsWithFieldSet();
pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
bytesUsed = pending.ramBytesUsed() + docsWithField.ramBytesUsed();
this.fieldInfo = fieldInfo;
@ -54,8 +52,7 @@ class NormValuesWriter {
}
pending.add(value);
docsWithField = FixedBitSet.ensureCapacity(docsWithField, docID);
docsWithField.set(docID);
docsWithField.add(docID);
updateBytesUsed();
@ -82,7 +79,7 @@ class NormValuesWriter {
if (fieldInfo != NormValuesWriter.this.fieldInfo) {
throw new IllegalArgumentException("wrong fieldInfo");
}
return new BufferedNorms(values, docsWithField);
return new BufferedNorms(values, docsWithField.iterator());
}
@Override
@ -108,9 +105,9 @@ class NormValuesWriter {
final DocIdSetIterator docsWithField;
private long value;
BufferedNorms(PackedLongValues values, FixedBitSet docsWithFields) {
BufferedNorms(PackedLongValues values, DocIdSetIterator docsWithFields) {
this.iter = values.iterator();
this.docsWithField = new BitSetIterator(docsWithFields, values.size());
this.docsWithField = docsWithFields;
}
@Override

View File

@ -21,9 +21,7 @@ import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
@ -34,13 +32,13 @@ class NumericDocValuesWriter extends DocValuesWriter {
private PackedLongValues.Builder pending;
private final Counter iwBytesUsed;
private long bytesUsed;
private FixedBitSet docsWithField;
private DocsWithFieldSet docsWithField;
private final FieldInfo fieldInfo;
private int lastDocID = -1;
public NumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
docsWithField = new FixedBitSet(64);
docsWithField = new DocsWithFieldSet();
bytesUsed = pending.ramBytesUsed() + docsWithField.ramBytesUsed();
this.fieldInfo = fieldInfo;
this.iwBytesUsed = iwBytesUsed;
@ -53,8 +51,7 @@ class NumericDocValuesWriter extends DocValuesWriter {
}
pending.add(value);
docsWithField = FixedBitSet.ensureCapacity(docsWithField, docID);
docsWithField.set(docID);
docsWithField.add(docID);
updateBytesUsed();
@ -83,7 +80,7 @@ class NumericDocValuesWriter extends DocValuesWriter {
if (fieldInfo != NumericDocValuesWriter.this.fieldInfo) {
throw new IllegalArgumentException("wrong fieldInfo");
}
return new BufferedNumericDocValues(values, docsWithField);
return new BufferedNumericDocValues(values, docsWithField.iterator());
}
});
}
@ -94,9 +91,9 @@ class NumericDocValuesWriter extends DocValuesWriter {
final DocIdSetIterator docsWithField;
private long value;
BufferedNumericDocValues(PackedLongValues values, FixedBitSet docsWithFields) {
BufferedNumericDocValues(PackedLongValues values, DocIdSetIterator docsWithFields) {
this.iter = values.iterator();
this.docsWithField = new BitSetIterator(docsWithFields, values.size());
this.docsWithField = docsWithFields;
}
@Override

View File

@ -22,13 +22,11 @@ import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
@ -37,7 +35,7 @@ import org.apache.lucene.util.packed.PackedLongValues;
class SortedDocValuesWriter extends DocValuesWriter {
final BytesRefHash hash;
private PackedLongValues.Builder pending;
private FixedBitSet docsWithField;
private DocsWithFieldSet docsWithField;
private final Counter iwBytesUsed;
private long bytesUsed; // this currently only tracks differences in 'pending'
private final FieldInfo fieldInfo;
@ -52,7 +50,7 @@ class SortedDocValuesWriter extends DocValuesWriter {
BytesRefHash.DEFAULT_CAPACITY,
new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
docsWithField = new FixedBitSet(64);
docsWithField = new DocsWithFieldSet();
bytesUsed = pending.ramBytesUsed() + docsWithField.ramBytesUsed();
iwBytesUsed.addAndGet(bytesUsed);
}
@ -69,8 +67,7 @@ class SortedDocValuesWriter extends DocValuesWriter {
}
addOneValue(value);
docsWithField = FixedBitSet.ensureCapacity(docsWithField, docID);
docsWithField.set(docID);
docsWithField.add(docID);
lastDocID = docID;
}
@ -121,7 +118,7 @@ class SortedDocValuesWriter extends DocValuesWriter {
if (fieldInfoIn != fieldInfo) {
throw new IllegalArgumentException("wrong fieldInfo");
}
return new BufferedSortedDocValues(hash, valueCount, ords, sortedValues, ordMap, docsWithField);
return new BufferedSortedDocValues(hash, valueCount, ords, sortedValues, ordMap, docsWithField.iterator());
}
});
}
@ -136,13 +133,13 @@ class SortedDocValuesWriter extends DocValuesWriter {
final PackedLongValues.Iterator iter;
final DocIdSetIterator docsWithField;
public BufferedSortedDocValues(BytesRefHash hash, int valueCount, PackedLongValues docToOrd, int[] sortedValues, int[] ordMap, FixedBitSet docsWithField) {
public BufferedSortedDocValues(BytesRefHash hash, int valueCount, PackedLongValues docToOrd, int[] sortedValues, int[] ordMap, DocIdSetIterator docsWithField) {
this.hash = hash;
this.valueCount = valueCount;
this.sortedValues = sortedValues;
this.iter = docToOrd.iterator();
this.ordMap = ordMap;
this.docsWithField = new BitSetIterator(docsWithField, docToOrd.size());
this.docsWithField = docsWithField;
}
@Override

View File

@ -23,9 +23,7 @@ import java.util.Arrays;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
@ -34,7 +32,7 @@ import org.apache.lucene.util.packed.PackedLongValues;
class SortedNumericDocValuesWriter extends DocValuesWriter {
private PackedLongValues.Builder pending; // stream of all values
private PackedLongValues.Builder pendingCounts; // count of values per doc
private FixedBitSet docsWithField;
private DocsWithFieldSet docsWithField;
private final Counter iwBytesUsed;
private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
private final FieldInfo fieldInfo;
@ -47,7 +45,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter {
this.iwBytesUsed = iwBytesUsed;
pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
docsWithField = new FixedBitSet(64);
docsWithField = new DocsWithFieldSet();
bytesUsed = pending.ramBytesUsed() + pendingCounts.ramBytesUsed() + docsWithField.ramBytesUsed() + RamUsageEstimator.sizeOf(currentValues);
iwBytesUsed.addAndGet(bytesUsed);
}
@ -76,8 +74,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter {
pendingCounts.add(currentUpto);
currentUpto = 0;
docsWithField = FixedBitSet.ensureCapacity(docsWithField, currentDoc);
docsWithField.set(currentDoc);
docsWithField.add(currentDoc);
}
@Override
@ -112,7 +109,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter {
if (fieldInfoIn != fieldInfo) {
throw new IllegalArgumentException("wrong fieldInfo");
}
return new BufferedSortedNumericDocValues(values, valueCounts, docsWithField);
return new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator());
}
});
}
@ -124,10 +121,10 @@ class SortedNumericDocValuesWriter extends DocValuesWriter {
private int valueCount;
private int valueUpto;
public BufferedSortedNumericDocValues(PackedLongValues values, PackedLongValues valueCounts, FixedBitSet docsWithField) {
public BufferedSortedNumericDocValues(PackedLongValues values, PackedLongValues valueCounts, DocIdSetIterator docsWithField) {
valuesIter = values.iterator();
valueCountsIter = valueCounts.iterator();
this.docsWithField = new BitSetIterator(docsWithField, values.size());
this.docsWithField = docsWithField;
}
@Override

View File

@ -24,13 +24,11 @@ import java.util.Arrays;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
@ -40,7 +38,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
final BytesRefHash hash;
private PackedLongValues.Builder pending; // stream of all termIDs
private PackedLongValues.Builder pendingCounts; // termIDs per doc
private FixedBitSet docsWithField;
private DocsWithFieldSet docsWithField;
private final Counter iwBytesUsed;
private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
private final FieldInfo fieldInfo;
@ -59,7 +57,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
pending = PackedLongValues.packedBuilder(PackedInts.COMPACT);
pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
docsWithField = new FixedBitSet(64);
docsWithField = new DocsWithFieldSet();
bytesUsed = pending.ramBytesUsed() + pendingCounts.ramBytesUsed();
iwBytesUsed.addAndGet(bytesUsed);
}
@ -103,8 +101,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
pendingCounts.add(count);
maxCount = Math.max(maxCount, count);
currentUpto = 0;
docsWithField = FixedBitSet.ensureCapacity(docsWithField, currentDoc);
docsWithField.set(currentDoc);
docsWithField.add(currentDoc);
}
@Override
@ -158,7 +155,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
if (fieldInfoIn != fieldInfo) {
throw new IllegalArgumentException("wrong fieldInfo");
}
return new BufferedSortedSetDocValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField);
return new BufferedSortedSetDocValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator());
}
});
}
@ -176,14 +173,14 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
private int ordCount;
private int ordUpto;
public BufferedSortedSetDocValues(int[] sortedValues, int[] ordMap, BytesRefHash hash, PackedLongValues ords, PackedLongValues ordCounts, int maxCount, FixedBitSet docsWithField) {
public BufferedSortedSetDocValues(int[] sortedValues, int[] ordMap, BytesRefHash hash, PackedLongValues ords, PackedLongValues ordCounts, int maxCount, DocIdSetIterator docsWithField) {
this.currentDoc = new int[maxCount];
this.sortedValues = sortedValues;
this.ordMap = ordMap;
this.hash = hash;
this.ordsIter = ords.iterator();
this.ordCountsIter = ordCounts.iterator();
this.docsWithField = new BitSetIterator(docsWithField, ordCounts.size());
this.docsWithField = docsWithField;
}
@Override

View File

@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestDocsWithFieldSet extends LuceneTestCase {
public void testDense() throws IOException {
DocsWithFieldSet set = new DocsWithFieldSet();
DocIdSetIterator it = set.iterator();
assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc());
set.add(0);
it = set.iterator();
assertEquals(0, it.nextDoc());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc());
long ramBytesUsed = set.ramBytesUsed();
for (int i = 1; i < 1000; ++i) {
set.add(i);
}
assertEquals(ramBytesUsed, set.ramBytesUsed());
it = set.iterator();
for (int i = 0; i < 1000; ++i) {
assertEquals(i, it.nextDoc());
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc());
}
public void testSparse() throws IOException {
DocsWithFieldSet set = new DocsWithFieldSet();
int doc = random().nextInt(10000);
set.add(doc);
DocIdSetIterator it = set.iterator();
assertEquals(doc, it.nextDoc());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc());
int doc2 = doc + TestUtil.nextInt(random(), 1, 100);
set.add(doc2);
it = set.iterator();
assertEquals(doc, it.nextDoc());
assertEquals(doc2, it.nextDoc());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc());
}
public void testDenseThenSparse() throws IOException {
int denseCount = random().nextInt(10000);
int nextDoc = denseCount + random().nextInt(10000);
DocsWithFieldSet set = new DocsWithFieldSet();
for (int i = 0; i < denseCount; ++i) {
set.add(i);
}
set.add(nextDoc);
DocIdSetIterator it = set.iterator();
for (int i = 0; i < denseCount; ++i) {
assertEquals(i, it.nextDoc());
}
assertEquals(nextDoc, it.nextDoc());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc());
}
}