mirror of https://github.com/apache/lucene.git
Fully reuse postings enums when flushing sorted indexes. (#12206)
Currently we are only half-reusing postings enums when flushing sorted indexes: the wrapped enum is reused, but we still create new wrapper instances every time, which can be costly for fields that have many terms.
This commit is contained in:
parent d3b6ef3c86
commit 0782535017
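
Note: the fix follows the standard enum-reuse pattern. Instead of threading all state through a constructor and allocating a fresh wrapper per term, each wrapper gets a cheap no-arg constructor plus a reset(...) method that re-initializes it for the next term. A minimal sketch of the pattern, with hypothetical names rather than the actual Lucene classes:

interface PostingsLike {
  int nextDoc() throws java.io.IOException;
}

final class ReusableSortingWrapper implements PostingsLike {
  private PostingsLike in; // re-targeted on every reset instead of per-instance

  ReusableSortingWrapper() {} // allocated once per flush, not once per term

  // Re-initializes this wrapper to serve the next term's postings.
  void reset(PostingsLike in) {
    this.in = in;
  }

  @Override
  public int nextDoc() throws java.io.IOException {
    return in.nextDoc();
  }
}
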
FreqProxTermsWriter.java

@@ -188,7 +188,7 @@ final class FreqProxTermsWriter extends TermsHash {
        wrapReuse = (SortingPostingsEnum) reuse;
        inReuse = wrapReuse.getWrapped();
      } else {
-        wrapReuse = null;
+        wrapReuse = new SortingPostingsEnum();
        inReuse = reuse;
      }
 
@@ -201,8 +201,8 @@ final class FreqProxTermsWriter extends TermsHash {
          indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
      final boolean storeOffsets =
          indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
-      return new SortingPostingsEnum(
-          docMap.size(), wrapReuse, inDocsAndPositions, docMap, storePositions, storeOffsets);
+      wrapReuse.reset(docMap, inDocsAndPositions, storePositions, storeOffsets);
+      return wrapReuse;
    }
 
      final PostingsEnum inReuse;
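
Note: this reuse only pays off if callers hand the returned enum back in, which is Lucene's postings(reuse, flags) contract; that is why the code above unwraps a previously returned SortingPostingsEnum and passes the inner enum down as inReuse. A hedged sketch of the calling convention (assumes org.apache.lucene.index.TermsEnum/PostingsEnum and org.apache.lucene.search.DocIdSetIterator):

// The enum returned by postings(...) is passed back in on the next call,
// which is what lets the instanceof/unwrap logic above take effect.
static void consumeAllTerms(TermsEnum terms) throws IOException {
  PostingsEnum reuse = null;
  while (terms.next() != null) {
    reuse = terms.postings(reuse, PostingsEnum.POSITIONS);
    for (int doc = reuse.nextDoc();
        doc != DocIdSetIterator.NO_MORE_DOCS;
        doc = reuse.nextDoc()) {
      // consume postings for this term
    }
  }
}
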
@@ -213,33 +213,29 @@ final class FreqProxTermsWriter extends TermsHash {
        wrapReuse = (SortingDocsEnum) reuse;
        inReuse = wrapReuse.getWrapped();
      } else {
-        wrapReuse = null;
-        inReuse = null;
+        wrapReuse = new SortingDocsEnum();
+        inReuse = reuse;
      }
 
      final PostingsEnum inDocs = in.postings(inReuse, flags);
-      return new SortingDocsEnum(docMap.size(), wrapReuse, inDocs, docMap);
+      wrapReuse.reset(docMap, inDocs);
+      return wrapReuse;
    }
  }
 
  static class SortingDocsEnum extends PostingsEnum {
 
-    private final PostingsEnum in;
    private final LSBRadixSorter sorter;
-    private int[] docs;
-    private int docIt = -1;
-    private final int upTo;
+    private PostingsEnum in;
+    private int[] docs = IntsRef.EMPTY_INTS;
+    private int docIt;
+    private int upTo;
 
-    SortingDocsEnum(
-        int maxDoc, SortingDocsEnum reuse, final PostingsEnum in, final Sorter.DocMap docMap)
-        throws IOException {
-      if (reuse != null) {
-        sorter = reuse.sorter;
-        docs = reuse.docs;
-      } else {
-        sorter = new LSBRadixSorter();
-        docs = IntsRef.EMPTY_INTS;
-      }
+    SortingDocsEnum() {
+      sorter = new LSBRadixSorter();
+    }
+
+    void reset(Sorter.DocMap docMap, PostingsEnum in) throws IOException {
      this.in = in;
      int i = 0;
      for (int doc = in.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = in.nextDoc()) {
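
Note: reset(docMap, in) replaces the old constructor body: it drains the wrapped enum into a growable int[] of remapped doc IDs, sorts them, and rewinds the iterator, so one SortingDocsEnum can serve every term. A simplified standalone sketch of that buffer-then-sort approach (plain JDK; Arrays.sort stands in for Lucene's LSBRadixSorter):

import java.util.Arrays;

final class BufferedSortedDocs {
  interface DocSupplier { int nextDoc(); }     // returns -1 when exhausted
  interface DocMap { int oldToNew(int oldDoc); }

  private int[] docs = new int[0];
  private int upTo;
  private int docIt = -1;

  // Buffer remapped doc IDs, sort them, and rewind the iterator.
  void reset(DocMap docMap, DocSupplier in) {
    upTo = 0;
    for (int doc = in.nextDoc(); doc != -1; doc = in.nextDoc()) {
      if (upTo == docs.length) {
        docs = Arrays.copyOf(docs, Math.max(8, docs.length * 2));
      }
      docs[upTo++] = docMap.oldToNew(doc); // remap into the sorted index's doc space
    }
    Arrays.sort(docs, 0, upTo);
    docIt = -1; // rewind so the enum can be iterated from the start
  }

  int nextDoc() {
    return ++docIt < upTo ? docs[docIt] : -1;
  }
}
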
@@ -253,10 +249,12 @@ final class FreqProxTermsWriter extends TermsHash {
        docs = ArrayUtil.grow(docs);
      }
      docs[upTo] = DocIdSetIterator.NO_MORE_DOCS;
+      final int maxDoc = docMap.size();
      final int numBits = PackedInts.bitsRequired(Math.max(0, maxDoc - 1));
      // Even though LSBRadixSorter cannot take advantage of partial ordering like TimSorter it is
      // often still faster for nearly-sorted inputs.
      sorter.sort(numBits, docs, upTo);
+      docIt = -1;
    }
 
    PostingsEnum getWrapped() {
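
Note: LSBRadixSorter sorts non-negative ints with stable counting-sort passes over 8-bit digits, least significant first, and numBits bounds the number of passes; that is why the caller computes PackedInts.bitsRequired(maxDoc - 1) and sorts only the first upTo slots (the NO_MORE_DOCS sentinel would need 31 bits). A hedged minimal version of the idea, not Lucene's implementation (which also reuses its scratch buffer):

// Stable LSB radix sort of the non-negative ints in docs[0..len).
static void lsbRadixSort(int numBits, int[] docs, int len) {
  int[] buffer = new int[len];
  for (int shift = 0; shift < numBits; shift += 8) {
    int[] counts = new int[257];
    for (int i = 0; i < len; i++) {
      counts[((docs[i] >>> shift) & 0xFF) + 1]++; // histogram of this digit
    }
    for (int i = 1; i < counts.length; i++) {
      counts[i] += counts[i - 1]; // prefix sums -> bucket start offsets
    }
    for (int i = 0; i < len; i++) {
      buffer[counts[(docs[i] >>> shift) & 0xFF]++] = docs[i]; // stable scatter
    }
    System.arraycopy(buffer, 0, docs, 0, len);
  }
}
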
@@ -311,7 +309,7 @@ final class FreqProxTermsWriter extends TermsHash {
      }
    }
 
-  static class SortingPostingsEnum extends FilterLeafReader.FilterPostingsEnum {
+  static class SortingPostingsEnum extends PostingsEnum {
 
    /**
     * A {@link TimSorter} which sorts two parallel arrays of doc IDs and offsets in one go. Everyti
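
Note: the superclass change is what makes the reset() style possible here. A filter-style wrapper such as FilterLeafReader.FilterPostingsEnum fixes its delegate at construction time, while a resettable wrapper needs a mutable field it can re-target; hence the explicit in field and the cost() delegation added at the end of this class. A hypothetical sketch of the difference:

abstract class FixedDelegate {
  protected final Runnable in; // pinned for the wrapper's lifetime
  FixedDelegate(Runnable in) { this.in = in; }
}

final class ResettableDelegate {
  private Runnable in; // mutable: reset() can point at a new delegate
  void reset(Runnable in) { this.in = in; }
  void run() { in.run(); }
}
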
@@ -324,8 +322,8 @@ final class FreqProxTermsWriter extends TermsHash {
      private int[] tmpDocs;
      private long[] tmpOffsets;
 
-      public DocOffsetSorter(int maxDoc) {
-        super(maxDoc / 8);
+      public DocOffsetSorter(int numTempSlots) {
+        super(numTempSlots);
        this.tmpDocs = IntsRef.EMPTY_INTS;
        this.tmpOffsets = LongsRef.EMPTY_LONGS;
      }
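
Note: TimSorter's constructor takes the maximum number of temporary slots it may allocate for merging runs; passing numTempSlots in moves the maxDoc / 8 sizing decision to the caller, which is the only place that still knows maxDoc after this change. The sorter itself keeps two parallel arrays aligned, swapping an offset whenever it swaps a doc ID. A hedged sketch of that parallel-array idea, with insertion sort standing in for TimSorter:

// Sort docs[0..len) ascending while keeping offsets[i] paired with docs[i].
static void sortParallel(int[] docs, long[] offsets, int len) {
  for (int i = 1; i < len; i++) {
    for (int j = i; j > 0 && docs[j - 1] > docs[j]; j--) {
      int d = docs[j]; docs[j] = docs[j - 1]; docs[j - 1] = d;
      long o = offsets[j]; offsets[j] = offsets[j - 1]; offsets[j - 1] = o;
    }
  }
}
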
@@ -379,55 +377,38 @@ final class FreqProxTermsWriter extends TermsHash {
      }
    }
 
-    private final int maxDoc;
-    private final DocOffsetSorter sorter;
-    private int[] docs;
-    private long[] offsets;
-    private final int upto;
+    private DocOffsetSorter sorter;
+    private int[] docs = IntsRef.EMPTY_INTS;
+    private long[] offsets = LongsRef.EMPTY_LONGS;
+    private int upto;
 
-    private final ByteBuffersDataInput postingInput;
-    private final boolean storePositions, storeOffsets;
+    private ByteBuffersDataInput postingInput;
+    private PostingsEnum in;
+    private boolean storePositions, storeOffsets;
 
-    private int docIt = -1;
+    private int docIt;
    private int pos;
-    private int startOffset = -1;
-    private int endOffset = -1;
-    private final BytesRef payload;
+    private int startOffset;
+    private int endOffset;
+    private final BytesRef payload = new BytesRef();
    private int currFreq;
 
-    private final ByteBuffersDataOutput buffer;
+    private final ByteBuffersDataOutput buffer = ByteBuffersDataOutput.newResettableInstance();
 
-    SortingPostingsEnum(
-        int maxDoc,
-        SortingPostingsEnum reuse,
-        final PostingsEnum in,
-        Sorter.DocMap docMap,
-        boolean storePositions,
-        boolean storeOffsets)
+    void reset(Sorter.DocMap docMap, PostingsEnum in, boolean storePositions, boolean storeOffsets)
        throws IOException {
-      super(in);
-      this.maxDoc = maxDoc;
+      this.in = in;
      this.storePositions = storePositions;
      this.storeOffsets = storeOffsets;
-      if (reuse != null) {
-        docs = reuse.docs;
-        offsets = reuse.offsets;
-        payload = reuse.payload;
-        buffer = reuse.buffer;
-        buffer.reset();
-        if (reuse.maxDoc == maxDoc) {
-          sorter = reuse.sorter;
-        } else {
-          sorter = new DocOffsetSorter(maxDoc);
-        }
-      } else {
-        docs = new int[32];
-        offsets = new long[32];
-        payload = new BytesRef(32);
-        buffer = ByteBuffersDataOutput.newResettableInstance();
-        sorter = new DocOffsetSorter(maxDoc);
+      if (sorter == null) {
+        final int numTempSlots = docMap.size() / 8;
+        sorter = new DocOffsetSorter(numTempSlots);
      }
+      docIt = -1;
+      startOffset = -1;
+      endOffset = -1;
 
+      buffer.reset();
      int doc;
      int i = 0;
      while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
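
Note: SortingPostingsEnum's reset(...) is a buffer-and-replay pass: it drains the wrapped enum once, serializing each document's freq/positions/offsets/payloads into the reusable ByteBuffersDataOutput while collecting (newDocId, bufferOffset) pairs, sorts the pairs with DocOffsetSorter, and then replays each document's data from its recorded offset during iteration. A condensed, self-contained sketch of the idea using a plain ByteBuffer in place of Lucene's buffer classes:

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Comparator;

final class SortedReplaySketch {
  public static void main(String[] args) {
    int[][] positions = { {1, 4}, {2}, {0, 3, 9} }; // toy postings: old doc -> positions
    int[] oldToNew = {2, 0, 1};                     // the doc map (index sort permutation)

    ByteBuffer buffer = ByteBuffer.allocate(1024);
    int numDocs = positions.length;
    int[] docs = new int[numDocs];
    long[] offsets = new long[numDocs];
    for (int oldDoc = 0; oldDoc < numDocs; oldDoc++) {
      docs[oldDoc] = oldToNew[oldDoc];
      offsets[oldDoc] = buffer.position(); // remember where this doc's data starts
      buffer.putInt(positions[oldDoc].length);
      for (int p : positions[oldDoc]) buffer.putInt(p);
    }

    // Sort the parallel arrays by new doc ID (cf. DocOffsetSorter).
    Integer[] order = new Integer[numDocs];
    for (int i = 0; i < numDocs; i++) order[i] = i;
    Arrays.sort(order, Comparator.comparingInt(i -> docs[i]));

    for (int i : order) { // replay each doc's postings from its recorded offset
      buffer.position((int) offsets[i]);
      int freq = buffer.getInt();
      System.out.print("doc " + docs[i] + ":");
      for (int j = 0; j < freq; j++) System.out.print(" " + buffer.getInt());
      System.out.println();
    }
  }
}
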
@@ -548,5 +529,10 @@ final class FreqProxTermsWriter extends TermsHash {
    PostingsEnum getWrapped() {
      return in;
    }
+
+    @Override
+    public long cost() {
+      return in.cost();
+    }
  }
}

ByteBuffersDataInput.java

@@ -55,10 +55,10 @@ public final class ByteBuffersDataInput extends DataInput
  public ByteBuffersDataInput(List<ByteBuffer> buffers) {
    ensureAssumptions(buffers);
 
-    this.blocks =
-        buffers.stream()
-            .map(buf -> buf.asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN))
-            .toArray(ByteBuffer[]::new);
+    this.blocks = buffers.toArray(ByteBuffer[]::new);
+    for (int i = 0; i < blocks.length; ++i) {
+      blocks[i] = blocks[i].asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN);
+    }
    // pre-allocate these arrays and create the view buffers lazily
    this.floatBuffers = new FloatBuffer[blocks.length * Float.BYTES];
    this.longBuffers = new LongBuffer[blocks.length * Long.BYTES];
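
Note: both changes to this constructor swap a small Stream pipeline for a plain loop; ByteBuffersDataInput instances are created on hot paths such as the sorting flush above, and the loop avoids the Stream plumbing and lambda allocation. The two forms are behaviorally equivalent, shown side by side as a hedged micro-example:

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.List;

final class StreamVsLoop {
  static ByteBuffer[] viaStream(List<ByteBuffer> buffers) {
    return buffers.stream()
        .map(buf -> buf.asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN))
        .toArray(ByteBuffer[]::new);
  }

  static ByteBuffer[] viaLoop(List<ByteBuffer> buffers) {
    ByteBuffer[] blocks = buffers.toArray(ByteBuffer[]::new);
    for (int i = 0; i < blocks.length; ++i) {
      blocks[i] = blocks[i].asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN);
    }
    return blocks;
  }
}
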
@@ -71,7 +71,11 @@ public final class ByteBuffersDataInput extends DataInput
      this.blockMask = (1 << blockBits) - 1;
    }
 
-    this.size = Arrays.stream(blocks).mapToLong(block -> block.remaining()).sum();
+    long size = 0;
+    for (ByteBuffer block : blocks) {
+      size += block.remaining();
+    }
+    this.size = size;
 
    // The initial "position" of this stream is shifted by the position of the first block.
    this.offset = blocks[0].position();
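
Note: same motivation for the size computation: the Arrays.stream(blocks).mapToLong(...).sum() pipeline becomes a plain accumulation loop. An equivalent pair, for comparison:

import java.nio.ByteBuffer;
import java.util.Arrays;

final class SumRemaining {
  static long viaStream(ByteBuffer[] blocks) {
    return Arrays.stream(blocks).mapToLong(ByteBuffer::remaining).sum();
  }

  static long viaLoop(ByteBuffer[] blocks) {
    long size = 0;
    for (ByteBuffer block : blocks) {
      size += block.remaining();
    }
    return size;
  }
}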