diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 2ef7d62fc7f..ef90961e77f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -106,6 +106,9 @@ Bug Fixes

Optimizations

+* LUCENE-4512: Additional memory savings for CompressingStoredFieldsIndex.MEMORY_CHUNK
+  (Adrien Grand, Robert Muir)
+
* LUCENE-4443: Lucene41PostingsFormat no longer writes unnecessary offsets
  into the skipdata. (Robert Muir)

diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java
index b68bfd99779..c8aa995f8db 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java
@@ -95,9 +95,7 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
* @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(CompressionMode, int, CompressingStoredFieldsIndex)
*/
public CompressingStoredFieldsFormat(CompressionMode compressionMode, int chunkSize) {
- this (compressionMode, chunkSize, chunkSize == 1
- ? CompressingStoredFieldsIndex.MEMORY_DOC
- : CompressingStoredFieldsIndex.MEMORY_CHUNK);
+ this (compressionMode, chunkSize, CompressingStoredFieldsIndex.MEMORY_CHUNK);
}

/**
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndex.java b/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndex.java
index b7be254afe5..9f037ab5313 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndex.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndex.java
@@ -19,13 +19,14 @@ package org.apache.lucene.codecs.compressing;

import java.io.Closeable;
import java.io.IOException;
+import java.util.Arrays;

import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.packed.GrowableWriter;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;

/**
@@ -42,7 +43,7 @@ public enum CompressingStoredFieldsIndex {
* the start offsets of chunks in the fields data file.
* <p>
* This format has no memory overhead and requires at most 1 disk seek to
- * locate a document in the fields data file. Use this format in
+ * locate a document in the fields data file. Use this fields index in
* memory-constrained environments.
*/
DISK_DOC(0) {
@Override
Writer newWriter(IndexOutput out) throws IOException {
return new DiskDocFieldsIndexWriter(out);
}
@Override
Reader newReader(IndexInput in, SegmentInfo si) throws IOException {
return new DiskDocFieldsIndexReader(in, si);
}
},

/**
- * For every document in the segment, this format stores the offset of the
- * compressed chunk that contains it in the fields data file.
- * <p>
- * This fields index format requires at most <code>8 * numDocs</code> bytes
- * of memory. Locating a document in the fields data file requires no disk
- * seek. Use this format when blocks are very likely to contain few
- * documents (in particular when <code>chunkSize = 1</code>).
- */
- MEMORY_DOC(1) {
- @Override
- Writer newWriter(IndexOutput out) throws IOException {
- return new ChunksFieldsIndexWriter(out);
- }
- @Override
- Reader newReader(IndexInput in, SegmentInfo si) throws IOException {
- return new MemoryDocFieldsIndexReader(in, si);
- }
- },
-
- /**
- * For every chunk of compressed documents, this format stores the first doc
+ * For every chunk of compressed documents, this index stores the first doc
* ID of the chunk as well as the start offset of the chunk.
* <p>
- * This fields index format require at most
- * <code>12 * numChunks</code> bytes of memory. Locating a document in the
- * fields data file requires no disk seek. Use this format when chunks are
- * likely to contain several documents.
+ * This fields index uses a very compact in-memory representation (up to
+ * <code>12 * numChunks</code> bytes, but likely much less) and requires no
+ * disk seek to locate a document in the fields data file. Unless you are
+ * working with very little memory, you should use this instance.
*/
- MEMORY_CHUNK(2) {
+ MEMORY_CHUNK(1) {
@Override
Writer newWriter(IndexOutput out) throws IOException {
- return new ChunksFieldsIndexWriter(out);
+ return new MemoryChunkFieldsIndexWriter(out);
}
@Override
Reader newReader(IndexInput in, SegmentInfo si) throws IOException {
@@ -176,45 +157,139 @@ public enum CompressingStoredFieldsIndex {
}
- private static class ChunksFieldsIndexWriter extends Writer {
+ private static class MemoryChunkFieldsIndexWriter extends Writer {
- int numChunks;
+ static final int BLOCK_SIZE = 1024; // number of chunks to serialize at once
+
+ static long moveSignToLowOrderBit(long n) {
+ return (n >> 63) ^ (n << 1);
+ }
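
The shift-and-xor above is a zig-zag encoding: it moves the sign bit into the low-order bit so that small deltas of either sign become small unsigned values, which PackedInts can then store in very few bits. The reader further down in this patch reverses it with moveLowOrderBitToSign. A minimal round-trip sketch; the two method bodies are copied from this patch, while the demo class around them is illustrative only:

    public class ZigZagDemo {
      // Sign bit moves to bit 0: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
      static long moveSignToLowOrderBit(long n) {
        return (n >> 63) ^ (n << 1);
      }

      // Inverse transform, as in MemoryChunkFieldsIndexReader.moveLowOrderBitToSign.
      static long moveLowOrderBitToSign(long n) {
        return (n >>> 1) ^ -(n & 1);
      }

      public static void main(String[] args) {
        for (long delta : new long[] {0, 1, -1, 2, -2, 123, -123}) {
          long encoded = moveSignToLowOrderBit(delta);
          System.out.println(delta + " -> " + encoded);
          assert moveLowOrderBitToSign(encoded) == delta; // run with -ea
        }
      }
    }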
+
+ int totalDocs;
+ int blockDocs;
+ int blockChunks;
+ long firstStartPointer;
long maxStartPointer;
- GrowableWriter docBaseDeltas;
- GrowableWriter startPointerDeltas;
+ final int[] docBaseDeltas;
+ final long[] startPointerDeltas;
- ChunksFieldsIndexWriter(IndexOutput indexOutput) {
+ MemoryChunkFieldsIndexWriter(IndexOutput indexOutput) throws IOException {
super(indexOutput);
- numChunks = 0;
- maxStartPointer = 0;
- docBaseDeltas = new GrowableWriter(2, 128, PackedInts.COMPACT);
- startPointerDeltas = new GrowableWriter(5, 128, PackedInts.COMPACT);
+ reset();
+ totalDocs = 0;
+ docBaseDeltas = new int[BLOCK_SIZE];
+ startPointerDeltas = new long[BLOCK_SIZE];
+ fieldsIndexOut.writeVInt(PackedInts.VERSION_CURRENT);
+ }
+
+ private void reset() {
+ blockChunks = 0;
+ blockDocs = 0;
+ firstStartPointer = -1; // means unset
+ }
+
+ private void writeBlock() throws IOException {
+ assert blockChunks > 0;
+ fieldsIndexOut.writeVInt(blockChunks);
+
+ // The trick here is that we only store the difference from the average start
+ // pointer or doc base, which helps save bits per value.
+ // And in order to prevent a few chunks that are far from the average from
+ // raising the number of bits per value for all of them, we only encode
+ // blocks of 1024 chunks at once.
+ // See LUCENE-4512. (A worked example follows this method.)
+
+ // doc bases
+ final int avgChunkDocs;
+ if (blockChunks == 1) {
+ avgChunkDocs = 0;
+ } else {
+ avgChunkDocs = Math.round((float) (blockDocs - docBaseDeltas[blockChunks - 1]) / (blockChunks - 1));
+ }
+ fieldsIndexOut.writeVInt(totalDocs - blockDocs); // docBase
+ fieldsIndexOut.writeVInt(avgChunkDocs);
+ int docBase = 0;
+ long maxDelta = 0;
+ for (int i = 0; i < blockChunks; ++i) {
+ final int delta = docBase - avgChunkDocs * i;
+ maxDelta |= moveSignToLowOrderBit(delta);
+ docBase += docBaseDeltas[i];
+ }
+
+ final int bitsPerDocBase = PackedInts.bitsRequired(maxDelta);
+ fieldsIndexOut.writeVInt(bitsPerDocBase);
+ PackedInts.Writer writer = PackedInts.getWriterNoHeader(fieldsIndexOut,
+ PackedInts.Format.PACKED, blockChunks, bitsPerDocBase, 1);
+ docBase = 0;
+ for (int i = 0; i < blockChunks; ++i) {
+ final long delta = docBase - avgChunkDocs * i;
+ assert PackedInts.bitsRequired(moveSignToLowOrderBit(delta)) <= writer.bitsPerValue();
+ writer.add(moveSignToLowOrderBit(delta));
+ docBase += docBaseDeltas[i];
+ }
+ writer.finish();
+
+ // start pointers
+ fieldsIndexOut.writeVLong(firstStartPointer);
+ final long avgChunkSize;
+ if (blockChunks == 1) {
+ avgChunkSize = 0;
+ } else {
+ avgChunkSize = (maxStartPointer - firstStartPointer) / (blockChunks - 1);
+ }
+ fieldsIndexOut.writeVLong(avgChunkSize);
+ long startPointer = 0;
+ maxDelta = 0;
+ for (int i = 0; i < blockChunks; ++i) {
+ startPointer += startPointerDeltas[i];
+ final long delta = startPointer - avgChunkSize * i;
+ maxDelta |= moveSignToLowOrderBit(delta);
+ }
+
+ final int bitsPerStartPointer = PackedInts.bitsRequired(maxDelta);
+ fieldsIndexOut.writeVInt(bitsPerStartPointer);
+ writer = PackedInts.getWriterNoHeader(fieldsIndexOut, PackedInts.Format.PACKED,
+ blockChunks, bitsPerStartPointer, 1);
+ startPointer = 0;
+ for (int i = 0; i < blockChunks; ++i) {
+ startPointer += startPointerDeltas[i];
+ final long delta = startPointer - avgChunkSize * i;
+ assert PackedInts.bitsRequired(moveSignToLowOrderBit(delta)) <= writer.bitsPerValue();
+ writer.add(moveSignToLowOrderBit(delta));
+ }
+ writer.finish();
}
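
As promised in the comment above, here is a worked example of the doc-base half of writeBlock(), with made-up numbers: a block of four chunks holding 7, 8, 6 and 9 documents.

    // docBaseDeltas = {7, 8, 6, 9}, so blockDocs = 30
    // avgChunkDocs  = Math.round((float) (30 - 9) / 3) = 7
    //
    // chunk i             0    1    2    3
    // actual doc base     0    7   15   21
    // expected, 7 * i     0    7   14   21
    // delta               0    0    1    0
    // zig-zag delta       0    0    2    0
    //
    // maxDelta == 2, so PackedInts.bitsRequired(2) == 2: each chunk costs
    // 2 bits here instead of the up-to-32 bits of a raw doc base.

The start-pointer half works the same way, with the average chunk size taking the place of the average doc count.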
@Override
void writeIndex(int numDocs, long startPointer) throws IOException {
- if (numChunks == docBaseDeltas.size()) {
- final int newSize = ArrayUtil.oversize(numChunks + 1, 1);
- docBaseDeltas = docBaseDeltas.resize(newSize);
- startPointerDeltas = startPointerDeltas.resize(newSize);
+ if (blockChunks == BLOCK_SIZE) {
+ writeBlock();
+ reset();
}
- docBaseDeltas.set(numChunks, numDocs);
- startPointerDeltas.set(numChunks, startPointer - maxStartPointer);
- ++numChunks;
+ if (firstStartPointer == -1) {
+ firstStartPointer = maxStartPointer = startPointer;
+ }
+ assert firstStartPointer > 0 && startPointer >= firstStartPointer;
+
+ docBaseDeltas[blockChunks] = numDocs;
+ startPointerDeltas[blockChunks] = startPointer - maxStartPointer;
+
+ ++blockChunks;
+ blockDocs += numDocs;
+ totalDocs += numDocs;
maxStartPointer = startPointer;
}
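
For context, a chunk-index writer like this one is driven with one writeIndex() call per flushed chunk, followed by a single finish() call. The driver below is hypothetical (the names indexWriter, docsInChunk, fieldsDataOut and segmentDocCount are invented for illustration):

    // one call per chunk flushed to the fields data (.fdt) file:
    indexWriter.writeIndex(docsInChunk, fieldsDataOut.getFilePointer());
    // ... after the last chunk, validate the doc count and flush the last block:
    indexWriter.finish(segmentDocCount);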
@Override
void finish(int numDocs) throws IOException {
- if (numChunks != docBaseDeltas.size()) {
- docBaseDeltas = docBaseDeltas.resize(numChunks);
- startPointerDeltas = startPointerDeltas.resize(numChunks);
+ if (numDocs != totalDocs) {
+ throw new IllegalStateException("Expected " + numDocs + " docs, but got " + totalDocs);
}
- fieldsIndexOut.writeVInt(numChunks);
- fieldsIndexOut.writeByte((byte) PackedInts.bitsRequired(maxStartPointer));
- docBaseDeltas.save(fieldsIndexOut);
- startPointerDeltas.save(fieldsIndexOut);
+ if (blockChunks > 0) {
+ writeBlock();
+ }
+ fieldsIndexOut.writeVInt(0); // end marker
}
}
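
Putting the constructor, writeBlock() and finish() together, the fields index written by this class has the following layout (reconstructed from the writer code above; VInt/VLong are Lucene's variable-length integer encodings):

    PackedIntsVersion     : VInt
    Block*                : repeated until BlockChunks == 0
      BlockChunks         : VInt   (number of chunks in the block, at most 1024)
      DocBase             : VInt   (doc ID of the first document of the block)
      AvgChunkDocs        : VInt
      BitsPerDocBase      : VInt   (at most 32)
      DocBaseDeltas       : BlockChunks packed zig-zag deltas from the average
      FirstStartPointer   : VLong
      AvgChunkSize        : VLong
      BitsPerStartPointer : VInt   (at most 64)
      StartPointerDeltas  : BlockChunks packed zig-zag deltas from the average
    EndMarker             : VInt 0

The 12 * numChunks worst case quoted in the MEMORY_CHUNK javadoc is the 32-bit doc-base ceiling plus the 64-bit start-pointer ceiling (96 bits, i.e. 12 bytes per chunk); typical deltas need far fewer bits.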
@@ -231,9 +306,7 @@ public enum CompressingStoredFieldsIndex {
abstract long getStartPointer(int docID) throws IOException;
public void close() throws IOException {
- if (fieldsIndexIn != null) {
- fieldsIndexIn.close();
- }
+ IOUtils.close(fieldsIndexIn);
}
public abstract Reader clone();
@@ -271,130 +344,141 @@ public enum CompressingStoredFieldsIndex {
}
- private static class MemoryDocFieldsIndexReader extends Reader {
-
- private final PackedInts.Reader startPointers;
-
- MemoryDocFieldsIndexReader(IndexInput fieldsIndexIn, SegmentInfo si) throws IOException {
- super(fieldsIndexIn);
- final int numChunks = fieldsIndexIn.readVInt();
- final int bitsPerStartPointer = fieldsIndexIn.readByte() & 0xFF;
- if (bitsPerStartPointer > 64) {
- throw new CorruptIndexException("Corrupted");
- }
-
- final PackedInts.Reader chunkDocs = PackedInts.getReader(fieldsIndexIn);
- if (chunkDocs.size() != numChunks) {
- throw new CorruptIndexException("Expected " + numChunks + " chunks, but got " + chunkDocs.size());
- }
-
- final PackedInts.ReaderIterator startPointerDeltas = PackedInts.getReaderIterator(fieldsIndexIn, PackedInts.DEFAULT_BUFFER_SIZE);
- if (startPointerDeltas.size() != numChunks) {
- throw new CorruptIndexException("Expected " + numChunks + " chunks, but got " + startPointerDeltas.size());
- }
- final PackedInts.Mutable startPointers = PackedInts.getMutable(si.getDocCount(), bitsPerStartPointer, PackedInts.COMPACT);
- int docID = 0;
- long startPointer = 0;
- for (int i = 0; i < numChunks; ++i) {
- startPointer += startPointerDeltas.next();
- final int chunkDocCount = (int) chunkDocs.get(i);
- for (int j = 0; j < chunkDocCount; ++j) {
- startPointers.set(docID++, startPointer);
- }
- }
- if (docID != si.getDocCount()) {
- throw new CorruptIndexException("Expected " + si.getDocCount() + " docs, got " + docID);
- }
-
- this.startPointers = startPointers;
- }
-
- private MemoryDocFieldsIndexReader(PackedInts.Reader startPointers) {
- super(null);
- this.startPointers = startPointers;
- }
-
- @Override
- long getStartPointer(int docID) throws IOException {
- return startPointers.get(docID);
- }
-
- @Override
- public Reader clone() {
- if (fieldsIndexIn == null) {
- return this;
- } else {
- return new MemoryDocFieldsIndexReader(startPointers);
- }
- }
-
- }
-
private static class MemoryChunkFieldsIndexReader extends Reader {
- private final PackedInts.Reader docBases;
- private final PackedInts.Reader startPointers;
+ static long moveLowOrderBitToSign(long n) {
+ return ((n >>> 1) ^ -(n & 1));
+ }
- MemoryChunkFieldsIndexReader(IndexInput fieldsIndexIn, SegmentInfo si) throws IOException {
+ private final int maxDoc;
+ private final int[] docBases;
+ private final long[] startPointers;
+ private final int[] avgChunkDocs;
+ private final long[] avgChunkSizes;
+ private final PackedInts.Reader[] docBasesDeltas; // delta from the avg
+ private final PackedInts.Reader[] startPointersDeltas; // delta from the avg
+
+ MemoryChunkFieldsIndexReader(IndexInput fieldsIndexIn, SegmentInfo si) throws IOException {
super(fieldsIndexIn);
- final int numChunks = fieldsIndexIn.readVInt();
- final int bitsPerStartPointer = fieldsIndexIn.readByte() & 0xFF;
- if (bitsPerStartPointer > 64) {
- throw new CorruptIndexException("Corrupted");
+ maxDoc = si.getDocCount();
+ int[] docBases = new int[16];
+ long[] startPointers = new long[16];
+ int[] avgChunkDocs = new int[16];
+ long[] avgChunkSizes = new long[16];
+ PackedInts.Reader[] docBasesDeltas = new PackedInts.Reader[16];
+ PackedInts.Reader[] startPointersDeltas = new PackedInts.Reader[16];
+
+ final int packedIntsVersion = fieldsIndexIn.readVInt();
+
+ int blockCount = 0;
+
+ for (;;) {
+ final int numChunks = fieldsIndexIn.readVInt();
+ if (numChunks == 0) {
+ break;
+ }
+ if (blockCount == docBases.length) {
+ final int newSize = ArrayUtil.oversize(blockCount + 1, 8);
+ docBases = Arrays.copyOf(docBases, newSize);
+ startPointers = Arrays.copyOf(startPointers, newSize);
+ avgChunkDocs = Arrays.copyOf(avgChunkDocs, newSize);
+ avgChunkSizes = Arrays.copyOf(avgChunkSizes, newSize);
+ docBasesDeltas = Arrays.copyOf(docBasesDeltas, newSize);
+ startPointersDeltas = Arrays.copyOf(startPointersDeltas, newSize);
+ }
+
+ // doc bases
+ docBases[blockCount] = fieldsIndexIn.readVInt();
+ avgChunkDocs[blockCount] = fieldsIndexIn.readVInt();
+ final int bitsPerDocBase = fieldsIndexIn.readVInt();
+ if (bitsPerDocBase > 32) {
+ throw new CorruptIndexException("Corrupted");
+ }
+ docBasesDeltas[blockCount] = PackedInts.getReaderNoHeader(fieldsIndexIn, PackedInts.Format.PACKED, packedIntsVersion, numChunks, bitsPerDocBase);
+
+ // start pointers
+ startPointers[blockCount] = fieldsIndexIn.readVLong();
+ avgChunkSizes[blockCount] = fieldsIndexIn.readVLong();
+ final int bitsPerStartPointer = fieldsIndexIn.readVInt();
+ if (bitsPerStartPointer > 64) {
+ throw new CorruptIndexException("Corrupted");
+ }
+ startPointersDeltas[blockCount] = PackedInts.getReaderNoHeader(fieldsIndexIn, PackedInts.Format.PACKED, packedIntsVersion, numChunks, bitsPerStartPointer);
+
+ ++blockCount;
}
- final PackedInts.ReaderIterator docBaseDeltas = PackedInts.getReaderIterator(fieldsIndexIn, PackedInts.DEFAULT_BUFFER_SIZE);
- if (docBaseDeltas.size() != numChunks) {
- throw new CorruptIndexException("Expected " + numChunks + " chunks, but got " + docBaseDeltas.size());
- }
- final PackedInts.Mutable docBases = PackedInts.getMutable(numChunks, PackedInts.bitsRequired(Math.max(0, si.getDocCount() - 1)), PackedInts.COMPACT);
-
- int docBase = 0;
- for (int i = 0; i < numChunks; ++i) {
- docBases.set(i, docBase);
- docBase += docBaseDeltas.next();
- }
- if (docBase != si.getDocCount()) {
- throw new CorruptIndexException("Expected " + si.getDocCount() + " docs, got " + docBase);
- }
-
- final PackedInts.ReaderIterator startPointerDeltas = PackedInts.getReaderIterator(fieldsIndexIn, PackedInts.DEFAULT_BUFFER_SIZE);
- if (startPointerDeltas.size() != numChunks) {
- throw new CorruptIndexException("Expected " + numChunks + " chunks, but got " + startPointerDeltas.size());
- }
- final PackedInts.Mutable startPointers = PackedInts.getMutable(numChunks, bitsPerStartPointer, PackedInts.COMPACT);
- long startPointer = 0;
- for (int i = 0; i < numChunks; ++i) {
- startPointer += startPointerDeltas.next();
- startPointers.set(i, startPointer);
- }
-
- this.docBases = docBases;
- this.startPointers = startPointers;
+ this.docBases = Arrays.copyOf(docBases, blockCount);
+ this.startPointers = Arrays.copyOf(startPointers, blockCount);
+ this.avgChunkDocs = Arrays.copyOf(avgChunkDocs, blockCount);
+ this.avgChunkSizes = Arrays.copyOf(avgChunkSizes, blockCount);
+ this.docBasesDeltas = Arrays.copyOf(docBasesDeltas, blockCount);
+ this.startPointersDeltas = Arrays.copyOf(startPointersDeltas, blockCount);
}
- private MemoryChunkFieldsIndexReader(PackedInts.Reader docBases, PackedInts.Reader startPointers) {
+ private MemoryChunkFieldsIndexReader(MemoryChunkFieldsIndexReader other) {
super(null);
- this.docBases = docBases;
- this.startPointers = startPointers;
+ this.maxDoc = other.maxDoc;
+ this.docBases = other.docBases;
+ this.startPointers = other.startPointers;
+ this.avgChunkDocs = other.avgChunkDocs;
+ this.avgChunkSizes = other.avgChunkSizes;
+ this.docBasesDeltas = other.docBasesDeltas;
+ this.startPointersDeltas = other.startPointersDeltas;
}
- @Override
- long getStartPointer(int docID) {
- assert docBases.size() > 0;
- int lo = 0, hi = docBases.size() - 1;
+ private int block(int docID) {
+ int lo = 0, hi = docBases.length - 1;
while (lo <= hi) {
final int mid = (lo + hi) >>> 1;
- final long midValue = docBases.get(mid);
+ final int midValue = docBases[mid];
if (midValue == docID) {
- return startPointers.get(mid);
+ return mid;
} else if (midValue < docID) {
lo = mid + 1;
} else {
hi = mid - 1;
}
}
- return startPointers.get(hi);
+ return hi;
+ }
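
When the loop exits without an exact match, hi indexes the last doc base that is smaller than the key, so block() (and relativeChunk() below) performs a floor search: it returns the block or chunk containing the doc ID rather than requiring an exact doc-base hit. For example (values invented), with docBases = {0, 30, 60}, looking up doc ID 42 ends with hi == 1, the block whose base 30 is the greatest one not exceeding 42.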
+
+ private int relativeDocBase(int block, int relativeChunk) {
+ final int expected = avgChunkDocs[block] * relativeChunk;
+ final long delta = moveLowOrderBitToSign(docBasesDeltas[block].get(relativeChunk));
+ return expected + (int) delta;
+ }
+
+ private long relativeStartPointer(int block, int relativeChunk) {
+ final long expected = avgChunkSizes[block] * relativeChunk;
+ final long delta = moveLowOrderBitToSign(startPointersDeltas[block].get(relativeChunk));
+ return expected + delta;
+ }
+
+ private int relativeChunk(int block, int relativeDoc) {
+ int lo = 0, hi = docBasesDeltas[block].size() - 1;
+ while (lo <= hi) {
+ final int mid = (lo + hi) >>> 1;
+ final int midValue = relativeDocBase(block, mid);
+ if (midValue == relativeDoc) {
+ return mid;
+ } else if (midValue < relativeDoc) {
+ lo = mid + 1;
+ } else {
+ hi = mid - 1;
+ }
+ }
+ return hi;
+ }
+
+ @Override
+ long getStartPointer(int docID) {
+ if (docID < 0 || docID >= maxDoc) {
+ throw new IllegalArgumentException("docID out of range [0-" + maxDoc + "]: " + docID);
+ }
+ final int block = block(docID);
+ final int relativeChunk = relativeChunk(block, docID - docBases[block]);
+ return startPointers[block] + relativeStartPointer(block, relativeChunk);
}
@Override
@@ -402,7 +486,7 @@ public enum CompressingStoredFieldsIndex {
if (fieldsIndexIn == null) {
return this;
} else {
- return new MemoryChunkFieldsIndexReader(docBases, startPointers);
+ return new MemoryChunkFieldsIndexReader(this);
}
}