LUCENE-4226: Efficient stored fields compression.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1394578 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2012-10-05 15:14:35 +00:00
parent 1d521aa6bf
commit c3e00c353e
21 changed files with 2770 additions and 1 deletion

View File

@ -20,6 +20,10 @@ Changes in backwards compatibility policy
New Features
* LUCENE-4226: New experimental StoredFieldsFormat (in lucene/codecs) that
compresses chunks of documents together in order to improve the compression
ratio. (Adrien Grand)
* LUCENE-4426: New ValueSource implementations (in lucene/queries) for
DocValues fields. (Adrien Grand)

View File

@ -0,0 +1,121 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsFormat;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
/**
* A {@link StoredFieldsFormat} that is very similar to
* {@link Lucene40StoredFieldsFormat} but compresses documents in chunks in
* order to improve compression ratio.
* <p>
* For optimal performance, you should use a {@link MergePolicy} that returns
* segments that have the biggest byte size first.
* @lucene.experimental
*/
public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
private final CompressingStoredFieldsIndex storedFieldsIndexFormat;
private final CompressionMode compressionMode;
private final int chunkSize;
/**
* Create a new {@link CompressingStoredFieldsFormat}.
* <p>
* The <code>compressionMode</code> parameter allows you to choose between
* compression algorithms that have various compression and uncompression
* speeds so that you can pick the one that best fits your indexing and
* searching throughput.
* <p>
* <code>chunkSize</code> is the minimum byte size of a chunk of documents.
* A value of <code>1</code> can make sense if there is redundancy across
* fields. In that case, both performance and compression ratio should be
* better than with {@link Lucene40StoredFieldsFormat} with compressed
* fields.
* <p>
* Higher values of <code>chunkSize</code> should improve the compression
ratio but will require more memory at indexing time and might make document
* loading a little slower (depending on the size of your OS cache compared
* to the size of your index).
* <p>
* The <code>storedFieldsIndexFormat</code> parameter allows you to choose
* between several fields index formats that offer various trade-offs between
* memory usage and speed.
*
* @param compressionMode the {@link CompressionMode} to use
* @param chunkSize the minimum number of bytes of a single chunk of stored documents
* @param storedFieldsIndexFormat the format to use to load the fields index
* @see CompressionMode
* @see CompressingStoredFieldsIndex
*/
public CompressingStoredFieldsFormat(CompressionMode compressionMode, int chunkSize,
CompressingStoredFieldsIndex storedFieldsIndexFormat) {
this.compressionMode = compressionMode;
if (chunkSize < 1) {
throw new IllegalArgumentException("chunkSize must be >= 1");
}
this.chunkSize = chunkSize;
this.storedFieldsIndexFormat = storedFieldsIndexFormat;
}
/**
* Create a new {@link CompressingStoredFieldsFormat} with an in-memory
* {@link CompressingStoredFieldsIndex}.
*
* @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(CompressionMode, int, CompressingStoredFieldsIndex)
*/
public CompressingStoredFieldsFormat(CompressionMode compressionMode, int chunkSize) {
this (compressionMode, chunkSize, chunkSize == 1
? CompressingStoredFieldsIndex.MEMORY_DOC
: CompressingStoredFieldsIndex.MEMORY_CHUNK);
}
/**
* Create a new {@link CompressingStoredFieldsFormat} with
* {@link CompressionMode#FAST} compression and chunks of <tt>16 KB</tt>.
*
* @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(CompressionMode, int)
*/
public CompressingStoredFieldsFormat() {
this(CompressionMode.FAST, 1 << 14);
}
@Override
public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si,
FieldInfos fn, IOContext context) throws IOException {
return new CompressingStoredFieldsReader(directory, si, fn, context);
}
@Override
public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si,
IOContext context) throws IOException {
return new CompressingStoredFieldsWriter(directory, si, context,
compressionMode, chunkSize, storedFieldsIndexFormat);
}
}
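
The constructors above are the entire configuration surface of this format. As a hedged illustration (not part of this patch; variable names are arbitrary), the three constructors could be exercised as follows; in practice the chosen instance would be returned from a custom Codec's storedFieldsFormat() method.

// Illustrative configuration sketch only.
StoredFieldsFormat defaults = new CompressingStoredFieldsFormat();            // FAST, 16 KB chunks, MEMORY_CHUNK index
StoredFieldsFormat perDoc   = new CompressingStoredFieldsFormat(CompressionMode.FAST, 1); // chunkSize == 1 -> MEMORY_DOC index
StoredFieldsFormat compact  = new CompressingStoredFieldsFormat(CompressionMode.HIGH_COMPRESSION,
    1 << 16, CompressingStoredFieldsIndex.DISK_DOC);                          // better ratio, no fields index in memory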

View File

@ -0,0 +1,411 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Closeable;
import java.io.IOException;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
/**
* A format for the stored fields index file (.fdx).
* <p>
* These formats allow different memory/speed trade-offs for locating documents
* in the fields data file (.fdt).
* @lucene.experimental
*/
public enum CompressingStoredFieldsIndex {
/**
* This format stores the document index on disk using 64-bits pointers to
* the start offsets of chunks in the fields data file.
* <p>
* This format has no memory overhead and requires at most 1 disk seek to
* locate a document in the fields data file. Use this format in
* memory-constrained environments.
*/
DISK_DOC(0) {
@Override
Writer newWriter(IndexOutput out) {
return new DiskDocFieldsIndexWriter(out);
}
@Override
Reader newReader(IndexInput in, SegmentInfo si) throws IOException {
return new DiskDocFieldsIndexReader(in, si);
}
},
/**
* For every document in the segment, this format stores the offset of the
* compressed chunk that contains it in the fields data file.
* <p>
* This fields index format requires at most <code>8 * numDocs</code> bytes
* of memory. Locating a document in the fields data file requires no disk
* seek. Use this format when chunks are very likely to contain few
* documents (in particular when <code>chunkSize = 1</code>).
*/
MEMORY_DOC(1) {
@Override
Writer newWriter(IndexOutput out) throws IOException {
return new ChunksFieldsIndexWriter(out);
}
@Override
Reader newReader(IndexInput in, SegmentInfo si) throws IOException {
return new MemoryDocFieldsIndexReader(in, si);
}
},
/**
* For every chunk of compressed documents, this format stores the first doc
* ID of the chunk as well as the start offset of the chunk.
* <p>
* This fields index format requires at most
* <code>12 * numChunks</code> bytes of memory. Locating a document in the
* fields data file requires no disk seek. Use this format when chunks are
* likely to contain several documents.
*/
MEMORY_CHUNK(2) {
@Override
Writer newWriter(IndexOutput out) throws IOException {
return new ChunksFieldsIndexWriter(out);
}
@Override
Reader newReader(IndexInput in, SegmentInfo si) throws IOException {
return new MemoryChunkFieldsIndexReader(in, si);
}
};
/**
* Retrieve a {@link CompressingStoredFieldsIndex} according to its
* <code>ID</code>.
*/
public static CompressingStoredFieldsIndex byId(int id) {
for (CompressingStoredFieldsIndex idx : CompressingStoredFieldsIndex.values()) {
if (idx.getId() == id) {
return idx;
}
}
throw new IllegalArgumentException("Unknown id: " + id);
}
private final int id;
private CompressingStoredFieldsIndex(int id) {
this.id = id;
}
/**
* Returns an ID for this fields index format. Should be unique across
* {@link CompressingStoredFieldsIndex}s as it is used for serialization and
* deserialization.
*/
public final int getId() {
return id;
}
abstract Writer newWriter(IndexOutput out) throws IOException;
abstract Reader newReader(IndexInput in, SegmentInfo si) throws IOException;
static abstract class Writer implements Closeable {
protected final IndexOutput fieldsIndexOut;
Writer(IndexOutput indexOutput) {
this.fieldsIndexOut = indexOutput;
}
/** Write the index file for a chunk of <code>numDocs</code> docs starting
* at offset <code>startPointer</code>. */
abstract void writeIndex(int numDocs, long startPointer) throws IOException;
/** Finish writing an index file of <code>numDocs</code> documents. */
abstract void finish(int numDocs) throws IOException;
@Override
public void close() throws IOException {
fieldsIndexOut.close();
}
}
private static class DiskDocFieldsIndexWriter extends Writer {
final long startOffset;
DiskDocFieldsIndexWriter(IndexOutput fieldsIndexOut) {
super(fieldsIndexOut);
startOffset = fieldsIndexOut.getFilePointer();
}
@Override
void writeIndex(int numDocs, long startPointer) throws IOException {
for (int i = 0; i < numDocs; ++i) {
fieldsIndexOut.writeLong(startPointer);
}
}
@Override
void finish(int numDocs) throws IOException {
if (startOffset + ((long) numDocs) * 8 != fieldsIndexOut.getFilePointer()) {
// see Lucene40StoredFieldsWriter#finish
throw new RuntimeException((fieldsIndexOut.getFilePointer() - startOffset)/8 + " fdx size mismatch: docCount is " + numDocs + " but fdx file size is " + fieldsIndexOut.getFilePointer() + " file=" + fieldsIndexOut.toString() + "; now aborting this merge to prevent index corruption");
}
}
}
private static class ChunksFieldsIndexWriter extends Writer {
int numChunks;
long maxStartPointer;
GrowableWriter docBaseDeltas;
GrowableWriter startPointerDeltas;
ChunksFieldsIndexWriter(IndexOutput indexOutput) {
super(indexOutput);
numChunks = 0;
maxStartPointer = 0;
docBaseDeltas = new GrowableWriter(2, 128, PackedInts.COMPACT);
startPointerDeltas = new GrowableWriter(5, 128, PackedInts.COMPACT);
}
@Override
void writeIndex(int numDocs, long startPointer) throws IOException {
if (numChunks == docBaseDeltas.size()) {
final int newSize = ArrayUtil.oversize(numChunks + 1, 1);
docBaseDeltas = docBaseDeltas.resize(newSize);
startPointerDeltas = startPointerDeltas.resize(newSize);
}
docBaseDeltas.set(numChunks, numDocs);
startPointerDeltas.set(numChunks, startPointer - maxStartPointer);
++numChunks;
maxStartPointer = startPointer;
}
@Override
void finish(int numDocs) throws IOException {
if (numChunks != docBaseDeltas.size()) {
docBaseDeltas = docBaseDeltas.resize(numChunks);
startPointerDeltas = startPointerDeltas.resize(numChunks);
}
fieldsIndexOut.writeVInt(numChunks);
fieldsIndexOut.writeByte((byte) PackedInts.bitsRequired(maxStartPointer));
docBaseDeltas.save(fieldsIndexOut);
startPointerDeltas.save(fieldsIndexOut);
}
}
static abstract class Reader implements Cloneable, Closeable {
protected final IndexInput fieldsIndexIn;
Reader(IndexInput fieldsIndexIn) {
this.fieldsIndexIn = fieldsIndexIn;
}
/** Get the start pointer of the compressed block that contains docID */
abstract long getStartPointer(int docID) throws IOException;
public void close() throws IOException {
if (fieldsIndexIn != null) {
fieldsIndexIn.close();
}
}
public abstract Reader clone();
}
private static class DiskDocFieldsIndexReader extends Reader {
final long startPointer;
DiskDocFieldsIndexReader(IndexInput fieldsIndexIn, SegmentInfo si) throws CorruptIndexException {
this(fieldsIndexIn, fieldsIndexIn.getFilePointer());
final long indexSize = fieldsIndexIn.length() - fieldsIndexIn.getFilePointer();
final int numDocs = (int) (indexSize >> 3);
// Verify two sources of "maxDoc" agree:
if (numDocs != si.getDocCount()) {
throw new CorruptIndexException("doc counts differ for segment " + si + ": fieldsReader shows " + numDocs + " but segmentInfo shows " + si.getDocCount());
}
}
private DiskDocFieldsIndexReader(IndexInput fieldsIndexIn, long startPointer) {
super(fieldsIndexIn);
this.startPointer = startPointer;
}
@Override
long getStartPointer(int docID) throws IOException {
fieldsIndexIn.seek(startPointer + docID * 8L);
return fieldsIndexIn.readLong();
}
@Override
public Reader clone() {
return new DiskDocFieldsIndexReader(fieldsIndexIn.clone(), startPointer);
}
}
private static class MemoryDocFieldsIndexReader extends Reader {
private final PackedInts.Reader startPointers;
MemoryDocFieldsIndexReader(IndexInput fieldsIndexIn, SegmentInfo si) throws IOException {
super(fieldsIndexIn);
final int numChunks = fieldsIndexIn.readVInt();
final int bitsPerStartPointer = fieldsIndexIn.readByte() & 0xFF;
if (bitsPerStartPointer > 64) {
throw new CorruptIndexException("Corrupted");
}
final PackedInts.Reader chunkDocs = PackedInts.getReader(fieldsIndexIn);
if (chunkDocs.size() != numChunks) {
throw new CorruptIndexException("Expected " + numChunks + " chunks, but got " + chunkDocs.size());
}
final PackedInts.ReaderIterator startPointerDeltas = PackedInts.getReaderIterator(fieldsIndexIn, PackedInts.DEFAULT_BUFFER_SIZE);
if (startPointerDeltas.size() != numChunks) {
throw new CorruptIndexException("Expected " + numChunks + " chunks, but got " + startPointerDeltas.size());
}
final PackedInts.Mutable startPointers = PackedInts.getMutable(si.getDocCount(), bitsPerStartPointer, PackedInts.COMPACT);
int docID = 0;
long startPointer = 0;
for (int i = 0; i < numChunks; ++i) {
startPointer += startPointerDeltas.next();
final int chunkDocCount = (int) chunkDocs.get(i);
for (int j = 0; j < chunkDocCount; ++j) {
startPointers.set(docID++, startPointer);
}
}
if (docID != si.getDocCount()) {
throw new CorruptIndexException("Expected " + si.getDocCount() + " docs, got " + docID);
}
this.startPointers = startPointers;
}
private MemoryDocFieldsIndexReader(PackedInts.Reader startPointers) {
super(null);
this.startPointers = startPointers;
}
@Override
long getStartPointer(int docID) throws IOException {
return startPointers.get(docID);
}
@Override
public Reader clone() {
if (fieldsIndexIn == null) {
return this;
} else {
return new MemoryDocFieldsIndexReader(startPointers);
}
}
}
private static class MemoryChunkFieldsIndexReader extends Reader {
private final PackedInts.Reader docBases;
private final PackedInts.Reader startPointers;
MemoryChunkFieldsIndexReader(IndexInput fieldsIndexIn, SegmentInfo si) throws IOException {
super(fieldsIndexIn);
final int numChunks = fieldsIndexIn.readVInt();
final int bitsPerStartPointer = fieldsIndexIn.readByte() & 0xFF;
if (bitsPerStartPointer > 64) {
throw new CorruptIndexException("Corrupted");
}
final PackedInts.ReaderIterator docBaseDeltas = PackedInts.getReaderIterator(fieldsIndexIn, PackedInts.DEFAULT_BUFFER_SIZE);
if (docBaseDeltas.size() != numChunks) {
throw new CorruptIndexException("Expected " + numChunks + " chunks, but got " + docBaseDeltas.size());
}
final PackedInts.Mutable docBases = PackedInts.getMutable(numChunks, PackedInts.bitsRequired(Math.max(0, si.getDocCount() - 1)), PackedInts.COMPACT);
int docBase = 0;
for (int i = 0; i < numChunks; ++i) {
docBases.set(i, docBase);
docBase += docBaseDeltas.next();
}
if (docBase != si.getDocCount()) {
throw new CorruptIndexException("Expected " + si.getDocCount() + " docs, got " + docBase);
}
final PackedInts.ReaderIterator startPointerDeltas = PackedInts.getReaderIterator(fieldsIndexIn, PackedInts.DEFAULT_BUFFER_SIZE);
if (startPointerDeltas.size() != numChunks) {
throw new CorruptIndexException("Expected " + numChunks + " chunks, but got " + startPointerDeltas.size());
}
final PackedInts.Mutable startPointers = PackedInts.getMutable(numChunks, bitsPerStartPointer, PackedInts.COMPACT);
int startPointer = 0;
for (int i = 0; i < numChunks; ++i) {
startPointer += startPointerDeltas.next();
startPointers.set(i, startPointer);
}
this.docBases = docBases;
this.startPointers = startPointers;
}
private MemoryChunkFieldsIndexReader(PackedInts.Reader docBases, PackedInts.Reader startPointers) {
super(null);
this.docBases = docBases;
this.startPointers = startPointers;
}
@Override
long getStartPointer(int docID) {
assert docBases.size() > 0;
int lo = 0, hi = docBases.size() - 1;
while (lo <= hi) {
final int mid = (lo + hi) >>> 1;
final long midValue = docBases.get(mid);
if (midValue == docID) {
return startPointers.get(mid);
} else if (midValue < docID) {
lo = mid + 1;
} else {
hi = mid - 1;
}
}
return startPointers.get(hi);
}
@Override
public Reader clone() {
if (fieldsIndexIn == null) {
return this;
} else {
return new MemoryChunkFieldsIndexReader(docBases, startPointers);
}
}
}
}
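
The javadocs above amount to rough memory costs of zero for DISK_DOC, about 8 bytes per document for MEMORY_DOC, and about 12 bytes per chunk for MEMORY_CHUNK. A hypothetical helper (not part of this patch; the name and thresholds are illustrative only) that turns those estimates into a selection rule:

// Hedged sketch: pick an index format from the memory estimates quoted in the javadocs above.
static CompressingStoredFieldsIndex pickIndexFormat(long numDocs, long numChunks, long memoryBudgetBytes) {
  final long memoryDocCost = 8L * numDocs;      // MEMORY_DOC: ~8 bytes per document
  final long memoryChunkCost = 12L * numChunks; // MEMORY_CHUNK: ~12 bytes per chunk
  if (memoryChunkCost <= memoryBudgetBytes && numDocs > numChunks) {
    return CompressingStoredFieldsIndex.MEMORY_CHUNK; // chunks hold several docs and the index fits in RAM
  } else if (memoryDocCost <= memoryBudgetBytes) {
    return CompressingStoredFieldsIndex.MEMORY_DOC;   // e.g. when chunkSize == 1
  } else {
    return CompressingStoredFieldsIndex.DISK_DOC;     // no memory overhead, at most one seek per lookup
  }
}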

View File

@ -0,0 +1,341 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.BYTE_ARR;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_NAME_DAT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_NAME_IDX;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HEADER_LENGTH_DAT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HEADER_LENGTH_IDX;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_DOUBLE;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_FLOAT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_INT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_LONG;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.STRING;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_BITS;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_MASK;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_CURRENT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_START;
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_EXTENSION;
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
final class CompressingStoredFieldsReader extends StoredFieldsReader {
private final FieldInfos fieldInfos;
private final CompressingStoredFieldsIndex.Reader indexReader;
private final IndexInput fieldsStream;
private final int packedIntsVersion;
private final CompressionMode compressionMode;
private final Uncompressor uncompressor;
private final BytesRef bytes;
private final int numDocs;
private boolean closed;
// used by clone
private CompressingStoredFieldsReader(CompressingStoredFieldsReader reader) {
this.fieldInfos = reader.fieldInfos;
this.fieldsStream = reader.fieldsStream.clone();
this.indexReader = reader.indexReader.clone();
this.packedIntsVersion = reader.packedIntsVersion;
this.compressionMode = reader.compressionMode;
this.uncompressor = reader.uncompressor.clone();
this.numDocs = reader.numDocs;
this.bytes = new BytesRef(reader.bytes.bytes.length);
this.closed = false;
}
public CompressingStoredFieldsReader(Directory d, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
final String segment = si.name;
boolean success = false;
fieldInfos = fn;
numDocs = si.getDocCount();
IndexInput indexStream = null;
try {
fieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION), context);
final String indexStreamFN = IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION);
indexStream = d.openInput(indexStreamFN, context);
CodecUtil.checkHeader(indexStream, CODEC_NAME_IDX, VERSION_START, VERSION_CURRENT);
CodecUtil.checkHeader(fieldsStream, CODEC_NAME_DAT, VERSION_START, VERSION_CURRENT);
assert HEADER_LENGTH_DAT == fieldsStream.getFilePointer();
assert HEADER_LENGTH_IDX == indexStream.getFilePointer();
final int storedFieldsIndexId = indexStream.readVInt();
final CompressingStoredFieldsIndex storedFieldsIndex = CompressingStoredFieldsIndex.byId(storedFieldsIndexId);
indexReader = storedFieldsIndex.newReader(indexStream, si);
indexStream = null;
packedIntsVersion = fieldsStream.readVInt();
final int compressionModeId = fieldsStream.readVInt();
compressionMode = CompressionMode.byId(compressionModeId);
uncompressor = compressionMode.newUncompressor();
this.bytes = new BytesRef();
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(this, indexStream);
}
}
}
/**
* @throws AlreadyClosedException if this FieldsReader is closed
*/
private void ensureOpen() throws AlreadyClosedException {
if (closed) {
throw new AlreadyClosedException("this FieldsReader is closed");
}
}
@Override
public void close() throws IOException {
if (!closed) {
IOUtils.close(fieldsStream, indexReader);
closed = true;
}
}
private static void readField(ByteArrayDataInput in, StoredFieldVisitor visitor, FieldInfo info, int bits) throws IOException {
switch (bits & TYPE_MASK) {
case BYTE_ARR:
int length = in.readVInt();
byte[] data = new byte[length];
in.readBytes(data, 0, length);
visitor.binaryField(info, data);
break;
case STRING:
length = in.readVInt();
data = new byte[length];
in.readBytes(data, 0, length);
visitor.stringField(info, new String(data, IOUtils.CHARSET_UTF_8));
break;
case NUMERIC_INT:
visitor.intField(info, in.readInt());
break;
case NUMERIC_FLOAT:
visitor.floatField(info, Float.intBitsToFloat(in.readInt()));
break;
case NUMERIC_LONG:
visitor.longField(info, in.readLong());
break;
case NUMERIC_DOUBLE:
visitor.doubleField(info, Double.longBitsToDouble(in.readLong()));
break;
default:
throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits));
}
}
private static void skipField(ByteArrayDataInput in, int bits) throws IOException {
switch (bits & TYPE_MASK) {
case BYTE_ARR:
case STRING:
final int length = in.readVInt();
in.skipBytes(length);
break;
case NUMERIC_INT:
case NUMERIC_FLOAT:
in.readInt();
break;
case NUMERIC_LONG:
case NUMERIC_DOUBLE:
in.readLong();
break;
default:
throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits));
}
}
@Override
public void visitDocument(int docID, StoredFieldVisitor visitor)
throws IOException {
fieldsStream.seek(indexReader.getStartPointer(docID));
final int docBase = fieldsStream.readVInt();
final int chunkDocs = fieldsStream.readVInt();
final int bitsPerValue = fieldsStream.readVInt();
if (docID < docBase
|| docID >= docBase + chunkDocs
|| docBase + chunkDocs > numDocs
|| bitsPerValue > 31) {
throw new CorruptIndexException("Corrupted: docID=" + docID
+ ", docBase=" + docBase + ", chunkDocs=" + chunkDocs
+ ", numDocs=" + numDocs + ", bitsPerValue=" + bitsPerValue);
}
final long filePointer = fieldsStream.getFilePointer();
final PackedInts.ReaderIterator lengths = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerValue, 1);
int offset = 0;
for (int i = docBase; i < docID; ++i) {
offset += lengths.next();
}
final int length = (int) lengths.next();
// seek past the packed length values to the start of the compressed data
fieldsStream.seek(filePointer + (PackedInts.Format.PACKED.nblocks(bitsPerValue, chunkDocs) << 3));
uncompressor.uncompress(fieldsStream, offset, length, bytes);
final ByteArrayDataInput documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);
final int numFields = documentInput.readVInt();
for (int fieldIDX = 0; fieldIDX < numFields; fieldIDX++) {
final long infoAndBits = documentInput.readVLong();
final int fieldNumber = (int) (infoAndBits >>> TYPE_BITS);
FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
final int bits = (int) (infoAndBits & TYPE_MASK);
assert bits <= NUMERIC_DOUBLE: "bits=" + Integer.toHexString(bits);
switch(visitor.needsField(fieldInfo)) {
case YES:
readField(documentInput, visitor, fieldInfo, bits);
break;
case NO:
skipField(documentInput, bits);
break;
case STOP:
return;
}
}
}
@Override
public StoredFieldsReader clone() {
ensureOpen();
return new CompressingStoredFieldsReader(this);
}
CompressionMode getCompressionMode() {
return compressionMode;
}
ChunkIterator chunkIterator(int startDocID) throws IOException {
ensureOpen();
fieldsStream.seek(indexReader.getStartPointer(startDocID));
return new ChunkIterator();
}
final class ChunkIterator {
BytesRef bytes;
int docBase;
int chunkDocs;
int[] lengths;
private ChunkIterator() {
this.docBase = -1;
bytes = new BytesRef();
lengths = new int[0];
}
private int readHeader() throws IOException {
final int docBase = fieldsStream.readVInt();
final int chunkDocs = fieldsStream.readVInt();
final int bitsPerValue = fieldsStream.readVInt();
if (docBase < this.docBase + this.chunkDocs
|| docBase + chunkDocs > numDocs
|| bitsPerValue > 31) {
throw new CorruptIndexException("Corrupted: current docBase=" + this.docBase
+ ", current numDocs=" + this.chunkDocs + ", new docBase=" + docBase
+ ", new numDocs=" + chunkDocs + ", bitsPerValue=" + bitsPerValue);
}
this.docBase = docBase;
this.chunkDocs = chunkDocs;
return bitsPerValue;
}
/**
* Return the uncompressed size of the chunk
*/
int chunkSize() {
int sum = 0;
for (int i = 0; i < chunkDocs; ++i) {
sum += lengths[i];
}
return sum;
}
/**
* Go to the chunk containing the provided doc ID.
*/
void next(int doc) throws IOException {
assert doc >= docBase + chunkDocs : doc + " " + docBase + " " + chunkDocs;
// try next chunk
int bitsPerValue = readHeader();
if (docBase + chunkDocs <= doc) {
// doc is not in this chunk either, use the index to seek to the chunk that contains it
fieldsStream.seek(indexReader.getStartPointer(doc));
bitsPerValue = readHeader();
}
if (doc < docBase
|| doc >= docBase + chunkDocs) {
throw new CorruptIndexException("Corrupted: docID=" + doc
+ ", docBase=" + docBase + ", chunkDocs=" + chunkDocs);
}
// decode lengths
if (lengths.length < chunkDocs) {
lengths = new int[ArrayUtil.oversize(chunkDocs, 4)];
}
final PackedInts.ReaderIterator iterator = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerValue, 0);
for (int i = 0; i < chunkDocs; ++i) {
lengths[i] = (int) iterator.next();
}
}
/**
* Uncompress the chunk.
*/
void uncompress() throws IOException {
// uncompress data
uncompressor.uncompress(fieldsStream, bytes);
if (bytes.length != chunkSize()) {
throw new CorruptIndexException("Corrupted: expected chunk size = " + chunkSize() + ", got " + bytes.length);
}
}
/**
* Copy compressed data.
*/
void copyCompressedData(DataOutput out) throws IOException {
uncompressor.copyCompressedData(fieldsStream, out);
}
}
}
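
This reader is package-private and is normally obtained through CompressingStoredFieldsFormat#fieldsReader. A hedged sketch of reading a document back (assuming the imports at the top of this file plus StoredFieldsFormat, and a dir/si/infos trio describing an existing segment written with this format):

// Sketch only: prints every string field of one stored document.
static void printStringFields(StoredFieldsFormat format, Directory dir, SegmentInfo si,
    FieldInfos infos, int docID) throws IOException {
  StoredFieldsReader reader = format.fieldsReader(dir, si, infos, IOContext.READ);
  try {
    reader.visitDocument(docID, new StoredFieldVisitor() {
      @Override
      public Status needsField(FieldInfo fieldInfo) {
        return Status.YES;                          // visit every field of this document
      }
      @Override
      public void stringField(FieldInfo fieldInfo, String value) {
        System.out.println(fieldInfo.name + " = " + value);
      }
    });
  } finally {
    reader.close();
  }
}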

View File

@ -0,0 +1,391 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_EXTENSION;
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsReader.ChunkIterator;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
static final int STRING = 0x00;
static final int BYTE_ARR = 0x01;
static final int NUMERIC_INT = 0x02;
static final int NUMERIC_FLOAT = 0x03;
static final int NUMERIC_LONG = 0x04;
static final int NUMERIC_DOUBLE = 0x05;
static final int TYPE_BITS = PackedInts.bitsRequired(NUMERIC_DOUBLE);
static final int TYPE_MASK = (int) PackedInts.maxValue(TYPE_BITS);
static final String CODEC_NAME_IDX = "CompressingStoredFieldsIndex";
static final String CODEC_NAME_DAT = "CompressingStoredFieldsData";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
static final long HEADER_LENGTH_IDX = CodecUtil.headerLength(CODEC_NAME_IDX);
static final long HEADER_LENGTH_DAT = CodecUtil.headerLength(CODEC_NAME_DAT);
private final Directory directory;
private final String segment;
private CompressingStoredFieldsIndex.Writer indexWriter;
private IndexOutput fieldsStream;
private final CompressionMode compressionMode;
private final Compressor compressor;
private final int chunkSize;
private final GrowableByteArrayDataOutput bufferedDocs;
private int[] endOffsets; // end offsets in bufferedDocs
private int docBase; // doc ID at the beginning of the chunk
private int numBufferedDocs; // docBase + numBufferedDocs == current doc ID
public CompressingStoredFieldsWriter(Directory directory, SegmentInfo si,
IOContext context, CompressionMode compressionMode, int chunkSize, CompressingStoredFieldsIndex storedFieldsIndex) throws IOException {
assert directory != null;
this.directory = directory;
this.segment = si.name;
this.compressionMode = compressionMode;
this.compressor = compressionMode.newCompressor();
this.chunkSize = chunkSize;
this.docBase = 0;
this.bufferedDocs = new GrowableByteArrayDataOutput(chunkSize);
this.endOffsets = new int[16];
this.numBufferedDocs = 0;
boolean success = false;
IndexOutput indexStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION), context);
try {
fieldsStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION), context);
CodecUtil.writeHeader(indexStream, CODEC_NAME_IDX, VERSION_CURRENT);
CodecUtil.writeHeader(fieldsStream, CODEC_NAME_DAT, VERSION_CURRENT);
assert HEADER_LENGTH_IDX == indexStream.getFilePointer();
assert HEADER_LENGTH_DAT == fieldsStream.getFilePointer();
indexStream.writeVInt(storedFieldsIndex.getId());
indexWriter = storedFieldsIndex.newWriter(indexStream);
indexStream = null;
fieldsStream.writeVInt(PackedInts.VERSION_CURRENT);
fieldsStream.writeVInt(compressionMode.getId());
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(indexStream);
abort();
}
}
}
@Override
public void close() throws IOException {
try {
IOUtils.close(fieldsStream, indexWriter);
} finally {
fieldsStream = null;
indexWriter = null;
}
}
private void endWithPreviousDocument() throws IOException {
if (numBufferedDocs > 0) {
assert bufferedDocs.length > 0;
if (numBufferedDocs == endOffsets.length) {
endOffsets = ArrayUtil.grow(endOffsets);
}
endOffsets[numBufferedDocs - 1] = bufferedDocs.length;
}
}
private void addRawDocument(byte[] buf, int off, int len) throws IOException {
endWithPreviousDocument();
if (bufferedDocs.length >= chunkSize) {
flush();
}
bufferedDocs.writeBytes(buf, off, len);
++numBufferedDocs;
}
@Override
public void startDocument(int numStoredFields) throws IOException {
endWithPreviousDocument();
if (bufferedDocs.length >= chunkSize) {
flush();
}
bufferedDocs.writeVInt(numStoredFields);
++numBufferedDocs;
}
private void writeHeader(int docBase, int numBufferedDocs, int[] lengths) throws IOException {
// save docBase and numBufferedDocs
fieldsStream.writeVInt(docBase);
fieldsStream.writeVInt(numBufferedDocs);
// save lengths
final int bitsRequired = bitsRequired(lengths, numBufferedDocs);
assert bitsRequired <= 31;
fieldsStream.writeVInt(bitsRequired);
final PackedInts.Writer writer = PackedInts.getWriterNoHeader(fieldsStream, PackedInts.Format.PACKED, numBufferedDocs, bitsRequired, 1);
for (int i = 0; i < numBufferedDocs; ++i) {
assert lengths[i] > 0;
writer.add(lengths[i]);
}
assert writer.ord() + 1 == numBufferedDocs;
writer.finish();
}
private void flush() throws IOException {
indexWriter.writeIndex(numBufferedDocs, fieldsStream.getFilePointer());
// transform end offsets into lengths
final int[] lengths = endOffsets;
for (int i = numBufferedDocs - 1; i > 0; --i) {
lengths[i] = endOffsets[i] - endOffsets[i - 1];
assert lengths[i] > 0;
}
writeHeader(docBase, numBufferedDocs, lengths);
// compress stored fields to fieldsStream
compressor.compress(bufferedDocs.bytes, 0, bufferedDocs.length, fieldsStream);
// reset
docBase += numBufferedDocs;
numBufferedDocs = 0;
bufferedDocs.length = 0;
}
private static int bitsRequired(int[] data, int length) {
int or = data[0];
for (int i = 1; i < length; ++i) {
or |= data[i];
}
return PackedInts.bitsRequired(or);
}
@Override
public void writeField(FieldInfo info, StorableField field)
throws IOException {
int bits = 0;
final BytesRef bytes;
final String string;
Number number = field.numericValue();
if (number != null) {
if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
bits = NUMERIC_INT;
} else if (number instanceof Long) {
bits = NUMERIC_LONG;
} else if (number instanceof Float) {
bits = NUMERIC_FLOAT;
} else if (number instanceof Double) {
bits = NUMERIC_DOUBLE;
} else {
throw new IllegalArgumentException("cannot store numeric type " + number.getClass());
}
string = null;
bytes = null;
} else {
bytes = field.binaryValue();
if (bytes != null) {
bits = BYTE_ARR;
string = null;
} else {
bits = STRING;
string = field.stringValue();
if (string == null) {
throw new IllegalArgumentException("field " + field.name() + " is stored but does not have binaryValue, stringValue nor numericValue");
}
}
}
final long infoAndBits = (((long) info.number) << TYPE_BITS) | bits;
bufferedDocs.writeVLong(infoAndBits);
if (bytes != null) {
bufferedDocs.writeVInt(bytes.length);
bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
} else if (string != null) {
bufferedDocs.writeString(field.stringValue());
} else {
if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
bufferedDocs.writeInt(number.intValue());
} else if (number instanceof Long) {
bufferedDocs.writeLong(number.longValue());
} else if (number instanceof Float) {
bufferedDocs.writeInt(Float.floatToIntBits(number.floatValue()));
} else if (number instanceof Double) {
bufferedDocs.writeLong(Double.doubleToLongBits(number.doubleValue()));
} else {
throw new AssertionError("Cannot get here");
}
}
}
@Override
public void abort() {
IOUtils.closeWhileHandlingException(this);
IOUtils.deleteFilesIgnoringExceptions(directory,
IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION),
IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION));
}
@Override
public void finish(FieldInfos fis, int numDocs) throws IOException {
endWithPreviousDocument();
if (numBufferedDocs > 0) {
flush();
}
if (docBase != numDocs) {
throw new RuntimeException("Wrote " + docBase + " docs, finish called with numDocs=" + numDocs);
}
indexWriter.finish(numDocs);
assert bufferedDocs.length == 0;
}
@Override
public int merge(MergeState mergeState) throws IOException {
int docCount = 0;
int idx = 0;
for (AtomicReader reader : mergeState.readers) {
final SegmentReader matchingSegmentReader = mergeState.matchingSegmentReaders[idx++];
CompressingStoredFieldsReader matchingFieldsReader = null;
if (matchingSegmentReader != null) {
final StoredFieldsReader fieldsReader = matchingSegmentReader.getFieldsReader();
// we can only bulk-copy if the matching reader is also a CompressingStoredFieldsReader
if (fieldsReader != null && fieldsReader instanceof CompressingStoredFieldsReader) {
matchingFieldsReader = (CompressingStoredFieldsReader) fieldsReader;
}
}
final int maxDoc = reader.maxDoc();
final Bits liveDocs = reader.getLiveDocs();
if (matchingFieldsReader == null) {
// naive merge...
for (int i = nextLiveDoc(0, liveDocs, maxDoc); i < maxDoc; i = nextLiveDoc(i + 1, liveDocs, maxDoc)) {
StoredDocument doc = reader.document(i);
addDocument(doc, mergeState.fieldInfos);
++docCount;
mergeState.checkAbort.work(300);
}
} else {
int docID = nextLiveDoc(0, liveDocs, maxDoc);
if (docID < maxDoc) {
// not all docs were deleted
final ChunkIterator it = matchingFieldsReader.chunkIterator(docID);
int[] startOffsets = new int[0];
do {
// go to the next chunk that contains docID
it.next(docID);
// transform lengths into offsets
if (startOffsets.length < it.chunkDocs) {
startOffsets = new int[ArrayUtil.oversize(it.chunkDocs, 4)];
}
for (int i = 1; i < it.chunkDocs; ++i) {
startOffsets[i] = startOffsets[i - 1] + it.lengths[i - 1];
}
if (compressionMode == matchingFieldsReader.getCompressionMode() // same compression mode
&& (numBufferedDocs == 0 || bufferedDocs.length >= chunkSize) // starting a new chunk
&& startOffsets[it.chunkDocs - 1] < chunkSize // chunk is small enough
&& startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1] >= chunkSize // chunk is large enough
&& nextDeletedDoc(it.docBase, liveDocs, it.docBase + it.chunkDocs) == it.docBase + it.chunkDocs) { // no deletion in the chunk
assert docID == it.docBase;
// no need to uncompress, just copy data
endWithPreviousDocument();
if (bufferedDocs.length >= chunkSize) {
flush();
}
indexWriter.writeIndex(it.chunkDocs, fieldsStream.getFilePointer());
writeHeader(this.docBase, it.chunkDocs, it.lengths);
it.copyCompressedData(fieldsStream);
this.docBase += it.chunkDocs;
docID = nextLiveDoc(it.docBase + it.chunkDocs, liveDocs, maxDoc);
docCount += it.chunkDocs;
mergeState.checkAbort.work(300 * it.chunkDocs);
} else {
// uncompress
it.uncompress();
if (startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1] != it.bytes.length) {
throw new CorruptIndexException("Corrupted: expected chunk size=" + startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1] + ", got " + it.bytes.length);
}
// copy non-deleted docs
for (; docID < it.docBase + it.chunkDocs; docID = nextLiveDoc(docID + 1, liveDocs, maxDoc)) {
final int diff = docID - it.docBase;
addRawDocument(it.bytes.bytes, it.bytes.offset + startOffsets[diff], it.lengths[diff]);
++docCount;
mergeState.checkAbort.work(300);
}
}
} while (docID < maxDoc);
}
}
}
finish(mergeState.fieldInfos, docCount);
return docCount;
}
private static int nextLiveDoc(int doc, Bits liveDocs, int maxDoc) {
if (liveDocs == null) {
return doc;
}
while (doc < maxDoc && !liveDocs.get(doc)) {
++doc;
}
return doc;
}
private static int nextDeletedDoc(int doc, Bits liveDocs, int maxDoc) {
if (liveDocs == null) {
return maxDoc;
}
while (doc < maxDoc && liveDocs.get(doc)) {
++doc;
}
return doc;
}
}

View File

@ -0,0 +1,343 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
/**
* A compression mode. Tells how much effort should be spent on compression and
* uncompression of stored fields.
* @lucene.experimental
*/
public enum CompressionMode {
/**
* A compression mode that trades compression ratio for speed. Although the
* compression ratio might remain high, compression and uncompression are
* very fast. Use this mode with indices that have a high update rate but
* should be able to load documents from disk quickly.
*/
FAST(0) {
@Override
Compressor newCompressor() {
return LZ4_FAST_COMPRESSOR;
}
@Override
Uncompressor newUncompressor() {
return LZ4_UNCOMPRESSOR;
}
},
/**
* A compression mode that trades speed for compression ratio. Although
* compression and uncompression might be slow, this compression mode should
* provide a good compression ratio. This mode might be interesting if/when
* your index size is much bigger than your OS cache.
*/
HIGH_COMPRESSION(1) {
@Override
Compressor newCompressor() {
return new DeflateCompressor(Deflater.BEST_COMPRESSION);
}
@Override
Uncompressor newUncompressor() {
return new DeflateUncompressor();
}
},
/**
* This compression mode is similar to {@link #FAST} but it spends more time
* compressing in order to improve the compression ratio. This compression
* mode is best used with indices that have a low update rate but should be
* able to load documents from disk quickly.
*/
FAST_UNCOMPRESSION(2) {
@Override
Compressor newCompressor() {
return LZ4_HIGH_COMPRESSOR;
}
@Override
Uncompressor newUncompressor() {
return LZ4_UNCOMPRESSOR;
}
};
public static CompressionMode byId(int id) {
for (CompressionMode mode : CompressionMode.values()) {
if (mode.getId() == id) {
return mode;
}
}
throw new IllegalArgumentException("Unknown id: " + id);
}
private final int id;
private CompressionMode(int id) {
this.id = id;
}
/**
* Returns an ID for this compression mode. Should be unique across
* {@link CompressionMode}s as it is used for serialization and
* deserialization.
*/
public final int getId() {
return id;
}
/**
* Create a new {@link Compressor} instance.
*/
abstract Compressor newCompressor();
/**
* Create a new {@link Uncompressor} instance.
*/
abstract Uncompressor newUncompressor();
private static final Uncompressor LZ4_UNCOMPRESSOR = new Uncompressor() {
@Override
public void uncompress(DataInput in, BytesRef bytes) throws IOException {
final int uncompressedLen = in.readVInt();
if (bytes.bytes.length < uncompressedLen + 8) {
bytes.bytes = ArrayUtil.grow(bytes.bytes, uncompressedLen + 8);
}
LZ4.uncompress(in, uncompressedLen, bytes);
if (bytes.length != uncompressedLen) {
throw new IOException("Corrupted");
}
}
@Override
public void uncompress(DataInput in, int offset, int length, BytesRef bytes) throws IOException {
final int uncompressedLen = in.readVInt();
if (offset > uncompressedLen) {
bytes.length = 0;
return;
}
if (bytes.bytes.length < uncompressedLen) {
bytes.bytes = ArrayUtil.grow(bytes.bytes, uncompressedLen);
}
LZ4.uncompress(in, offset + length, bytes);
bytes.offset = offset;
if (offset + length >= uncompressedLen) {
if (bytes.length != uncompressedLen) {
throw new IOException("Corrupted");
}
bytes.length = uncompressedLen - offset;
} else {
bytes.length = length;
}
}
public void copyCompressedData(DataInput in, DataOutput out) throws IOException {
final int uncompressedLen = in.readVInt();
out.writeVInt(uncompressedLen);
if (uncompressedLen == 0) {
out.writeByte((byte) 0); // the token
return;
}
int n = 0;
while (n < uncompressedLen) {
// literals
final byte token = in.readByte();
out.writeByte(token);
int literalLen = (token & 0xFF) >>> 4;
if (literalLen == 0x0F) {
byte len;
while ((len = in.readByte()) == (byte) 0xFF) {
literalLen += 0xFF;
out.writeByte(len);
}
literalLen += len & 0xFF;
out.writeByte(len);
}
out.copyBytes(in, literalLen);
n += literalLen;
if (n >= uncompressedLen) {
break;
}
// match
out.copyBytes(in, 2); // match dec
int matchLen = token & 0x0F;
if (matchLen == 0x0F) {
byte len;
while ((len = in.readByte()) == (byte) 0xFF) {
matchLen += 0xFF;
out.writeByte(len);
}
matchLen += len & 0xFF;
out.writeByte(len);
}
matchLen += LZ4.MIN_MATCH;
n += matchLen;
}
if (n != uncompressedLen) {
throw new IOException("Currupted compressed stream: expected " + uncompressedLen + " bytes, but got at least" + n);
}
}
@Override
public Uncompressor clone() {
return this;
}
};
private static final Compressor LZ4_FAST_COMPRESSOR = new Compressor() {
@Override
public void compress(byte[] bytes, int off, int len, DataOutput out)
throws IOException {
out.writeVInt(len);
LZ4.compress(bytes, off, len, out);
}
};
private static final Compressor LZ4_HIGH_COMPRESSOR = new Compressor() {
@Override
public void compress(byte[] bytes, int off, int len, DataOutput out)
throws IOException {
out.writeVInt(len);
LZ4.compressHC(bytes, off, len, out);
}
};
private static final class DeflateUncompressor extends Uncompressor {
final Inflater uncompressor;
byte[] compressed;
DeflateUncompressor() {
uncompressor = new Inflater();
compressed = new byte[0];
}
@Override
public void uncompress(DataInput in, BytesRef bytes) throws IOException {
bytes.offset = bytes.length = 0;
final int compressedLength = in.readVInt();
if (compressedLength > compressed.length) {
compressed = ArrayUtil.grow(compressed, compressedLength);
}
in.readBytes(compressed, 0, compressedLength);
uncompressor.reset();
uncompressor.setInput(compressed, 0, compressedLength);
if (uncompressor.needsInput()) {
return;
}
while (true) {
final int count;
try {
final int remaining = bytes.bytes.length - bytes.length;
count = uncompressor.inflate(bytes.bytes, bytes.length, remaining);
} catch (DataFormatException e) {
throw new IOException(e);
}
bytes.length += count;
if (uncompressor.finished()) {
break;
} else {
bytes.bytes = ArrayUtil.grow(bytes.bytes);
}
}
}
@Override
public void copyCompressedData(DataInput in, DataOutput out) throws IOException {
final int compressedLength = in.readVInt();
out.writeVInt(compressedLength);
out.copyBytes(in, compressedLength);
}
@Override
public Uncompressor clone() {
return new DeflateUncompressor();
}
}
private static class DeflateCompressor extends Compressor {
final Deflater compressor;
byte[] compressed;
DeflateCompressor(int level) {
compressor = new Deflater(level);
compressed = new byte[64];
}
@Override
public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
compressor.reset();
compressor.setInput(bytes, off, len);
compressor.finish();
if (compressor.needsInput()) {
// no output
out.writeVInt(0);
return;
}
int totalCount = 0;
for (;;) {
final int count = compressor.deflate(compressed, totalCount, compressed.length - totalCount);
totalCount += count;
assert totalCount <= compressed.length;
if (compressor.finished()) {
break;
} else {
compressed = ArrayUtil.grow(compressed);
}
}
out.writeVInt(totalCount);
out.writeBytes(compressed, totalCount);
}
}
}
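
Because Compressor and Uncompressor are package-private, a round trip through a CompressionMode is easiest to sketch from inside this package. A minimal, hedged example (org.apache.lucene.store.ByteArrayDataInput is assumed to be imported; the method name is illustrative):

// Sketch only: compress a byte[] with the given mode and check that uncompression restores it.
static void roundTrip(CompressionMode mode, byte[] data) throws IOException {
  GrowableByteArrayDataOutput compressed = new GrowableByteArrayDataOutput(1 + data.length);
  mode.newCompressor().compress(data, 0, data.length, compressed);
  BytesRef restored = new BytesRef();
  mode.newUncompressor().uncompress(
      new ByteArrayDataInput(compressed.bytes, 0, compressed.length), restored);
  assert new BytesRef(data).bytesEquals(restored);
}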

View File

@ -0,0 +1,36 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.store.DataOutput;
/**
* A data compressor.
*/
abstract class Compressor {
/**
* Compress bytes into <code>out</code>. It is the responsibility of the
* compressor to add all necessary information so that an {@link Uncompressor}
* will know when to stop uncompressing bytes from the stream.
*/
public abstract void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException;
}
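
To make the contract above concrete, here is a hypothetical compressor (not part of this patch) that does no compression at all but still writes enough framing, a leading length, for a matching uncompressor to know where to stop:

// Hypothetical sketch: "compression" by plain copy with a length header.
final class StoredLengthCompressor extends Compressor {
  @Override
  public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
    out.writeVInt(len);              // framing: tells the uncompressor how many bytes follow
    out.writeBytes(bytes, off, len); // payload copied verbatim
  }
}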

View File

@ -0,0 +1,56 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
/**
* A {@link DataOutput} that can be used to build a byte[].
*/
final class GrowableByteArrayDataOutput extends DataOutput {
byte[] bytes;
int length;
GrowableByteArrayDataOutput(int cp) {
this.bytes = new byte[ArrayUtil.oversize(cp, 1)];
this.length = 0;
}
@Override
public void writeByte(byte b) throws IOException {
if (length >= bytes.length) {
bytes = ArrayUtil.grow(bytes);
}
bytes[length++] = b;
}
@Override
public void writeBytes(byte[] b, int off, int len) throws IOException {
final int newLength = length + len;
if (newLength > bytes.length) {
bytes = ArrayUtil.grow(bytes, newLength);
}
System.arraycopy(b, off, bytes, length, len);
length = newLength;
}
}
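
A tiny, hedged usage sketch (BytesRef import assumed): write a few values, then expose the written region without copying.

// Sketch only: buffer a couple of values and return a view of the backing array.
static BytesRef bufferExample() throws IOException {
  GrowableByteArrayDataOutput buffer = new GrowableByteArrayDataOutput(16);
  buffer.writeVInt(42);              // values are appended to the growable byte[]
  buffer.writeString("title");
  return new BytesRef(buffer.bytes, 0, buffer.length); // written region, no copy
}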

View File

@ -0,0 +1,506 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.packed.PackedInts;
/**
* LZ4 compression and uncompression routines.
*
* http://code.google.com/p/lz4/
* http://fastcompression.blogspot.fr/p/lz4.html
*/
class LZ4 {
private LZ4() {}
static final int MEMORY_USAGE = 14;
static final int MIN_MATCH = 4; // minimum length of a match
static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference
static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals
static final int HASH_LOG_HC = 15; // log size of the dictionary for compressHC
static final int HASH_TABLE_SIZE_HC = 1 << HASH_LOG_HC;
static final int OPTIMAL_ML = 0x0F + 4 - 1; // match length that doesn't require an additional byte
private static int hash(int i, int hashBits) {
return (i * -1640531535) >>> (32 - hashBits);
}
private static int hashHC(int i) {
return hash(i, HASH_LOG_HC);
}
private static int readInt(byte[] buf, int i) {
return ((buf[i] & 0xFF) << 24) | ((buf[i+1] & 0xFF) << 16) | ((buf[i+2] & 0xFF) << 8) | (buf[i+3] & 0xFF);
}
private static boolean readIntEquals(byte[] buf, int i, int j) {
return readInt(buf, i) == readInt(buf, j);
}
private static int commonBytes(byte[] b, int o1, int o2, int limit) {
assert o1 < o2;
int count = 0;
while (o2 < limit && b[o1++] == b[o2++]) {
++count;
}
return count;
}
private static int commonBytesBackward(byte[] b, int o1, int o2, int l1, int l2) {
int count = 0;
while (o1 > l1 && o2 > l2 && b[--o1] == b[--o2]) {
++count;
}
return count;
}
/**
* Uncompress at least <code>uncompressedLen</code> bytes into <code>destBytes</code>.
* Please note that <code>destBytes</code> must be large enough to be able to hold
* <b>all</b> uncompressed data plus 8 bytes (meaning that you need to know the total
* uncompressed length).
*/
public static void uncompress(DataInput compressed, int uncompressedLen, BytesRef destBytes) throws IOException {
final byte[] dest = destBytes.bytes;
final int destEnd = dest.length;
int dOff = 0;
while (dOff < uncompressedLen) {
// literals
final int token = compressed.readByte() & 0xFF;
int literalLen = token >>> 4;
if (literalLen != 0) {
if (literalLen == 0x0F) {
byte len;
while ((len = compressed.readByte()) == (byte) 0xFF) {
literalLen += 0xFF;
}
literalLen += len & 0xFF;
}
compressed.readBytes(dest, dOff, literalLen);
dOff += literalLen;
}
if (dOff >= uncompressedLen) {
break;
}
// match
final int matchDec = (compressed.readByte() & 0xFF) | ((compressed.readByte() & 0xFF) << 8);
assert matchDec > 0;
int matchLen = token & 0x0F;
if (matchLen == 0x0F) {
int len;
while ((len = compressed.readByte()) == (byte) 0xFF) {
matchLen += 0xFF;
}
matchLen += len & 0xFF;
}
matchLen += MIN_MATCH;
// copying a multiple of 8 bytes can make uncompression from 5% to 10% faster
final int fastLen = ((matchLen - 1) & 0xFFFFFFF8) + 8;
if (matchDec < matchLen || dOff + fastLen > destEnd) {
// overlap -> naive incremental copy
for (int ref = dOff - matchDec, end = dOff + matchLen; dOff < end; ++ref, ++dOff) {
dest[dOff] = dest[ref];
}
} else {
// no overlap -> arraycopy
System.arraycopy(dest, dOff - matchDec, dest, dOff, fastLen);
dOff += matchLen;
}
}
destBytes.offset = 0;
destBytes.length = dOff;
}
private static void encodeLen(int l, DataOutput out) throws IOException {
while (l >= 0xFF) {
out.writeByte((byte) 0xFF);
l -= 0xFF;
}
out.writeByte((byte) l);
}
private static void encodeLiterals(byte[] bytes, int token, int anchor, int literalLen, DataOutput out) throws IOException {
out.writeByte((byte) token);
// encode literal length
if (literalLen >= 0x0F) {
encodeLen(literalLen - 0x0F, out);
}
// encode literals
out.writeBytes(bytes, anchor, literalLen);
}
private static void encodeLastLiterals(byte[] bytes, int anchor, int literalLen, DataOutput out) throws IOException {
final int token = Math.min(literalLen, 0x0F) << 4;
encodeLiterals(bytes, token, anchor, literalLen, out);
}
private static void encodeSequence(byte[] bytes, int anchor, int matchRef, int matchOff, int matchLen, DataOutput out) throws IOException {
final int literalLen = matchOff - anchor;
assert matchLen >= 4;
// encode token
final int token = (Math.min(literalLen, 0x0F) << 4) | Math.min(matchLen - 4, 0x0F);
encodeLiterals(bytes, token, anchor, literalLen, out);
// encode match dec
final int matchDec = matchOff - matchRef;
assert matchDec > 0 && matchDec < 1 << 16;
out.writeByte((byte) matchDec);
out.writeByte((byte) (matchDec >>> 8));
// encode match len
if (matchLen >= MIN_MATCH + 0x0F) {
encodeLen(matchLen - 0x0F - MIN_MATCH, out);
}
}
/**
 * Compress <code>bytes[off:off+len]</code> into <code>out</code> using at most
 * 2<sup>MEMORY_USAGE</sup> bytes of memory (16 KB); the hash table used to find
 * matches is sized automatically from the input length.
*/
public static void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
final int base = off;
final int end = off + len;
int anchor = off++;
if (len > LAST_LITERALS + MIN_MATCH) {
final int limit = end - LAST_LITERALS;
final int matchLimit = limit - MIN_MATCH;
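// size the hash table so that it takes about 2^MEMORY_USAGE bytes in total:
// the wider the stored offsets (longer inputs), the fewer hash buckets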
final int bitsPerOffset = PackedInts.bitsRequired(len - LAST_LITERALS);
final int bitsPerOffsetLog = 32 - Integer.numberOfLeadingZeros(bitsPerOffset - 1);
final int hashLog = MEMORY_USAGE + 3 - bitsPerOffsetLog;
final PackedInts.Mutable hashTable = PackedInts.getMutable(1 << hashLog, bitsPerOffset, PackedInts.DEFAULT);
main:
while (off < limit) {
// find a match
int ref;
while (true) {
if (off >= matchLimit) {
break main;
}
final int v = readInt(bytes, off);
final int h = hash(v, hashLog);
ref = base + (int) hashTable.get(h);
assert PackedInts.bitsRequired(off - base) <= hashTable.getBitsPerValue();
hashTable.set(h, off - base);
if (off - ref < MAX_DISTANCE && readInt(bytes, ref) == v) {
break;
}
++off;
}
// compute match length
final int matchLen = MIN_MATCH + commonBytes(bytes, ref + 4, off + 4, limit);
encodeSequence(bytes, anchor, ref, off, matchLen, out);
off += matchLen;
anchor = off;
}
}
// last literals
final int literalLen = end - anchor;
assert literalLen >= LAST_LITERALS || literalLen == len;
encodeLastLiterals(bytes, anchor, end - anchor, out);
}
private static class Match {
int start, ref, len;
void fix(int correction) {
start += correction;
ref += correction;
len -= correction;
}
int end() {
return start + len;
}
}
private static void copyTo(Match m1, Match m2) {
m2.len = m1.len;
m2.start = m1.start;
m2.ref = m1.ref;
}
private static class HashTable {
static final int MAX_ATTEMPTS = 256;
static final int MASK = MAX_DISTANCE - 1;
int nextToUpdate;
private final int base;
private final int[] hashTable;
private final short[] chainTable;
HashTable(int base) {
this.base = base;
nextToUpdate = base;
hashTable = new int[HASH_TABLE_SIZE_HC];
Arrays.fill(hashTable, -1);
chainTable = new short[MAX_DISTANCE];
}
private int hashPointer(byte[] bytes, int off) {
final int v = readInt(bytes, off);
final int h = hashHC(v);
return base + hashTable[h];
}
private int next(int off) {
return base + off - (chainTable[off & MASK] & 0xFFFF);
}
private void addHash(byte[] bytes, int off) {
final int v = readInt(bytes, off);
final int h = hashHC(v);
int delta = off - hashTable[h];
if (delta >= MAX_DISTANCE) {
delta = MAX_DISTANCE - 1;
}
chainTable[off & MASK] = (short) delta;
hashTable[h] = off - base;
}
void insert(int off, byte[] bytes) {
for (; nextToUpdate < off; ++nextToUpdate) {
addHash(bytes, nextToUpdate);
}
}
boolean insertAndFindBestMatch(byte[] buf, int off, int matchLimit, Match match) {
match.start = off;
match.len = 0;
insert(off, buf);
int ref = hashPointer(buf, off);
for (int i = 0; i < MAX_ATTEMPTS; ++i) {
if (ref < Math.max(base, off - MAX_DISTANCE + 1)) {
break;
}
if (buf[ref + match.len] == buf[off + match.len] && readIntEquals(buf, ref, off)) {
final int matchLen = MIN_MATCH + commonBytes(buf, ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
if (matchLen > match.len) {
match.ref = ref;
match.len = matchLen;
}
}
ref = next(ref);
}
return match.len != 0;
}
boolean insertAndFindWiderMatch(byte[] buf, int off, int startLimit, int matchLimit, int minLen, Match match) {
match.len = minLen;
insert(off, buf);
final int delta = off - startLimit;
int ref = hashPointer(buf, off);
for (int i = 0; i < MAX_ATTEMPTS; ++i) {
if (ref < Math.max(base, off - MAX_DISTANCE + 1)) {
break;
}
if (buf[ref - delta + match.len] == buf[startLimit + match.len]
&& readIntEquals(buf, ref, off)) {
final int matchLenForward = MIN_MATCH + commonBytes(buf, ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
final int matchLenBackward = commonBytesBackward(buf, ref, off, base, startLimit);
final int matchLen = matchLenBackward + matchLenForward;
if (matchLen > match.len) {
match.len = matchLen;
match.ref = ref - matchLenBackward;
match.start = off - matchLenBackward;
}
}
ref = next(ref);
}
return match.len > minLen;
}
}
public static void compressHC(byte[] src, int srcOff, int srcLen, DataOutput out) throws IOException {
final int srcEnd = srcOff + srcLen;
final int matchLimit = srcEnd - LAST_LITERALS;
int sOff = srcOff;
int anchor = sOff++;
final HashTable ht = new HashTable(srcOff);
final Match match0 = new Match();
final Match match1 = new Match();
final Match match2 = new Match();
final Match match3 = new Match();
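// greedy parse with look-ahead: match1 is the current candidate, match2/match3 are
// overlapping candidates that start slightly later and may be longer; a sequence is
// only emitted once no overlapping candidate beats the current one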
main:
while (sOff < matchLimit) {
if (!ht.insertAndFindBestMatch(src, sOff, matchLimit, match1)) {
++sOff;
continue;
}
// save it, in case the search below skips too far ahead
copyTo(match1, match0);
search2:
while (true) {
assert match1.start >= anchor;
if (match1.end() >= matchLimit
|| !ht.insertAndFindWiderMatch(src, match1.end() - 2, match1.start + 1, matchLimit, match1.len, match2)) {
// no better match
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
anchor = sOff = match1.end();
continue main;
}
if (match0.start < match1.start) {
if (match2.start < match1.start + match0.len) { // empirical
copyTo(match0, match1);
}
}
assert match2.start > match1.start;
if (match2.start - match1.start < 3) { // first match too small: drop it and restart from the wider one
copyTo(match2, match1);
continue search2;
}
search3:
while (true) {
if (match2.start - match1.start < OPTIMAL_ML) {
int newMatchLen = match1.len;
if (newMatchLen > OPTIMAL_ML) {
newMatchLen = OPTIMAL_ML;
}
if (match1.start + newMatchLen > match2.end() - MIN_MATCH) {
newMatchLen = match2.start - match1.start + match2.len - MIN_MATCH;
}
final int correction = newMatchLen - (match2.start - match1.start);
if (correction > 0) {
match2.fix(correction);
}
}
if (match2.start + match2.len >= matchLimit
|| !ht.insertAndFindWiderMatch(src, match2.end() - 3, match2.start, matchLimit, match2.len, match3)) {
// no better match -> 2 sequences to encode
if (match2.start < match1.end()) {
if (match2.start - match1.start < OPTIMAL_ML) {
if (match1.len > OPTIMAL_ML) {
match1.len = OPTIMAL_ML;
}
if (match1.end() > match2.end() - MIN_MATCH) {
match1.len = match2.end() - match1.start - MIN_MATCH;
}
final int correction = match1.len - (match2.start - match1.start);
if (correction > 0) {
match2.fix(correction);
}
} else {
match1.len = match2.start - match1.start;
}
}
// encode seq 1
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
anchor = sOff = match1.end();
// encode seq 2
encodeSequence(src, anchor, match2.ref, match2.start, match2.len, out);
anchor = sOff = match2.end();
continue main;
}
if (match3.start < match1.end() + 3) { // not enough space for match 2: remove it
if (match3.start >= match1.end()) { // can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1
if (match2.start < match1.end()) {
final int correction = match1.end() - match2.start;
match2.fix(correction);
if (match2.len < MIN_MATCH) {
copyTo(match3, match2);
}
}
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
anchor = sOff = match1.end();
copyTo(match3, match1);
copyTo(match2, match0);
continue search2;
}
copyTo(match3, match2);
continue search3;
}
// OK, now we have 3 ascending matches; let's write at least the first one
if (match2.start < match1.end()) {
if (match2.start - match1.start < 0x0F) {
if (match1.len > OPTIMAL_ML) {
match1.len = OPTIMAL_ML;
}
if (match1.end() > match2.end() - MIN_MATCH) {
match1.len = match2.end() - match1.start - MIN_MATCH;
}
final int correction = match1.end() - match2.start;
match2.fix(correction);
} else {
match1.len = match2.start - match1.start;
}
}
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
anchor = sOff = match1.end();
copyTo(match2, match1);
copyTo(match3, match2);
continue search3;
}
}
}
encodeLastLiterals(src, anchor, srcEnd - anchor, out);
}
}
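As a usage sketch of the entry points above (compress or compressHC to write, uncompress to read back), the helper below round-trips a buffer. It is hypothetical — the class and method names are illustrative and not part of this patch — and has to live in this package because LZ4 is package-private. Buffer sizes follow the constraints stated in the javadocs: the compressed buffer is over-allocated, and the destination gets 8 spare bytes on top of the uncompressed length.

package org.apache.lucene.codecs.compressing;

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;

// Hypothetical round-trip helper, for illustration only.
class LZ4RoundTrip {

  static byte[] roundTrip(byte[] original, boolean highCompression) throws IOException {
    // compress into an over-allocated buffer: even incompressible input only grows slightly
    final byte[] compressed = new byte[original.length * 2 + 16];
    final ByteArrayDataOutput out = new ByteArrayDataOutput(compressed);
    if (highCompression) {
      LZ4.compressHC(original, 0, original.length, out);
    } else {
      LZ4.compress(original, 0, original.length, out);
    }
    final int compressedLen = out.getPosition();

    // the destination must hold all uncompressed bytes plus 8 extra bytes
    final BytesRef restored = new BytesRef(new byte[original.length + 8]);
    LZ4.uncompress(new ByteArrayDataInput(compressed, 0, compressedLen), original.length, restored);
    return Arrays.copyOfRange(restored.bytes, restored.offset, restored.offset + restored.length);
  }
}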

View File

@@ -0,0 +1,56 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
/**
 * An uncompressor: restores the bytes that were written by a matching {@link Compressor}.
*/
abstract class Uncompressor implements Cloneable {
/**
* Uncompress bytes. This method is free to resize <code>bytes</code> in case
* it is too small to hold all the uncompressed data.
*/
public abstract void uncompress(DataInput in, BytesRef bytes) throws IOException;
/**
 * Method to use if you are only interested in <code>length</code>
* uncompressed bytes starting at offset <code>offset</code>. Some compression
* codecs might have optimizations for this special case.
*/
public void uncompress(DataInput in, int offset, int length, BytesRef bytes) throws IOException {
uncompress(in, bytes);
if (bytes.length < offset + length) {
throw new IndexOutOfBoundsException((offset + length) + " > " + bytes.length);
}
bytes.offset += offset;
bytes.length = length;
}
public abstract void copyCompressedData(DataInput in, DataOutput out) throws IOException;
@Override
public abstract Uncompressor clone();
}
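To make the contract concrete, here is a hypothetical Uncompressor for data stored without compression, assuming a stream layout of a vInt length followed by the raw bytes; it only illustrates the three methods above and is not one of the uncompressors shipped in this patch.

package org.apache.lucene.codecs.compressing;

import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;

// Hypothetical example: assumes the stream is a vInt length followed by raw bytes.
class RawUncompressor extends Uncompressor {

  @Override
  public void uncompress(DataInput in, BytesRef bytes) throws IOException {
    final int len = in.readVInt();
    if (bytes.bytes.length < len) {
      bytes.bytes = new byte[ArrayUtil.oversize(len, 1)]; // resizing is explicitly allowed
    }
    in.readBytes(bytes.bytes, 0, len);
    bytes.offset = 0;
    bytes.length = len;
  }

  @Override
  public void copyCompressedData(DataInput in, DataOutput out) throws IOException {
    // copy the length header and then the raw payload without decoding it
    final int len = in.readVInt();
    out.writeVInt(len);
    out.copyBytes(in, len);
  }

  @Override
  public Uncompressor clone() {
    return this; // stateless, so the same instance can be shared
  }
}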

View File

@@ -0,0 +1,25 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
StoredFieldsFormat that allows cross-document and cross-field compression of stored fields.
</body>
</html>

View File

@@ -0,0 +1,111 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import com.carrotsearch.randomizedtesting.generators.RandomInts;
public abstract class AbstractTestCompressionMode extends LuceneTestCase {
CompressionMode mode;
static byte[] randomArray() {
final int max = random().nextBoolean()
? random().nextInt(4)
: random().nextInt(256);
final int length = random().nextBoolean()
? random().nextInt(20)
: random().nextInt(192 * 1024);
final byte[] arr = new byte[length];
for (int i = 0; i < arr.length; ++i) {
arr[i] = (byte) RandomInts.randomIntBetween(random(), 0, max);
}
return arr;
}
byte[] compress(byte[] uncompressed) throws IOException {
Compressor compressor = mode.newCompressor();
return compress(compressor, uncompressed);
}
static byte[] compress(Compressor compressor, byte[] uncompressed) throws IOException {
byte[] compressed = new byte[uncompressed.length * 2 + 16]; // should be enough
ByteArrayDataOutput out = new ByteArrayDataOutput(compressed);
compressor.compress(uncompressed, 0, uncompressed.length, out);
final int compressedLen = out.getPosition();
return Arrays.copyOf(compressed, compressedLen);
}
byte[] uncompress(byte[] compressed) throws IOException {
Uncompressor uncompressor = mode.newUncompressor();
return uncompress(uncompressor, compressed);
}
static byte[] uncompress(Uncompressor uncompressor, byte[] compressed) throws IOException {
final BytesRef bytes = new BytesRef();
uncompressor.uncompress(new ByteArrayDataInput(compressed), bytes);
return Arrays.copyOfRange(bytes.bytes, bytes.offset, bytes.offset + bytes.length);
}
byte[] uncompress(byte[] compressed, int offset, int length) throws IOException {
Uncompressor uncompressor = mode.newUncompressor();
final BytesRef bytes = new BytesRef();
uncompressor.uncompress(new ByteArrayDataInput(compressed), offset, length, bytes);
return Arrays.copyOfRange(bytes.bytes, bytes.offset, bytes.offset + bytes.length);
}
public void testUncompress() throws IOException {
final byte[] uncompressed = randomArray();
final byte[] compressed = compress(uncompressed);
final byte[] restored = uncompress(compressed);
assertArrayEquals(uncompressed, restored);
}
public void testPartialUncompress() throws IOException {
final int iterations = atLeast(10);
for (int i = 0; i < iterations; ++i) {
final byte[] uncompressed = randomArray();
final byte[] compressed = compress(uncompressed);
final int offset, length;
if (uncompressed.length == 0) {
offset = length = 0;
} else {
offset = random().nextInt(uncompressed.length);
length = random().nextInt(uncompressed.length - offset);
}
final byte[] restored = uncompress(compressed, offset, length);
assertArrayEquals(Arrays.copyOfRange(uncompressed, offset, offset + length), restored);
}
}
public void testCopyCompressedData() throws IOException {
final byte[] uncompressed = randomArray();
final byte[] compressed = compress(uncompressed);
GrowableByteArrayDataOutput out = new GrowableByteArrayDataOutput(uncompressed.length);
mode.newUncompressor().copyCompressedData(new ByteArrayDataInput(compressed), out);
assertArrayEquals(compressed, Arrays.copyOf(out.bytes, out.length));
}
}

View File

@@ -0,0 +1,193 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.FloatField;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import com.carrotsearch.randomizedtesting.generators.RandomInts;
public class TestCompressingStoredFieldsFormat extends LuceneTestCase {
public void testWriteReadMerge() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwConf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwConf.setMaxBufferedDocs(RandomInts.randomIntBetween(random(), 2, 30));
iwConf.setCodec(CompressingCodec.randomInstance(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConf);
final int docCount = atLeast(200);
final byte[][][] data = new byte[docCount][][];
for (int i = 0; i < docCount; ++i) {
final int fieldCount = rarely()
? RandomInts.randomIntBetween(random(), 1, 500)
: RandomInts.randomIntBetween(random(), 1, 5);
data[i] = new byte[fieldCount][];
for (int j = 0; j < fieldCount; ++j) {
final int length = rarely()
? random().nextInt(1000)
: random().nextInt(10);
final byte[] arr = new byte[length];
final int max = rarely() ? 256 : 2;
for (int k = 0; k < length; ++k) {
arr[k] = (byte) random().nextInt(max);
}
data[i][j] = arr;
}
}
final FieldType type = new FieldType(StringField.TYPE_STORED);
type.setIndexed(false);
type.freeze();
IntField id = new IntField("id", 0, Store.YES);
for (int i = 0; i < data.length; ++i) {
Document doc = new Document();
doc.add(id);
id.setIntValue(i);
for (int j = 0; j < data[i].length; ++j) {
Field f = new Field("bytes" + j, data[i][j], type);
doc.add(f);
}
iw.w.addDocument(doc);
if (random().nextBoolean() && (i % (data.length / 10) == 0)) {
iw.w.close();
// switch codecs
if (iwConf.getCodec() instanceof Lucene40Codec) {
iwConf.setCodec(CompressingCodec.randomInstance(random()));
} else {
iwConf.setCodec(new Lucene40Codec());
}
iw = new RandomIndexWriter(random(), dir, iwConf);
}
}
for (int i = 0; i < 10; ++i) {
final int min = random().nextInt(data.length);
final int max = min + random().nextInt(20);
iw.deleteDocuments(NumericRangeQuery.newIntRange("id", min, max, true, false));
}
iw.forceMerge(2); // force merges with deletions
iw.commit();
final DirectoryReader ir = DirectoryReader.open(dir);
assertTrue(ir.numDocs() > 0);
int numDocs = 0;
for (int i = 0; i < ir.maxDoc(); ++i) {
final StoredDocument doc = ir.document(i);
if (doc == null) {
continue;
}
++numDocs;
final int docId = doc.getField("id").numericValue().intValue();
assertEquals(data[docId].length + 1, doc.getFields().size());
for (int j = 0; j < data[docId].length; ++j) {
final byte[] arr = data[docId][j];
final BytesRef arr2Ref = doc.getBinaryValue("bytes" + j);
final byte[] arr2 = Arrays.copyOfRange(arr2Ref.bytes, arr2Ref.offset, arr2Ref.offset + arr2Ref.length);
assertArrayEquals(arr, arr2);
}
}
assertTrue(ir.numDocs() <= numDocs);
ir.close();
iw.deleteAll();
iw.commit();
iw.forceMerge(1);
iw.close();
dir.close();
}
public void testReadSkip() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwConf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwConf.setMaxBufferedDocs(RandomInts.randomIntBetween(random(), 2, 30));
iwConf.setCodec(CompressingCodec.randomInstance(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConf);
FieldType ft = new FieldType();
ft.setStored(true);
ft.freeze();
final String string = _TestUtil.randomSimpleString(random(), 50);
final byte[] bytes = string.getBytes("UTF-8");
final long l = random().nextBoolean() ? random().nextInt(42) : random().nextLong();
final int i = random().nextBoolean() ? random().nextInt(42) : random().nextInt();
final float f = random().nextFloat();
final double d = random().nextDouble();
List<Field> fields = Arrays.asList(
new Field("bytes", bytes, ft),
new Field("string", string, ft),
new LongField("long", l, Store.YES),
new IntField("int", i, Store.YES),
new FloatField("float", f, Store.YES),
new DoubleField("double", d, Store.YES)
);
for (int k = 0; k < 100; ++k) {
Document doc = new Document();
for (Field fld : fields) {
doc.add(fld);
}
iw.w.addDocument(doc);
}
iw.close();
final DirectoryReader reader = DirectoryReader.open(dir);
final int docID = random().nextInt(100);
for (Field fld : fields) {
String fldName = fld.name();
final StoredDocument sDoc = reader.document(docID, Collections.singleton(fldName));
final StorableField sField = sDoc.getField(fldName);
if (Field.class.equals(fld.getClass())) {
assertEquals(fld.binaryValue(), sField.binaryValue());
assertEquals(fld.stringValue(), sField.stringValue());
} else {
assertEquals(fld.numericValue(), sField.numericValue());
}
}
reader.close();
dir.close();
}
}

View File

@@ -0,0 +1,27 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestFastCompressionMode extends AbstractTestCompressionMode {
@Override
public void setUp() throws Exception {
super.setUp();
mode = CompressionMode.FAST;
}
}

View File

@@ -0,0 +1,28 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestFastUncompressionMode extends AbstractTestCompressionMode {
@Override
public void setUp() throws Exception {
super.setUp();
mode = CompressionMode.FAST_UNCOMPRESSION;
}
}

View File

@@ -0,0 +1,27 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestHighCompressionMode extends AbstractTestCompressionMode {
@Override
public void setUp() throws Exception {
super.setUp();
mode = CompressionMode.HIGH_COMPRESSION;
}
}

View File

@@ -37,7 +37,7 @@ final class PackedReaderIterator extends PackedInts.ReaderIteratorImpl {
this.format = format;
bulkOperation = BulkOperation.of(format, bitsPerValue);
iterations = bulkOperation.computeIterations(valueCount, mem);
assert iterations > 0;
assert valueCount == 0 || iterations > 0;
nextBlocks = new long[iterations * bulkOperation.blockCount()];
nextValues = new LongsRef(new long[iterations * bulkOperation.valueCount()], 0, 0);
assert iterations * bulkOperation.valueCount() == nextValues.longs.length;

View File

@@ -0,0 +1,64 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Random;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
import com.carrotsearch.randomizedtesting.generators.RandomInts;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
/**
* A codec that uses {@link CompressingStoredFieldsFormat} for its stored
* fields and delegates to {@link Lucene40Codec} for everything else.
*/
public class CompressingCodec extends FilterCodec {
/**
* Create a random instance.
*/
public static CompressingCodec randomInstance(Random random) {
final CompressionMode mode = RandomPicks.randomFrom(random, CompressionMode.values());
final int chunkSize = RandomInts.randomIntBetween(random, 1, 500);
final CompressingStoredFieldsIndex index = RandomPicks.randomFrom(random, CompressingStoredFieldsIndex.values());
return new CompressingCodec(mode, chunkSize, index);
}
private final CompressingStoredFieldsFormat storedFieldsFormat;
/**
* @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(CompressionMode, int, CompressingStoredFieldsIndex)
*/
public CompressingCodec(CompressionMode compressionMode, int chunkSize,
CompressingStoredFieldsIndex storedFieldsIndexFormat) {
super("Compressing", new Lucene40Codec());
this.storedFieldsFormat = new CompressingStoredFieldsFormat(compressionMode, chunkSize, storedFieldsIndexFormat);
}
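/** Creates a default instance: FAST compression, 16 KB chunks and an in-memory chunk index; the no-arg constructor is what SPI uses when the codec is looked up by name. */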
public CompressingCodec() {
this(CompressionMode.FAST, 1 << 14, CompressingStoredFieldsIndex.MEMORY_CHUNK);
}
@Override
public StoredFieldsFormat storedFieldsFormat() {
return storedFieldsFormat;
}
}
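Outside of randomized tests, the codec can also be wired in explicitly. The sketch below is hypothetical (class and method names are illustrative; it assumes a Directory and an Analyzer are available and that Version.LUCENE_40 is the current version constant) and simply passes the same arguments as the default constructor above.

package org.apache.lucene.codecs.compressing;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

// Hypothetical wiring example, not part of this patch.
class CompressingCodecSetup {

  static IndexWriter open(Directory dir, Analyzer analyzer) throws IOException {
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_40, analyzer);
    conf.setCodec(new CompressingCodec(
        CompressionMode.FAST,                      // fast compression and uncompression
        1 << 14,                                   // 16 KB chunks of documents
        CompressingStoredFieldsIndex.MEMORY_CHUNK));
    return new IndexWriter(dir, conf);
  }
}

Since CompressingCodec lives in the test framework, a production setup would more likely wrap CompressingStoredFieldsFormat in its own FilterCodec, but the configuration surface is the same.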

View File

@@ -0,0 +1,25 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Support for testing {@link org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat}.
</body>
</html>

View File

@@ -31,6 +31,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.appending.AppendingCodec;
import org.apache.lucene.codecs.asserting.AssertingCodec;
import org.apache.lucene.codecs.compressing.CompressingCodec;
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
@@ -165,6 +166,8 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
codec = new AppendingCodec();
} else if ("Asserting".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 7 && !shouldAvoidCodec("Asserting"))) {
codec = new AssertingCodec();
} else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
codec = CompressingCodec.randomInstance(random);
} else if (!"random".equals(TEST_CODEC)) {
codec = Codec.forName(TEST_CODEC);
} else if ("random".equals(TEST_POSTINGSFORMAT)) {

View File

@@ -14,3 +14,4 @@
# limitations under the License.
org.apache.lucene.codecs.asserting.AssertingCodec
org.apache.lucene.codecs.compressing.CompressingCodec
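The SPI entry above is what makes the codec addressable by the name passed to the FilterCodec constructor ("Compressing"), both for Codec.forName and for the -Dtests.codec=Compressing branch added to TestRuleSetupAndRestoreClassEnv. A minimal, hypothetical lookup (class and method names are illustrative):

package org.apache.lucene.codecs.compressing;

import org.apache.lucene.codecs.Codec;

class CodecLookupExample {

  static Codec lookupCompressing() {
    // resolved through the services file above, using CompressingCodec's no-arg constructor
    return Codec.forName("Compressing");
  }
}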