LUCENE-4226: Efficient stored fields compression.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1394578 13f79535-47bb-0310-9956-ffa450edef68

parent 1d521aa6bf
commit c3e00c353e
@@ -20,6 +20,10 @@ CHANGES.txt: Changes in backwards compatibility policy

New Features

* LUCENE-4226: New experimental StoredFieldsFormat (in lucene/codecs) that
  compresses chunks of documents together in order to improve the compression
  ratio. (Adrien Grand)

* LUCENE-4426: New ValueSource implementations (in lucene/queries) for
  DocValues fields. (Adrien Grand)

@@ -0,0 +1,121 @@ CompressingStoredFieldsFormat.java (new file)
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.StoredFieldsReader;
|
||||
import org.apache.lucene.codecs.StoredFieldsWriter;
|
||||
import org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsFormat;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.MergePolicy;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
|
||||
/**
|
||||
* A {@link StoredFieldsFormat} that is very similar to
|
||||
* {@link Lucene40StoredFieldsFormat} but compresses documents in chunks in
|
||||
* order to improve compression ratio.
|
||||
* <p>
|
||||
* For optimal performance, you should use a {@link MergePolicy} that returns
|
||||
* segments that have the biggest byte size first.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
|
||||
|
||||
private final CompressingStoredFieldsIndex storedFieldsIndexFormat;
|
||||
private final CompressionMode compressionMode;
|
||||
private final int chunkSize;
|
||||
|
||||
/**
|
||||
* Create a new {@link CompressingStoredFieldsFormat}.
|
||||
* <p>
|
||||
* The <code>compressionMode</code> parameter allows you to choose between
|
||||
* compression algorithms that have various compression and uncompression
|
||||
* speeds so that you can pick the one that best fits your indexing and
|
||||
* searching throughput.
|
||||
* <p>
|
||||
* <code>chunkSize</code> is the minimum byte size of a chunk of documents.
|
||||
* A value of <code>1</code> can make sense if there is redundancy across
|
||||
* fields. In that case, both performance and compression ratio should be
|
||||
* better than with {@link Lucene40StoredFieldsFormat} with compressed
|
||||
* fields.
|
||||
* <p>
|
||||
* Higher values of <code>chunkSize</code> should improve the compression
* ratio but will require more memory at indexing time and might make document
* loading a little slower (depending on the size of your OS cache compared
* to the size of your index).
|
||||
* <p>
|
||||
* The <code>storedFieldsIndexFormat</code> parameter allows you to choose
* between several fields index formats that offer various trade-offs between
* memory usage and speed.
|
||||
*
|
||||
* @param compressionMode the {@link CompressionMode} to use
|
||||
* @param chunkSize the minimum number of bytes of a single chunk of stored documents
|
||||
* @param storedFieldsIndexFormat the format to use to load the fields index
|
||||
* @see CompressionMode
|
||||
* @see CompressingStoredFieldsIndex
|
||||
*/
|
||||
public CompressingStoredFieldsFormat(CompressionMode compressionMode, int chunkSize,
|
||||
CompressingStoredFieldsIndex storedFieldsIndexFormat) {
|
||||
this.compressionMode = compressionMode;
|
||||
if (chunkSize < 1) {
|
||||
throw new IllegalArgumentException("chunkSize must be >= 1");
|
||||
}
|
||||
this.chunkSize = chunkSize;
|
||||
this.storedFieldsIndexFormat = storedFieldsIndexFormat;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new {@link CompressingStoredFieldsFormat} with an in-memory
|
||||
* {@link CompressingStoredFieldsIndex}.
|
||||
*
|
||||
* @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(CompressionMode, int, CompressingStoredFieldsIndex)
|
||||
*/
|
||||
public CompressingStoredFieldsFormat(CompressionMode compressionMode, int chunkSize) {
|
||||
this (compressionMode, chunkSize, chunkSize == 1
|
||||
? CompressingStoredFieldsIndex.MEMORY_DOC
|
||||
: CompressingStoredFieldsIndex.MEMORY_CHUNK);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new {@link CompressingStoredFieldsFormat} with
|
||||
* {@link CompressionMode#FAST} compression and chunks of <tt>16 KB</tt>.
|
||||
*
|
||||
* @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(CompressionMode, int)
|
||||
*/
|
||||
public CompressingStoredFieldsFormat() {
|
||||
this(CompressionMode.FAST, 1 << 14);
|
||||
}
|
||||
|
||||
@Override
|
||||
public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si,
|
||||
FieldInfos fn, IOContext context) throws IOException {
|
||||
return new CompressingStoredFieldsReader(directory, si, fn, context);
|
||||
}
|
||||
|
||||
@Override
|
||||
public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si,
|
||||
IOContext context) throws IOException {
|
||||
return new CompressingStoredFieldsWriter(directory, si, context,
|
||||
compressionMode, chunkSize, storedFieldsIndexFormat);
|
||||
}
|
||||
|
||||
}
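The three constructors above trade off compression ratio, speed, and memory as described in the javadocs. A short sketch of the configurations they produce (illustrative only, not part of this commit; the class and field names below are made up):

import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndex;
import org.apache.lucene.codecs.compressing.CompressionMode;

// Illustrative configurations of the new format; names are examples only.
class StoredFieldsCompressionExamples {

  // No-arg constructor: LZ4 (FAST), 16 KB chunks, MEMORY_CHUNK fields index.
  static final StoredFieldsFormat DEFAULTS = new CompressingStoredFieldsFormat();

  // Better ratio at the cost of speed: deflate with 64 KB chunks. The two-arg
  // constructor picks MEMORY_CHUNK here because chunkSize != 1.
  static final StoredFieldsFormat HIGH_RATIO =
      new CompressingStoredFieldsFormat(CompressionMode.HIGH_COMPRESSION, 1 << 16);

  // One document per chunk, indexed per document in memory; useful when there
  // is redundancy across the fields of a single document.
  static final StoredFieldsFormat PER_DOCUMENT =
      new CompressingStoredFieldsFormat(CompressionMode.FAST, 1,
          CompressingStoredFieldsIndex.MEMORY_DOC);
}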
@@ -0,0 +1,411 @@ CompressingStoredFieldsIndex.java (new file)
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.packed.GrowableWriter;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* A format for the stored fields index file (.fdx).
|
||||
* <p>
|
||||
* These formats allow different memory/speed trade-offs to locate documents
|
||||
* into the fields data file (.fdt).
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public enum CompressingStoredFieldsIndex {
|
||||
|
||||
/**
|
||||
* This format stores the document index on disk using 64-bits pointers to
|
||||
* the start offsets of chunks in the fields data file.
|
||||
* <p>
|
||||
* This format has no memory overhead and requires at most 1 disk seek to
|
||||
* locate a document in the fields data file. Use this format in
|
||||
* memory-constrained environments.
|
||||
*/
|
||||
DISK_DOC(0) {
|
||||
@Override
|
||||
Writer newWriter(IndexOutput out) {
|
||||
return new DiskDocFieldsIndexWriter(out);
|
||||
}
|
||||
@Override
|
||||
Reader newReader(IndexInput in, SegmentInfo si) throws IOException {
|
||||
return new DiskDocFieldsIndexReader(in, si);
|
||||
}
|
||||
},
|
||||
|
||||
/**
|
||||
* For every document in the segment, this format stores the offset of the
|
||||
* compressed chunk that contains it in the fields data file.
|
||||
* <p>
|
||||
* This fields index format requires at most <code>8 * numDocs</code> bytes
|
||||
* of memory. Locating a document in the fields data file requires no disk
|
||||
* seek. Use this format when blocks are very likely to contain few
|
||||
* documents (in particular when <code>chunkSize = 1</code>).
|
||||
*/
|
||||
MEMORY_DOC(1) {
|
||||
@Override
|
||||
Writer newWriter(IndexOutput out) throws IOException {
|
||||
return new ChunksFieldsIndexWriter(out);
|
||||
}
|
||||
@Override
|
||||
Reader newReader(IndexInput in, SegmentInfo si) throws IOException {
|
||||
return new MemoryDocFieldsIndexReader(in, si);
|
||||
}
|
||||
},
|
||||
|
||||
/**
|
||||
* For every chunk of compressed documents, this format stores the first doc
|
||||
* ID of the chunk as well as the start offset of the chunk.
|
||||
* <p>
|
||||
* This fields index format requires at most
* <code>12 * numChunks</code> bytes of memory. Locating a document in the
|
||||
* fields data file requires no disk seek. Use this format when chunks are
|
||||
* likely to contain several documents.
|
||||
*/
|
||||
MEMORY_CHUNK(2) {
|
||||
@Override
|
||||
Writer newWriter(IndexOutput out) throws IOException {
|
||||
return new ChunksFieldsIndexWriter(out);
|
||||
}
|
||||
@Override
|
||||
Reader newReader(IndexInput in, SegmentInfo si) throws IOException {
|
||||
return new MemoryChunkFieldsIndexReader(in, si);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Retrieve a {@link CompressingStoredFieldsIndex} according to its
|
||||
* <code>ID</code>.
|
||||
*/
|
||||
public static CompressingStoredFieldsIndex byId(int id) {
|
||||
for (CompressingStoredFieldsIndex idx : CompressingStoredFieldsIndex.values()) {
|
||||
if (idx.getId() == id) {
|
||||
return idx;
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("Unknown id: " + id);
|
||||
}
|
||||
|
||||
private final int id;
|
||||
|
||||
private CompressingStoredFieldsIndex(int id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an ID for this compression mode. Should be unique across
|
||||
* {@link CompressionMode}s as it is used for serialization and
|
||||
* unserialization.
|
||||
*/
|
||||
public final int getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
abstract Writer newWriter(IndexOutput out) throws IOException;
|
||||
|
||||
abstract Reader newReader(IndexInput in, SegmentInfo si) throws IOException;
|
||||
|
||||
static abstract class Writer implements Closeable {
|
||||
|
||||
protected final IndexOutput fieldsIndexOut;
|
||||
|
||||
Writer(IndexOutput indexOutput) {
|
||||
this.fieldsIndexOut = indexOutput;
|
||||
}
|
||||
|
||||
/** Write the index file for a chunk of <code>numDocs</code> docs starting
|
||||
* at offset <code>startPointer</code>. */
|
||||
abstract void writeIndex(int numDocs, long startPointer) throws IOException;
|
||||
|
||||
/** Finish writing an index file of <code>numDocs</code> documents. */
|
||||
abstract void finish(int numDocs) throws IOException;
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
fieldsIndexOut.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static class DiskDocFieldsIndexWriter extends Writer {
|
||||
|
||||
final long startOffset;
|
||||
|
||||
DiskDocFieldsIndexWriter(IndexOutput fieldsIndexOut) {
|
||||
super(fieldsIndexOut);
|
||||
startOffset = fieldsIndexOut.getFilePointer();
|
||||
}
|
||||
|
||||
@Override
|
||||
void writeIndex(int numDocs, long startPointer) throws IOException {
|
||||
for (int i = 0; i < numDocs; ++i) {
|
||||
fieldsIndexOut.writeLong(startPointer);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
void finish(int numDocs) throws IOException {
|
||||
if (startOffset + ((long) numDocs) * 8 != fieldsIndexOut.getFilePointer()) {
|
||||
// see Lucene40StoredFieldsWriter#finish
|
||||
throw new RuntimeException((fieldsIndexOut.getFilePointer() - startOffset)/8 + " fdx size mismatch: docCount is " + numDocs + " but fdx file size is " + fieldsIndexOut.getFilePointer() + " file=" + fieldsIndexOut.toString() + "; now aborting this merge to prevent index corruption");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static class ChunksFieldsIndexWriter extends Writer {
|
||||
|
||||
int numChunks;
|
||||
long maxStartPointer;
|
||||
GrowableWriter docBaseDeltas;
|
||||
GrowableWriter startPointerDeltas;
|
||||
|
||||
ChunksFieldsIndexWriter(IndexOutput indexOutput) {
|
||||
super(indexOutput);
|
||||
numChunks = 0;
|
||||
maxStartPointer = 0;
|
||||
docBaseDeltas = new GrowableWriter(2, 128, PackedInts.COMPACT);
|
||||
startPointerDeltas = new GrowableWriter(5, 128, PackedInts.COMPACT);
|
||||
}
|
||||
|
||||
@Override
|
||||
void writeIndex(int numDocs, long startPointer) throws IOException {
|
||||
if (numChunks == docBaseDeltas.size()) {
|
||||
final int newSize = ArrayUtil.oversize(numChunks + 1, 1);
|
||||
docBaseDeltas = docBaseDeltas.resize(newSize);
|
||||
startPointerDeltas = startPointerDeltas.resize(newSize);
|
||||
}
|
||||
docBaseDeltas.set(numChunks, numDocs);
|
||||
startPointerDeltas.set(numChunks, startPointer - maxStartPointer);
|
||||
|
||||
++numChunks;
|
||||
maxStartPointer = startPointer;
|
||||
}
|
||||
|
||||
@Override
|
||||
void finish(int numDocs) throws IOException {
|
||||
if (numChunks != docBaseDeltas.size()) {
|
||||
docBaseDeltas = docBaseDeltas.resize(numChunks);
|
||||
startPointerDeltas = startPointerDeltas.resize(numChunks);
|
||||
}
|
||||
fieldsIndexOut.writeVInt(numChunks);
|
||||
fieldsIndexOut.writeByte((byte) PackedInts.bitsRequired(maxStartPointer));
|
||||
docBaseDeltas.save(fieldsIndexOut);
|
||||
startPointerDeltas.save(fieldsIndexOut);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static abstract class Reader implements Cloneable, Closeable {
|
||||
|
||||
protected final IndexInput fieldsIndexIn;
|
||||
|
||||
Reader(IndexInput fieldsIndexIn) {
|
||||
this.fieldsIndexIn = fieldsIndexIn;
|
||||
}
|
||||
|
||||
/** Get the start pointer of the compressed block that contains docID */
|
||||
abstract long getStartPointer(int docID) throws IOException;
|
||||
|
||||
public void close() throws IOException {
|
||||
if (fieldsIndexIn != null) {
|
||||
fieldsIndexIn.close();
|
||||
}
|
||||
}
|
||||
|
||||
public abstract Reader clone();
|
||||
}
|
||||
|
||||
private static class DiskDocFieldsIndexReader extends Reader {
|
||||
|
||||
final long startPointer;
|
||||
|
||||
DiskDocFieldsIndexReader(IndexInput fieldsIndexIn, SegmentInfo si) throws CorruptIndexException {
|
||||
this(fieldsIndexIn, fieldsIndexIn.getFilePointer());
|
||||
final long indexSize = fieldsIndexIn.length() - fieldsIndexIn.getFilePointer();
|
||||
final int numDocs = (int) (indexSize >> 3);
|
||||
// Verify two sources of "maxDoc" agree:
|
||||
if (numDocs != si.getDocCount()) {
|
||||
throw new CorruptIndexException("doc counts differ for segment " + si + ": fieldsReader shows " + numDocs + " but segmentInfo shows " + si.getDocCount());
|
||||
}
|
||||
}
|
||||
|
||||
private DiskDocFieldsIndexReader(IndexInput fieldsIndexIn, long startPointer) {
|
||||
super(fieldsIndexIn);
|
||||
this.startPointer = startPointer;
|
||||
}
|
||||
|
||||
@Override
|
||||
long getStartPointer(int docID) throws IOException {
|
||||
fieldsIndexIn.seek(startPointer + docID * 8L);
|
||||
return fieldsIndexIn.readLong();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader clone() {
|
||||
return new DiskDocFieldsIndexReader(fieldsIndexIn.clone(), startPointer);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static class MemoryDocFieldsIndexReader extends Reader {
|
||||
|
||||
private final PackedInts.Reader startPointers;
|
||||
|
||||
MemoryDocFieldsIndexReader(IndexInput fieldsIndexIn, SegmentInfo si) throws IOException {
|
||||
super(fieldsIndexIn);
|
||||
final int numChunks = fieldsIndexIn.readVInt();
|
||||
final int bitsPerStartPointer = fieldsIndexIn.readByte() & 0xFF;
|
||||
if (bitsPerStartPointer > 64) {
|
||||
throw new CorruptIndexException("Corrupted");
|
||||
}
|
||||
|
||||
final PackedInts.Reader chunkDocs = PackedInts.getReader(fieldsIndexIn);
|
||||
if (chunkDocs.size() != numChunks) {
|
||||
throw new CorruptIndexException("Expected " + numChunks + " chunks, but got " + chunkDocs.size());
|
||||
}
|
||||
|
||||
final PackedInts.ReaderIterator startPointerDeltas = PackedInts.getReaderIterator(fieldsIndexIn, PackedInts.DEFAULT_BUFFER_SIZE);
|
||||
if (startPointerDeltas.size() != numChunks) {
|
||||
throw new CorruptIndexException("Expected " + numChunks + " chunks, but got " + startPointerDeltas.size());
|
||||
}
|
||||
final PackedInts.Mutable startPointers = PackedInts.getMutable(si.getDocCount(), bitsPerStartPointer, PackedInts.COMPACT);
|
||||
int docID = 0;
|
||||
long startPointer = 0;
|
||||
for (int i = 0; i < numChunks; ++i) {
|
||||
startPointer += startPointerDeltas.next();
|
||||
final int chunkDocCount = (int) chunkDocs.get(i);
|
||||
for (int j = 0; j < chunkDocCount; ++j) {
|
||||
startPointers.set(docID++, startPointer);
|
||||
}
|
||||
}
|
||||
if (docID != si.getDocCount()) {
|
||||
throw new CorruptIndexException("Expected " + si.getDocCount() + " docs, got " + docID);
|
||||
}
|
||||
|
||||
this.startPointers = startPointers;
|
||||
}
|
||||
|
||||
private MemoryDocFieldsIndexReader(PackedInts.Reader startPointers) {
|
||||
super(null);
|
||||
this.startPointers = startPointers;
|
||||
}
|
||||
|
||||
@Override
|
||||
long getStartPointer(int docID) throws IOException {
|
||||
return startPointers.get(docID);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader clone() {
|
||||
if (fieldsIndexIn == null) {
|
||||
return this;
|
||||
} else {
|
||||
return new MemoryDocFieldsIndexReader(startPointers);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static class MemoryChunkFieldsIndexReader extends Reader {
|
||||
|
||||
private final PackedInts.Reader docBases;
|
||||
private final PackedInts.Reader startPointers;
|
||||
|
||||
MemoryChunkFieldsIndexReader(IndexInput fieldsIndexIn, SegmentInfo si) throws IOException {
|
||||
super(fieldsIndexIn);
|
||||
final int numChunks = fieldsIndexIn.readVInt();
|
||||
final int bitsPerStartPointer = fieldsIndexIn.readByte() & 0xFF;
|
||||
if (bitsPerStartPointer > 64) {
|
||||
throw new CorruptIndexException("Corrupted");
|
||||
}
|
||||
|
||||
final PackedInts.ReaderIterator docBaseDeltas = PackedInts.getReaderIterator(fieldsIndexIn, PackedInts.DEFAULT_BUFFER_SIZE);
|
||||
if (docBaseDeltas.size() != numChunks) {
|
||||
throw new CorruptIndexException("Expected " + numChunks + " chunks, but got " + docBaseDeltas.size());
|
||||
}
|
||||
final PackedInts.Mutable docBases = PackedInts.getMutable(numChunks, PackedInts.bitsRequired(Math.max(0, si.getDocCount() - 1)), PackedInts.COMPACT);
|
||||
|
||||
int docBase = 0;
|
||||
for (int i = 0; i < numChunks; ++i) {
|
||||
docBases.set(i, docBase);
|
||||
docBase += docBaseDeltas.next();
|
||||
}
|
||||
if (docBase != si.getDocCount()) {
|
||||
throw new CorruptIndexException("Expected " + si.getDocCount() + " docs, got " + docBase);
|
||||
}
|
||||
|
||||
final PackedInts.ReaderIterator startPointerDeltas = PackedInts.getReaderIterator(fieldsIndexIn, PackedInts.DEFAULT_BUFFER_SIZE);
|
||||
if (startPointerDeltas.size() != numChunks) {
|
||||
throw new CorruptIndexException("Expected " + numChunks + " chunks, but got " + startPointerDeltas.size());
|
||||
}
|
||||
final PackedInts.Mutable startPointers = PackedInts.getMutable(numChunks, bitsPerStartPointer, PackedInts.COMPACT);
|
||||
int startPointer = 0;
|
||||
for (int i = 0; i < numChunks; ++i) {
|
||||
startPointer += startPointerDeltas.next();
|
||||
startPointers.set(i, startPointer);
|
||||
}
|
||||
|
||||
this.docBases = docBases;
|
||||
this.startPointers = startPointers;
|
||||
}
|
||||
|
||||
private MemoryChunkFieldsIndexReader(PackedInts.Reader docBases, PackedInts.Reader startPointers) {
|
||||
super(null);
|
||||
this.docBases = docBases;
|
||||
this.startPointers = startPointers;
|
||||
}
|
||||
|
||||
@Override
|
||||
long getStartPointer(int docID) {
|
||||
assert docBases.size() > 0;
|
||||
int lo = 0, hi = docBases.size() - 1;
|
||||
while (lo <= hi) {
|
||||
final int mid = (lo + hi) >>> 1;
|
||||
final long midValue = docBases.get(mid);
|
||||
if (midValue == docID) {
|
||||
return startPointers.get(mid);
|
||||
} else if (midValue < docID) {
|
||||
lo = mid + 1;
|
||||
} else {
|
||||
hi = mid - 1;
|
||||
}
|
||||
}
|
||||
return startPointers.get(hi);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader clone() {
|
||||
if (fieldsIndexIn == null) {
|
||||
return this;
|
||||
} else {
|
||||
return new MemoryChunkFieldsIndexReader(docBases, startPointers);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
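The chunk-based writer above stores only deltas (docs per chunk, start-pointer increments); the in-memory readers rebuild absolute values with prefix sums and then binary-search the chunk holding a given document. A toy model of that reconstruction using plain arrays (illustrative only, not part of this commit):

// Toy model of MEMORY_CHUNK: rebuild absolute doc bases and start pointers
// from the deltas written by ChunksFieldsIndexWriter, then binary-search.
class ChunkIndexSketch {

  final int[] docBases;        // first doc ID of each chunk
  final long[] startPointers;  // start offset of each chunk in the fields data file

  ChunkIndexSketch(int[] docsPerChunk, long[] startPointerDeltas) {
    final int numChunks = docsPerChunk.length;
    docBases = new int[numChunks];
    startPointers = new long[numChunks];
    int docBase = 0;
    long startPointer = 0;
    for (int i = 0; i < numChunks; ++i) {
      docBases[i] = docBase;            // prefix sum of chunk doc counts
      docBase += docsPerChunk[i];
      startPointer += startPointerDeltas[i];
      startPointers[i] = startPointer;  // prefix sum of pointer deltas
    }
  }

  // Same search as MemoryChunkFieldsIndexReader: last chunk whose docBase <= docID.
  long getStartPointer(int docID) {
    int lo = 0, hi = docBases.length - 1;
    while (lo <= hi) {
      final int mid = (lo + hi) >>> 1;
      if (docBases[mid] == docID) {
        return startPointers[mid];
      } else if (docBases[mid] < docID) {
        lo = mid + 1;
      } else {
        hi = mid - 1;
      }
    }
    return startPointers[hi];
  }
}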
@@ -0,0 +1,341 @@ CompressingStoredFieldsReader.java (new file)
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.BYTE_ARR;
|
||||
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_NAME_DAT;
|
||||
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_NAME_IDX;
|
||||
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HEADER_LENGTH_DAT;
|
||||
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HEADER_LENGTH_IDX;
|
||||
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_DOUBLE;
|
||||
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_FLOAT;
|
||||
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_INT;
|
||||
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_LONG;
|
||||
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.STRING;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_BITS;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_MASK;
|
||||
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_CURRENT;
|
||||
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_START;
|
||||
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_EXTENSION;
|
||||
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.StoredFieldsReader;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.index.StoredFieldVisitor;
|
||||
import org.apache.lucene.store.AlreadyClosedException;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
final class CompressingStoredFieldsReader extends StoredFieldsReader {
|
||||
|
||||
private final FieldInfos fieldInfos;
|
||||
private final CompressingStoredFieldsIndex.Reader indexReader;
|
||||
private final IndexInput fieldsStream;
|
||||
private final int packedIntsVersion;
|
||||
private final CompressionMode compressionMode;
|
||||
private final Uncompressor uncompressor;
|
||||
private final BytesRef bytes;
|
||||
private final int numDocs;
|
||||
private boolean closed;
|
||||
|
||||
// used by clone
|
||||
private CompressingStoredFieldsReader(CompressingStoredFieldsReader reader) {
|
||||
this.fieldInfos = reader.fieldInfos;
|
||||
this.fieldsStream = reader.fieldsStream.clone();
|
||||
this.indexReader = reader.indexReader.clone();
|
||||
this.packedIntsVersion = reader.packedIntsVersion;
|
||||
this.compressionMode = reader.compressionMode;
|
||||
this.uncompressor = reader.uncompressor.clone();
|
||||
this.numDocs = reader.numDocs;
|
||||
this.bytes = new BytesRef(reader.bytes.bytes.length);
|
||||
this.closed = false;
|
||||
}
|
||||
|
||||
public CompressingStoredFieldsReader(Directory d, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
|
||||
final String segment = si.name;
|
||||
boolean success = false;
|
||||
fieldInfos = fn;
|
||||
numDocs = si.getDocCount();
|
||||
IndexInput indexStream = null;
|
||||
try {
|
||||
fieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION), context);
|
||||
final String indexStreamFN = IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION);
|
||||
indexStream = d.openInput(indexStreamFN, context);
|
||||
|
||||
CodecUtil.checkHeader(indexStream, CODEC_NAME_IDX, VERSION_START, VERSION_CURRENT);
|
||||
CodecUtil.checkHeader(fieldsStream, CODEC_NAME_DAT, VERSION_START, VERSION_CURRENT);
|
||||
assert HEADER_LENGTH_DAT == fieldsStream.getFilePointer();
|
||||
assert HEADER_LENGTH_IDX == indexStream.getFilePointer();
|
||||
|
||||
final int storedFieldsIndexId = indexStream.readVInt();
|
||||
final CompressingStoredFieldsIndex storedFieldsIndex = CompressingStoredFieldsIndex.byId(storedFieldsIndexId);
|
||||
indexReader = storedFieldsIndex.newReader(indexStream, si);
|
||||
indexStream = null;
|
||||
|
||||
packedIntsVersion = fieldsStream.readVInt();
|
||||
final int compressionModeId = fieldsStream.readVInt();
|
||||
compressionMode = CompressionMode.byId(compressionModeId);
|
||||
uncompressor = compressionMode.newUncompressor();
|
||||
this.bytes = new BytesRef();
|
||||
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(this, indexStream);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws AlreadyClosedException if this FieldsReader is closed
|
||||
*/
|
||||
private void ensureOpen() throws AlreadyClosedException {
|
||||
if (closed) {
|
||||
throw new AlreadyClosedException("this FieldsReader is closed");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if (!closed) {
|
||||
IOUtils.close(fieldsStream, indexReader);
|
||||
closed = true;
|
||||
}
|
||||
}
|
||||
|
||||
private static void readField(ByteArrayDataInput in, StoredFieldVisitor visitor, FieldInfo info, int bits) throws IOException {
|
||||
switch (bits & TYPE_MASK) {
|
||||
case BYTE_ARR:
|
||||
int length = in.readVInt();
|
||||
byte[] data = new byte[length];
|
||||
in.readBytes(data, 0, length);
|
||||
visitor.binaryField(info, data);
|
||||
break;
|
||||
case STRING:
|
||||
length = in.readVInt();
|
||||
data = new byte[length];
|
||||
in.readBytes(data, 0, length);
|
||||
visitor.stringField(info, new String(data, IOUtils.CHARSET_UTF_8));
|
||||
break;
|
||||
case NUMERIC_INT:
|
||||
visitor.intField(info, in.readInt());
|
||||
break;
|
||||
case NUMERIC_FLOAT:
|
||||
visitor.floatField(info, Float.intBitsToFloat(in.readInt()));
|
||||
break;
|
||||
case NUMERIC_LONG:
|
||||
visitor.longField(info, in.readLong());
|
||||
break;
|
||||
case NUMERIC_DOUBLE:
|
||||
visitor.doubleField(info, Double.longBitsToDouble(in.readLong()));
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits));
|
||||
}
|
||||
}
|
||||
|
||||
private static void skipField(ByteArrayDataInput in, int bits) throws IOException {
|
||||
switch (bits & TYPE_MASK) {
|
||||
case BYTE_ARR:
|
||||
case STRING:
|
||||
final int length = in.readVInt();
|
||||
in.skipBytes(length);
|
||||
break;
|
||||
case NUMERIC_INT:
|
||||
case NUMERIC_FLOAT:
|
||||
in.readInt();
|
||||
break;
|
||||
case NUMERIC_LONG:
|
||||
case NUMERIC_DOUBLE:
|
||||
in.readLong();
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visitDocument(int docID, StoredFieldVisitor visitor)
|
||||
throws IOException {
|
||||
fieldsStream.seek(indexReader.getStartPointer(docID));
|
||||
|
||||
final int docBase = fieldsStream.readVInt();
|
||||
final int chunkDocs = fieldsStream.readVInt();
|
||||
final int bitsPerValue = fieldsStream.readVInt();
|
||||
if (docID < docBase
|
||||
|| docID >= docBase + chunkDocs
|
||||
|| docBase + chunkDocs > numDocs
|
||||
|| bitsPerValue > 31) {
|
||||
throw new CorruptIndexException("Corrupted: docID=" + docID
|
||||
+ ", docBase=" + docBase + ", chunkDocs=" + chunkDocs
|
||||
+ ", numDocs=" + numDocs + ", bitsPerValue=" + bitsPerValue);
|
||||
}
|
||||
|
||||
final long filePointer = fieldsStream.getFilePointer();
|
||||
final PackedInts.ReaderIterator lengths = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerValue, 1);
|
||||
int offset = 0;
|
||||
for (int i = docBase; i < docID; ++i) {
|
||||
offset += lengths.next();
|
||||
}
|
||||
final int length = (int) lengths.next();
|
||||
// skip the last values
|
||||
fieldsStream.seek(filePointer + (PackedInts.Format.PACKED.nblocks(bitsPerValue, chunkDocs) << 3));
|
||||
|
||||
uncompressor.uncompress(fieldsStream, offset, length, bytes);
|
||||
|
||||
final ByteArrayDataInput documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);
|
||||
final int numFields = documentInput.readVInt();
|
||||
for (int fieldIDX = 0; fieldIDX < numFields; fieldIDX++) {
|
||||
final long infoAndBits = documentInput.readVLong();
|
||||
final int fieldNumber = (int) (infoAndBits >>> TYPE_BITS);
|
||||
FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
|
||||
|
||||
final int bits = (int) (infoAndBits & TYPE_MASK);
|
||||
assert bits <= NUMERIC_DOUBLE: "bits=" + Integer.toHexString(bits);
|
||||
|
||||
switch(visitor.needsField(fieldInfo)) {
|
||||
case YES:
|
||||
readField(documentInput, visitor, fieldInfo, bits);
|
||||
break;
|
||||
case NO:
|
||||
skipField(documentInput, bits);
|
||||
break;
|
||||
case STOP:
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public StoredFieldsReader clone() {
|
||||
ensureOpen();
|
||||
return new CompressingStoredFieldsReader(this);
|
||||
}
|
||||
|
||||
CompressionMode getCompressionMode() {
|
||||
return compressionMode;
|
||||
}
|
||||
|
||||
ChunkIterator chunkIterator(int startDocID) throws IOException {
|
||||
ensureOpen();
|
||||
fieldsStream.seek(indexReader.getStartPointer(startDocID));
|
||||
return new ChunkIterator();
|
||||
}
|
||||
|
||||
final class ChunkIterator {
|
||||
|
||||
BytesRef bytes;
|
||||
int docBase;
|
||||
int chunkDocs;
|
||||
int[] lengths;
|
||||
|
||||
private ChunkIterator() {
|
||||
this.docBase = -1;
|
||||
bytes = new BytesRef();
|
||||
lengths = new int[0];
|
||||
}
|
||||
|
||||
private int readHeader() throws IOException {
|
||||
final int docBase = fieldsStream.readVInt();
|
||||
final int chunkDocs = fieldsStream.readVInt();
|
||||
final int bitsPerValue = fieldsStream.readVInt();
|
||||
if (docBase < this.docBase + this.chunkDocs
|
||||
|| docBase + chunkDocs > numDocs
|
||||
|| bitsPerValue > 31) {
|
||||
throw new CorruptIndexException("Corrupted: current docBase=" + this.docBase
|
||||
+ ", current numDocs=" + this.chunkDocs + ", new docBase=" + docBase
|
||||
+ ", new numDocs=" + chunkDocs + ", bitsPerValue=" + bitsPerValue);
|
||||
}
|
||||
this.docBase = docBase;
|
||||
this.chunkDocs = chunkDocs;
|
||||
return bitsPerValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the uncompressed size of the chunk
|
||||
*/
|
||||
int chunkSize() {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < chunkDocs; ++i) {
|
||||
sum += lengths[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Go to the chunk containing the provided doc ID.
|
||||
*/
|
||||
void next(int doc) throws IOException {
|
||||
assert doc >= docBase + chunkDocs : doc + " " + docBase + " " + chunkDocs;
|
||||
// try next chunk
|
||||
int bitsPerValue = readHeader();
|
||||
if (docBase + chunkDocs <= doc) {
|
||||
// doc is not in the next chunk, seek straight to the chunk that contains it
|
||||
fieldsStream.seek(indexReader.getStartPointer(doc));
|
||||
bitsPerValue = readHeader();
|
||||
}
|
||||
if (doc < docBase
|
||||
|| doc >= docBase + chunkDocs) {
|
||||
throw new CorruptIndexException("Corrupted: docID=" + doc
|
||||
+ ", docBase=" + docBase + ", chunkDocs=" + chunkDocs);
|
||||
}
|
||||
|
||||
// decode lengths
|
||||
if (lengths.length < chunkDocs) {
|
||||
lengths = new int[ArrayUtil.oversize(chunkDocs, 4)];
|
||||
}
|
||||
final PackedInts.ReaderIterator iterator = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerValue, 0);
|
||||
for (int i = 0; i < chunkDocs; ++i) {
|
||||
lengths[i] = (int) iterator.next();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Uncompress the chunk.
|
||||
*/
|
||||
void uncompress() throws IOException {
|
||||
// uncompress data
|
||||
uncompressor.uncompress(fieldsStream, bytes);
|
||||
if (bytes.length != chunkSize()) {
|
||||
throw new CorruptIndexException("Corrupted: expected chunk size = " + chunkSize() + ", got " + bytes.length);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy compressed data.
|
||||
*/
|
||||
void copyCompressedData(DataOutput out) throws IOException {
|
||||
uncompressor.copyCompressedData(fieldsStream, out);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
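visitDocument above relies on a simple chunk layout: a header (docBase, chunkDocs, bitsPerValue), the packed per-document lengths, then the compressed documents. The offset of a document inside the uncompressed chunk is the sum of the lengths of the documents before it; a toy model of that arithmetic (illustrative only, not part of this commit):

// Toy model of the offset computation done in visitDocument: lengths[i] is the
// serialized size of document (docBase + i) in the chunk.
class ChunkOffsets {

  static int offsetOf(int[] lengths, int docBase, int docID) {
    int offset = 0;
    for (int i = docBase; i < docID; ++i) {
      offset += lengths[i - docBase];  // skip the documents before docID
    }
    return offset;
  }

  static int lengthOf(int[] lengths, int docBase, int docID) {
    return lengths[docID - docBase];   // only this slice of the chunk needs decompressing
  }
}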
@@ -0,0 +1,391 @@ CompressingStoredFieldsWriter.java (new file)
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_EXTENSION;
|
||||
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.StoredFieldsReader;
|
||||
import org.apache.lucene.codecs.StoredFieldsWriter;
|
||||
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsReader.ChunkIterator;
|
||||
import org.apache.lucene.index.AtomicReader;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.MergeState;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.index.SegmentReader;
|
||||
import org.apache.lucene.index.StorableField;
|
||||
import org.apache.lucene.index.StoredDocument;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
||||
|
||||
static final int STRING = 0x00;
|
||||
static final int BYTE_ARR = 0x01;
|
||||
static final int NUMERIC_INT = 0x02;
|
||||
static final int NUMERIC_FLOAT = 0x03;
|
||||
static final int NUMERIC_LONG = 0x04;
|
||||
static final int NUMERIC_DOUBLE = 0x05;
|
||||
|
||||
static final int TYPE_BITS = PackedInts.bitsRequired(NUMERIC_DOUBLE);
|
||||
static final int TYPE_MASK = (int) PackedInts.maxValue(TYPE_BITS);
|
||||
|
||||
static final String CODEC_NAME_IDX = "CompressingStoredFieldsIndex";
|
||||
static final String CODEC_NAME_DAT = "CompressingStoredFieldsData";
|
||||
static final int VERSION_START = 0;
|
||||
static final int VERSION_CURRENT = VERSION_START;
|
||||
static final long HEADER_LENGTH_IDX = CodecUtil.headerLength(CODEC_NAME_IDX);
|
||||
static final long HEADER_LENGTH_DAT = CodecUtil.headerLength(CODEC_NAME_DAT);
|
||||
|
||||
private final Directory directory;
|
||||
private final String segment;
|
||||
private CompressingStoredFieldsIndex.Writer indexWriter;
|
||||
private IndexOutput fieldsStream;
|
||||
|
||||
private final CompressionMode compressionMode;
|
||||
private final Compressor compressor;
|
||||
private final int chunkSize;
|
||||
|
||||
private final GrowableByteArrayDataOutput bufferedDocs;
|
||||
private int[] endOffsets; // end offsets in bufferedDocs
|
||||
private int docBase; // doc ID at the beginning of the chunk
|
||||
private int numBufferedDocs; // docBase + numBufferedDocs == current doc ID
|
||||
|
||||
public CompressingStoredFieldsWriter(Directory directory, SegmentInfo si,
|
||||
IOContext context, CompressionMode compressionMode, int chunkSize, CompressingStoredFieldsIndex storedFieldsIndex) throws IOException {
|
||||
assert directory != null;
|
||||
this.directory = directory;
|
||||
this.segment = si.name;
|
||||
this.compressionMode = compressionMode;
|
||||
this.compressor = compressionMode.newCompressor();
|
||||
this.chunkSize = chunkSize;
|
||||
this.docBase = 0;
|
||||
this.bufferedDocs = new GrowableByteArrayDataOutput(chunkSize);
|
||||
this.endOffsets = new int[16];
|
||||
this.numBufferedDocs = 0;
|
||||
|
||||
boolean success = false;
|
||||
IndexOutput indexStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION), context);
|
||||
try {
|
||||
fieldsStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION), context);
|
||||
|
||||
CodecUtil.writeHeader(indexStream, CODEC_NAME_IDX, VERSION_CURRENT);
|
||||
CodecUtil.writeHeader(fieldsStream, CODEC_NAME_DAT, VERSION_CURRENT);
|
||||
assert HEADER_LENGTH_IDX == indexStream.getFilePointer();
|
||||
assert HEADER_LENGTH_DAT == fieldsStream.getFilePointer();
|
||||
|
||||
indexStream.writeVInt(storedFieldsIndex.getId());
|
||||
indexWriter = storedFieldsIndex.newWriter(indexStream);
|
||||
indexStream = null;
|
||||
|
||||
fieldsStream.writeVInt(PackedInts.VERSION_CURRENT);
|
||||
fieldsStream.writeVInt(compressionMode.getId());
|
||||
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(indexStream);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
try {
|
||||
IOUtils.close(fieldsStream, indexWriter);
|
||||
} finally {
|
||||
fieldsStream = null;
|
||||
indexWriter = null;
|
||||
}
|
||||
}
|
||||
|
||||
private void endWithPreviousDocument() throws IOException {
|
||||
if (numBufferedDocs > 0) {
|
||||
assert bufferedDocs.length > 0;
|
||||
if (numBufferedDocs == endOffsets.length) {
|
||||
endOffsets = ArrayUtil.grow(endOffsets);
|
||||
}
|
||||
endOffsets[numBufferedDocs - 1] = bufferedDocs.length;
|
||||
}
|
||||
}
|
||||
|
||||
private void addRawDocument(byte[] buf, int off, int len) throws IOException {
|
||||
endWithPreviousDocument();
|
||||
if (bufferedDocs.length >= chunkSize) {
|
||||
flush();
|
||||
}
|
||||
bufferedDocs.writeBytes(buf, off, len);
|
||||
++numBufferedDocs;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startDocument(int numStoredFields) throws IOException {
|
||||
endWithPreviousDocument();
|
||||
if (bufferedDocs.length >= chunkSize) {
|
||||
flush();
|
||||
}
|
||||
bufferedDocs.writeVInt(numStoredFields);
|
||||
++numBufferedDocs;
|
||||
}
|
||||
|
||||
private void writeHeader(int docBase, int numBufferedDocs, int[] lengths) throws IOException {
|
||||
// save docBase and numBufferedDocs
|
||||
fieldsStream.writeVInt(docBase);
|
||||
fieldsStream.writeVInt(numBufferedDocs);
|
||||
|
||||
// save lengths
|
||||
final int bitsRequired = bitsRequired(lengths, numBufferedDocs);
|
||||
assert bitsRequired <= 31;
|
||||
fieldsStream.writeVInt(bitsRequired);
|
||||
|
||||
final PackedInts.Writer writer = PackedInts.getWriterNoHeader(fieldsStream, PackedInts.Format.PACKED, numBufferedDocs, bitsRequired, 1);
|
||||
for (int i = 0; i < numBufferedDocs; ++i) {
|
||||
assert lengths[i] > 0;
|
||||
writer.add(lengths[i]);
|
||||
}
|
||||
assert writer.ord() + 1 == numBufferedDocs;
|
||||
writer.finish();
|
||||
}
|
||||
|
||||
private void flush() throws IOException {
|
||||
indexWriter.writeIndex(numBufferedDocs, fieldsStream.getFilePointer());
|
||||
|
||||
// transform end offsets into lengths
|
||||
final int[] lengths = endOffsets;
|
||||
for (int i = numBufferedDocs - 1; i > 0; --i) {
|
||||
lengths[i] = endOffsets[i] - endOffsets[i - 1];
|
||||
assert lengths[i] > 0;
|
||||
}
|
||||
writeHeader(docBase, numBufferedDocs, lengths);
|
||||
|
||||
// compress stored fields to fieldsStream
|
||||
compressor.compress(bufferedDocs.bytes, 0, bufferedDocs.length, fieldsStream);
|
||||
|
||||
// reset
|
||||
docBase += numBufferedDocs;
|
||||
numBufferedDocs = 0;
|
||||
bufferedDocs.length = 0;
|
||||
}
|
||||
|
||||
private static int bitsRequired(int[] data, int length) {
|
||||
int or = data[0];
|
||||
for (int i = 1; i < length; ++i) {
|
||||
or |= data[i];
|
||||
}
|
||||
return PackedInts.bitsRequired(or);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeField(FieldInfo info, StorableField field)
|
||||
throws IOException {
|
||||
int bits = 0;
|
||||
final BytesRef bytes;
|
||||
final String string;
|
||||
|
||||
Number number = field.numericValue();
|
||||
if (number != null) {
|
||||
if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
|
||||
bits = NUMERIC_INT;
|
||||
} else if (number instanceof Long) {
|
||||
bits = NUMERIC_LONG;
|
||||
} else if (number instanceof Float) {
|
||||
bits = NUMERIC_FLOAT;
|
||||
} else if (number instanceof Double) {
|
||||
bits = NUMERIC_DOUBLE;
|
||||
} else {
|
||||
throw new IllegalArgumentException("cannot store numeric type " + number.getClass());
|
||||
}
|
||||
string = null;
|
||||
bytes = null;
|
||||
} else {
|
||||
bytes = field.binaryValue();
|
||||
if (bytes != null) {
|
||||
bits = BYTE_ARR;
|
||||
string = null;
|
||||
} else {
|
||||
bits = STRING;
|
||||
string = field.stringValue();
|
||||
if (string == null) {
|
||||
throw new IllegalArgumentException("field " + field.name() + " is stored but does not have binaryValue, stringValue nor numericValue");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final long infoAndBits = (((long) info.number) << TYPE_BITS) | bits;
|
||||
bufferedDocs.writeVLong(infoAndBits);
|
||||
|
||||
if (bytes != null) {
|
||||
bufferedDocs.writeVInt(bytes.length);
|
||||
bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
|
||||
} else if (string != null) {
|
||||
bufferedDocs.writeString(field.stringValue());
|
||||
} else {
|
||||
if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
|
||||
bufferedDocs.writeInt(number.intValue());
|
||||
} else if (number instanceof Long) {
|
||||
bufferedDocs.writeLong(number.longValue());
|
||||
} else if (number instanceof Float) {
|
||||
bufferedDocs.writeInt(Float.floatToIntBits(number.floatValue()));
|
||||
} else if (number instanceof Double) {
|
||||
bufferedDocs.writeLong(Double.doubleToLongBits(number.doubleValue()));
|
||||
} else {
|
||||
throw new AssertionError("Cannot get here");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void abort() {
|
||||
IOUtils.closeWhileHandlingException(this);
|
||||
IOUtils.deleteFilesIgnoringExceptions(directory,
|
||||
IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION),
|
||||
IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void finish(FieldInfos fis, int numDocs) throws IOException {
|
||||
endWithPreviousDocument();
|
||||
if (numBufferedDocs > 0) {
|
||||
flush();
|
||||
}
|
||||
if (docBase != numDocs) {
|
||||
throw new RuntimeException("Wrote " + docBase + " docs, finish called with numDocs=" + numDocs);
|
||||
}
|
||||
indexWriter.finish(numDocs);
|
||||
assert bufferedDocs.length == 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int merge(MergeState mergeState) throws IOException {
|
||||
int docCount = 0;
|
||||
int idx = 0;
|
||||
|
||||
for (AtomicReader reader : mergeState.readers) {
|
||||
final SegmentReader matchingSegmentReader = mergeState.matchingSegmentReaders[idx++];
|
||||
CompressingStoredFieldsReader matchingFieldsReader = null;
|
||||
if (matchingSegmentReader != null) {
|
||||
final StoredFieldsReader fieldsReader = matchingSegmentReader.getFieldsReader();
|
||||
// we can only bulk-copy if the matching reader is also a CompressingStoredFieldsReader
|
||||
if (fieldsReader != null && fieldsReader instanceof CompressingStoredFieldsReader) {
|
||||
matchingFieldsReader = (CompressingStoredFieldsReader) fieldsReader;
|
||||
}
|
||||
}
|
||||
|
||||
final int maxDoc = reader.maxDoc();
|
||||
final Bits liveDocs = reader.getLiveDocs();
|
||||
|
||||
if (matchingFieldsReader == null) {
|
||||
// naive merge...
|
||||
for (int i = nextLiveDoc(0, liveDocs, maxDoc); i < maxDoc; i = nextLiveDoc(i + 1, liveDocs, maxDoc)) {
|
||||
StoredDocument doc = reader.document(i);
|
||||
addDocument(doc, mergeState.fieldInfos);
|
||||
++docCount;
|
||||
mergeState.checkAbort.work(300);
|
||||
}
|
||||
} else {
|
||||
int docID = nextLiveDoc(0, liveDocs, maxDoc);
|
||||
if (docID < maxDoc) {
|
||||
// not all docs were deleted
|
||||
final ChunkIterator it = matchingFieldsReader.chunkIterator(docID);
|
||||
int[] startOffsets = new int[0];
|
||||
do {
|
||||
// go to the next chunk that contains docID
|
||||
it.next(docID);
|
||||
// transform lengths into offsets
|
||||
if (startOffsets.length < it.chunkDocs) {
|
||||
startOffsets = new int[ArrayUtil.oversize(it.chunkDocs, 4)];
|
||||
}
|
||||
for (int i = 1; i < it.chunkDocs; ++i) {
|
||||
startOffsets[i] = startOffsets[i - 1] + it.lengths[i - 1];
|
||||
}
|
||||
|
||||
if (compressionMode == matchingFieldsReader.getCompressionMode() // same compression mode
|
||||
&& (numBufferedDocs == 0 || bufferedDocs.length >= chunkSize) // starting a new chunk
|
||||
&& startOffsets[it.chunkDocs - 1] < chunkSize // chunk is small enough
|
||||
&& startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1] >= chunkSize // chunk is large enough
|
||||
&& nextDeletedDoc(it.docBase, liveDocs, it.docBase + it.chunkDocs) == it.docBase + it.chunkDocs) { // no deletion in the chunk
|
||||
assert docID == it.docBase;
|
||||
|
||||
// no need to uncompress, just copy data
|
||||
endWithPreviousDocument();
|
||||
if (bufferedDocs.length >= chunkSize) {
|
||||
flush();
|
||||
}
|
||||
indexWriter.writeIndex(it.chunkDocs, fieldsStream.getFilePointer());
|
||||
writeHeader(this.docBase, it.chunkDocs, it.lengths);
|
||||
it.copyCompressedData(fieldsStream);
|
||||
this.docBase += it.chunkDocs;
|
||||
docID = nextLiveDoc(it.docBase + it.chunkDocs, liveDocs, maxDoc);
|
||||
docCount += it.chunkDocs;
|
||||
mergeState.checkAbort.work(300 * it.chunkDocs);
|
||||
} else {
|
||||
// uncompress
|
||||
it.uncompress();
|
||||
if (startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1] != it.bytes.length) {
|
||||
throw new CorruptIndexException("Corrupted: expected chunk size=" + startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1] + ", got " + it.bytes.length);
|
||||
}
|
||||
// copy non-deleted docs
|
||||
for (; docID < it.docBase + it.chunkDocs; docID = nextLiveDoc(docID + 1, liveDocs, maxDoc)) {
|
||||
final int diff = docID - it.docBase;
|
||||
addRawDocument(it.bytes.bytes, it.bytes.offset + startOffsets[diff], it.lengths[diff]);
|
||||
++docCount;
|
||||
mergeState.checkAbort.work(300);
|
||||
}
|
||||
}
|
||||
} while (docID < maxDoc);
|
||||
}
|
||||
}
|
||||
}
|
||||
finish(mergeState.fieldInfos, docCount);
|
||||
return docCount;
|
||||
}
|
||||
|
||||
private static int nextLiveDoc(int doc, Bits liveDocs, int maxDoc) {
|
||||
if (liveDocs == null) {
|
||||
return doc;
|
||||
}
|
||||
while (doc < maxDoc && !liveDocs.get(doc)) {
|
||||
++doc;
|
||||
}
|
||||
return doc;
|
||||
}
|
||||
|
||||
private static int nextDeletedDoc(int doc, Bits liveDocs, int maxDoc) {
|
||||
if (liveDocs == null) {
|
||||
return maxDoc;
|
||||
}
|
||||
while (doc < maxDoc && liveDocs.get(doc)) {
|
||||
++doc;
|
||||
}
|
||||
return doc;
|
||||
}
|
||||
|
||||
}
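writeField above packs the field number and the value type into a single VLong. With the constants declared at the top of this class, TYPE_BITS is 3 and TYPE_MASK is 7, so for example field number 42 holding a NUMERIC_LONG (0x04) is encoded as (42 << 3) | 4 = 340. A small sketch of the round trip performed by writeField and the reader's visitDocument (illustrative only, not part of this commit):

// Illustrative only: how a (field number, type) pair is packed into the VLong
// that precedes every stored field value. The constants mirror
// CompressingStoredFieldsWriter.TYPE_BITS / TYPE_MASK.
class FieldHeaderSketch {

  static final int TYPE_BITS = 3; // enough for the types 0x00..0x05
  static final int TYPE_MASK = (1 << TYPE_BITS) - 1;

  static long pack(int fieldNumber, int type) {
    return (((long) fieldNumber) << TYPE_BITS) | type;
  }

  static int fieldNumber(long infoAndBits) {
    return (int) (infoAndBits >>> TYPE_BITS);
  }

  static int type(long infoAndBits) {
    return (int) (infoAndBits & TYPE_MASK);
  }
}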
@@ -0,0 +1,343 @@ CompressionMode.java (new file)
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.zip.DataFormatException;
|
||||
import java.util.zip.Deflater;
|
||||
import java.util.zip.Inflater;
|
||||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* A compression mode. Tells how much effort should be spent on compression and
|
||||
* uncompression of stored fields.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public enum CompressionMode {
|
||||
|
||||
/**
|
||||
* A compression mode that trades compression ratio for speed. Although the
|
||||
* compression ratio might remain high, compression and uncompression are
|
||||
* very fast. Use this mode with indices that have a high update rate but
|
||||
* should be able to load documents from disk quickly.
|
||||
*/
|
||||
FAST(0) {
|
||||
|
||||
@Override
|
||||
Compressor newCompressor() {
|
||||
return LZ4_FAST_COMPRESSOR;
|
||||
}
|
||||
|
||||
@Override
|
||||
Uncompressor newUncompressor() {
|
||||
return LZ4_UNCOMPRESSOR;
|
||||
}
|
||||
|
||||
},
|
||||
|
||||
/**
|
||||
* A compression mode that trades speed for compression ratio. Although
|
||||
* compression and uncompression might be slow, this compression mode should
|
||||
* provide a good compression ratio. This mode might be interesting if/when
|
||||
* your index size is much bigger than your OS cache.
|
||||
*/
|
||||
HIGH_COMPRESSION(1) {
|
||||
|
||||
@Override
|
||||
Compressor newCompressor() {
|
||||
return new DeflateCompressor(Deflater.BEST_COMPRESSION);
|
||||
}
|
||||
|
||||
@Override
|
||||
Uncompressor newUncompressor() {
|
||||
return new DeflateUncompressor();
|
||||
}
|
||||
|
||||
},
|
||||
|
||||
/**
|
||||
* This compression mode is similar to {@link #FAST} but it spends more time
|
||||
* compressing in order to improve the compression ratio. This compression
|
||||
* mode is best used with indices that have a low update rate but should be
|
||||
* able to load documents from disk quickly.
|
||||
*/
|
||||
FAST_UNCOMPRESSION(2) {
|
||||
|
||||
@Override
|
||||
Compressor newCompressor() {
|
||||
return LZ4_HIGH_COMPRESSOR;
|
||||
}
|
||||
|
||||
@Override
|
||||
Uncompressor newUncompressor() {
|
||||
return LZ4_UNCOMPRESSOR;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
public static CompressionMode byId(int id) {
|
||||
for (CompressionMode mode : CompressionMode.values()) {
|
||||
if (mode.getId() == id) {
|
||||
return mode;
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("Unknown id: " + id);
|
||||
}
|
||||
|
||||
private final int id;
|
||||
|
||||
private CompressionMode(int id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an ID for this compression mode. Should be unique across
|
||||
* {@link CompressionMode}s as it is used for serialization and
|
||||
* unserialization.
|
||||
*/
|
||||
public final int getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new {@link Compressor} instance.
|
||||
*/
|
||||
abstract Compressor newCompressor();
|
||||
|
||||
/**
|
||||
* Create a new {@link Uncompressor} instance.
|
||||
*/
|
||||
abstract Uncompressor newUncompressor();
|
||||
|
||||
|
||||
private static final Uncompressor LZ4_UNCOMPRESSOR = new Uncompressor() {
|
||||
|
||||
@Override
|
||||
public void uncompress(DataInput in, BytesRef bytes) throws IOException {
|
||||
final int uncompressedLen = in.readVInt();
|
||||
if (bytes.bytes.length < uncompressedLen + 8) {
|
||||
bytes.bytes = ArrayUtil.grow(bytes.bytes, uncompressedLen + 8);
|
||||
}
|
||||
LZ4.uncompress(in, uncompressedLen, bytes);
|
||||
if (bytes.length != uncompressedLen) {
|
||||
throw new IOException("Corrupted");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void uncompress(DataInput in, int offset, int length, BytesRef bytes) throws IOException {
|
||||
final int uncompressedLen = in.readVInt();
|
||||
if (offset > uncompressedLen) {
|
||||
bytes.length = 0;
|
||||
return;
|
||||
}
|
||||
if (bytes.bytes.length < uncompressedLen) {
|
||||
bytes.bytes = ArrayUtil.grow(bytes.bytes, uncompressedLen);
|
||||
}
|
||||
LZ4.uncompress(in, offset + length, bytes);
|
||||
bytes.offset = offset;
|
||||
if (offset + length >= uncompressedLen) {
|
||||
if (bytes.length != uncompressedLen) {
|
||||
throw new IOException("Corrupted");
|
||||
}
|
||||
bytes.length = uncompressedLen - offset;
|
||||
} else {
|
||||
bytes.length = length;
|
||||
}
|
||||
}
|
||||
|
||||
public void copyCompressedData(DataInput in, DataOutput out) throws IOException {
|
||||
final int uncompressedLen = in.readVInt();
|
||||
out.writeVInt(uncompressedLen);
|
||||
if (uncompressedLen == 0) {
|
||||
out.writeByte((byte) 0); // the token
|
||||
return;
|
||||
}
|
||||
int n = 0;
|
||||
while (n < uncompressedLen) {
|
||||
// literals
|
||||
final byte token = in.readByte();
|
||||
out.writeByte(token);
|
||||
int literalLen = (token & 0xFF) >>> 4;
|
||||
if (literalLen == 0x0F) {
|
||||
byte len;
|
||||
while ((len = in.readByte()) == (byte) 0xFF) {
|
||||
literalLen += 0xFF;
|
||||
out.writeByte(len);
|
||||
}
|
||||
literalLen += len & 0xFF;
|
||||
out.writeByte(len);
|
||||
}
|
||||
out.copyBytes(in, literalLen);
|
||||
n += literalLen;
|
||||
if (n >= uncompressedLen) {
|
||||
break;
|
||||
}
|
||||
|
||||
// matches
|
||||
out.copyBytes(in, 2); // match dec
|
||||
int matchLen = token & 0x0F;
|
||||
if (matchLen == 0x0F) {
|
||||
byte len;
|
||||
while ((len = in.readByte()) == (byte) 0xFF) {
|
||||
matchLen += 0xFF;
|
||||
out.writeByte(len);
|
||||
}
|
||||
matchLen += len & 0xFF;
|
||||
out.writeByte(len);
|
||||
}
|
||||
matchLen += LZ4.MIN_MATCH;
|
||||
n += matchLen;
|
||||
}
|
||||
|
||||
if (n != uncompressedLen) {
|
||||
throw new IOException("Currupted compressed stream: expected " + uncompressedLen + " bytes, but got at least" + n);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Uncompressor clone() {
|
||||
return this;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
private static final Compressor LZ4_FAST_COMPRESSOR = new Compressor() {
|
||||
|
||||
@Override
|
||||
public void compress(byte[] bytes, int off, int len, DataOutput out)
|
||||
throws IOException {
|
||||
out.writeVInt(len);
|
||||
LZ4.compress(bytes, off, len, out);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
private static final Compressor LZ4_HIGH_COMPRESSOR = new Compressor() {
|
||||
|
||||
@Override
|
||||
public void compress(byte[] bytes, int off, int len, DataOutput out)
|
||||
throws IOException {
|
||||
out.writeVInt(len);
|
||||
LZ4.compressHC(bytes, off, len, out);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
private static final class DeflateUncompressor extends Uncompressor {
|
||||
|
||||
final Inflater uncompressor;
|
||||
byte[] compressed;
|
||||
|
||||
DeflateUncompressor() {
|
||||
uncompressor = new Inflater();
|
||||
compressed = new byte[0];
|
||||
}
|
||||
|
||||
@Override
|
||||
public void uncompress(DataInput in, BytesRef bytes) throws IOException {
|
||||
bytes.offset = bytes.length = 0;
|
||||
|
||||
final int compressedLength = in.readVInt();
|
||||
if (compressedLength > compressed.length) {
|
||||
compressed = ArrayUtil.grow(compressed, compressedLength);
|
||||
}
|
||||
in.readBytes(compressed, 0, compressedLength);
|
||||
|
||||
uncompressor.reset();
|
||||
uncompressor.setInput(compressed, 0, compressedLength);
|
||||
if (uncompressor.needsInput()) {
|
||||
return;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
final int count;
|
||||
try {
|
||||
final int remaining = bytes.bytes.length - bytes.length;
|
||||
count = uncompressor.inflate(bytes.bytes, bytes.length, remaining);
|
||||
} catch (DataFormatException e) {
|
||||
throw new IOException(e);
|
||||
}
|
||||
bytes.length += count;
|
||||
if (uncompressor.finished()) {
|
||||
break;
|
||||
} else {
|
||||
bytes.bytes = ArrayUtil.grow(bytes.bytes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copyCompressedData(DataInput in, DataOutput out) throws IOException {
|
||||
final int compressedLength = in.readVInt();
|
||||
out.writeVInt(compressedLength);
|
||||
out.copyBytes(in, compressedLength);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Uncompressor clone() {
|
||||
return new DeflateUncompressor();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static class DeflateCompressor extends Compressor {
|
||||
|
||||
final Deflater compressor;
|
||||
byte[] compressed;
|
||||
|
||||
DeflateCompressor(int level) {
|
||||
compressor = new Deflater(level);
|
||||
compressed = new byte[64];
|
||||
}
|
||||
|
||||
@Override
|
||||
public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
|
||||
compressor.reset();
|
||||
compressor.setInput(bytes, off, len);
|
||||
compressor.finish();
|
||||
|
||||
if (compressor.needsInput()) {
|
||||
// no output
|
||||
out.writeVInt(0);
|
||||
return;
|
||||
}
|
||||
|
||||
int totalCount = 0;
|
||||
for (;;) {
|
||||
final int count = compressor.deflate(compressed, totalCount, compressed.length - totalCount);
|
||||
totalCount += count;
|
||||
assert totalCount <= compressed.length;
|
||||
if (compressor.finished()) {
|
||||
break;
|
||||
} else {
|
||||
compressed = ArrayUtil.grow(compressed);
|
||||
}
|
||||
}
|
||||
|
||||
out.writeVInt(totalCount);
|
||||
out.writeBytes(compressed, totalCount);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
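A minimal round-trip sketch of how these pieces fit together (not part of the patch). It assumes it lives in the org.apache.lucene.codecs.compressing package, since Compressor and Uncompressor are package-private; the scratch buffer size is illustrative only, and the mode id written at index time is what byId() recovers at read time.

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;

class CompressionModeRoundTrip {

  static byte[] roundTrip(CompressionMode mode, byte[] data) throws IOException {
    // compress: the compressor writes everything the uncompressor will need
    final byte[] buf = new byte[data.length * 2 + 16]; // generous scratch buffer
    final ByteArrayDataOutput out = new ByteArrayDataOutput(buf);
    mode.newCompressor().compress(data, 0, data.length, out);

    // the mode itself would be persisted as its id and recovered with byId
    final CompressionMode sameMode = CompressionMode.byId(mode.getId());

    // uncompress back into a BytesRef
    final BytesRef bytes = new BytesRef();
    sameMode.newUncompressor().uncompress(new ByteArrayDataInput(buf, 0, out.getPosition()), bytes);
    return Arrays.copyOfRange(bytes.bytes, bytes.offset, bytes.offset + bytes.length);
  }

}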
|
|
@ -0,0 +1,36 @@
|
|||
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
|
||||
/**
|
||||
* A data compressor.
|
||||
*/
|
||||
abstract class Compressor {
|
||||
|
||||
/**
|
||||
* Compress bytes into <code>out</code>. It is the responsibility of the
* compressor to add all necessary information so that an {@link Uncompressor}
|
||||
* will know when to stop uncompressing bytes from the stream.
|
||||
*/
|
||||
public abstract void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException;
|
||||
|
||||
}
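To illustrate the contract above (the compressed stream must be self-delimiting so that the matching Uncompressor knows when to stop), here is a hypothetical, do-nothing compressor that stores bytes verbatim; it is a sketch only, not part of the patch, and assumes the same package since Compressor is package-private.

import java.io.IOException;

import org.apache.lucene.store.DataOutput;

// hypothetical example: stores bytes verbatim but still honours the contract by
// writing the length first, so an Uncompressor knows exactly how much to read back
class RawCompressor extends Compressor {

  @Override
  public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
    out.writeVInt(len);             // self-delimiting: the length goes first
    out.writeBytes(bytes, off, len);
  }

}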
|
|
@ -0,0 +1,56 @@
|
|||
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
||||
/**
|
||||
* A {@link DataOutput} that can be used to build a byte[].
|
||||
*/
|
||||
final class GrowableByteArrayDataOutput extends DataOutput {
|
||||
|
||||
byte[] bytes;
|
||||
int length;
|
||||
|
||||
GrowableByteArrayDataOutput(int cp) {
|
||||
this.bytes = new byte[ArrayUtil.oversize(cp, 1)];
|
||||
this.length = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeByte(byte b) throws IOException {
|
||||
if (length >= bytes.length) {
|
||||
bytes = ArrayUtil.grow(bytes);
|
||||
}
|
||||
bytes[length++] = b;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeBytes(byte[] b, int off, int len) throws IOException {
|
||||
final int newLength = length + len;
|
||||
if (newLength > bytes.length) {
|
||||
bytes = ArrayUtil.grow(bytes, newLength);
|
||||
}
|
||||
System.arraycopy(b, off, bytes, length, len);
|
||||
length = newLength;
|
||||
}
|
||||
|
||||
}
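A small usage sketch, assuming the same package since the class is package-private: anything written through the DataOutput API ends up in bytes[0..length).

import java.io.IOException;
import java.util.Arrays;

class GrowableOutputExample {

  static byte[] build() throws IOException {
    GrowableByteArrayDataOutput out = new GrowableByteArrayDataOutput(16);
    out.writeVInt(1234);                        // inherited DataOutput helpers work as well
    out.writeBytes(new byte[] {1, 2, 3}, 0, 3); // grows the backing array on demand
    return Arrays.copyOf(out.bytes, out.length);
  }

}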
|
|
@ -0,0 +1,506 @@
|
|||
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* LZ4 compression and uncompression routines.
|
||||
*
|
||||
* http://code.google.com/p/lz4/
|
||||
* http://fastcompression.blogspot.fr/p/lz4.html
|
||||
*/
|
||||
class LZ4 {
|
||||
|
||||
private LZ4() {}
|
||||
|
||||
static final int MEMORY_USAGE = 14;
|
||||
static final int MIN_MATCH = 4; // minimum length of a match
|
||||
static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference
|
||||
static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals
|
||||
static final int HASH_LOG_HC = 15; // log size of the dictionary for compressHC
|
||||
static final int HASH_TABLE_SIZE_HC = 1 << HASH_LOG_HC;
|
||||
static final int OPTIMAL_ML = 0x0F + 4 - 1; // match length that doesn't require an additional byte
|
||||
|
||||
|
||||
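// multiplicative hashing: -1640531535 is 2654435761 (a prime close to 2^32 divided
// by the golden ratio) reinterpreted as a signed int, the same multiplier used by
// the reference LZ4 implementation; the shift keeps the top hashBits bits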
private static int hash(int i, int hashBits) {
|
||||
return (i * -1640531535) >>> (32 - hashBits);
|
||||
}
|
||||
|
||||
private static int hashHC(int i) {
|
||||
return hash(i, HASH_LOG_HC);
|
||||
}
|
||||
|
||||
private static int readInt(byte[] buf, int i) {
|
||||
return ((buf[i] & 0xFF) << 24) | ((buf[i+1] & 0xFF) << 16) | ((buf[i+2] & 0xFF) << 8) | (buf[i+3] & 0xFF);
|
||||
}
|
||||
|
||||
private static boolean readIntEquals(byte[] buf, int i, int j) {
|
||||
return readInt(buf, i) == readInt(buf, j);
|
||||
}
|
||||
|
||||
private static int commonBytes(byte[] b, int o1, int o2, int limit) {
|
||||
assert o1 < o2;
|
||||
int count = 0;
|
||||
while (o2 < limit && b[o1++] == b[o2++]) {
|
||||
++count;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
private static int commonBytesBackward(byte[] b, int o1, int o2, int l1, int l2) {
|
||||
int count = 0;
|
||||
while (o1 > l1 && o2 > l2 && b[--o1] == b[--o2]) {
|
||||
++count;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Uncompress at least <code>uncompressedLen</code> bytes into <code>destBytes</code>.
|
||||
* Please note that <code>destBytes</code> must be large enough to be able to hold
|
||||
* <b>all</b> uncompressed data plus 8 bytes (meaning that you need to know the total
|
||||
* uncompressed length).
|
||||
*/
|
||||
public static void uncompress(DataInput compressed, int uncompressedLen, BytesRef destBytes) throws IOException {
|
||||
final byte[] dest = destBytes.bytes;
|
||||
final int destEnd = dest.length;
|
||||
int dOff = 0;
|
||||
|
||||
while (dOff < uncompressedLen) {
|
||||
// literals
|
||||
final int token = compressed.readByte() & 0xFF;
|
||||
int literalLen = token >>> 4;
|
||||
|
||||
if (literalLen != 0) {
|
||||
if (literalLen == 0x0F) {
|
||||
byte len;
|
||||
while ((len = compressed.readByte()) == (byte) 0xFF) {
|
||||
literalLen += 0xFF;
|
||||
}
|
||||
literalLen += len & 0xFF;
|
||||
}
|
||||
compressed.readBytes(dest, dOff, literalLen);
|
||||
dOff += literalLen;
|
||||
}
|
||||
|
||||
if (dOff >= uncompressedLen) {
|
||||
break;
|
||||
}
|
||||
|
||||
// matches
|
||||
final int matchDec = (compressed.readByte() & 0xFF) | ((compressed.readByte() & 0xFF) << 8);
|
||||
assert matchDec > 0;
|
||||
|
||||
int matchLen = token & 0x0F;
|
||||
if (matchLen == 0x0F) {
|
||||
int len;
|
||||
while ((len = compressed.readByte()) == (byte) 0xFF) {
|
||||
matchLen += 0xFF;
|
||||
}
|
||||
matchLen += len & 0xFF;
|
||||
}
|
||||
matchLen += MIN_MATCH;
|
||||
|
||||
// copying a multiple of 8 bytes can make uncompression from 5% to 10% faster
|
||||
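// fastLen rounds matchLen up to the next multiple of 8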
final int fastLen = ((matchLen - 1) & 0xFFFFFFF8) + 8;
|
||||
if (matchDec < matchLen || dOff + fastLen > destEnd) {
|
||||
// overlap -> naive incremental copy
|
||||
for (int ref = dOff - matchDec, end = dOff + matchLen; dOff < end; ++ref, ++dOff) {
|
||||
dest[dOff] = dest[ref];
|
||||
}
|
||||
} else {
|
||||
// no overlap -> arraycopy
|
||||
System.arraycopy(dest, dOff - matchDec, dest, dOff, fastLen);
|
||||
dOff += matchLen;
|
||||
}
|
||||
}
|
||||
destBytes.offset = 0;
|
||||
destBytes.length = dOff;
|
||||
}
|
||||
|
||||
private static void encodeLen(int l, DataOutput out) throws IOException {
|
||||
while (l >= 0xFF) {
|
||||
out.writeByte((byte) 0xFF);
|
||||
l -= 0xFF;
|
||||
}
|
||||
out.writeByte((byte) l);
|
||||
}
|
||||
|
||||
private static void encodeLiterals(byte[] bytes, int token, int anchor, int literalLen, DataOutput out) throws IOException {
|
||||
out.writeByte((byte) token);
|
||||
|
||||
// encode literal length
|
||||
if (literalLen >= 0x0F) {
|
||||
encodeLen(literalLen - 0x0F, out);
|
||||
}
|
||||
|
||||
// encode literals
|
||||
out.writeBytes(bytes, anchor, literalLen);
|
||||
}
|
||||
|
||||
private static void encodeLastLiterals(byte[] bytes, int anchor, int literalLen, DataOutput out) throws IOException {
|
||||
final int token = Math.min(literalLen, 0x0F) << 4;
|
||||
encodeLiterals(bytes, token, anchor, literalLen, out);
|
||||
}
|
||||
|
||||
private static void encodeSequence(byte[] bytes, int anchor, int matchRef, int matchOff, int matchLen, DataOutput out) throws IOException {
|
||||
final int literalLen = matchOff - anchor;
|
||||
assert matchLen >= 4;
|
||||
// encode token
|
||||
final int token = (Math.min(literalLen, 0x0F) << 4) | Math.min(matchLen - 4, 0x0F);
|
||||
encodeLiterals(bytes, token, anchor, literalLen, out);
|
||||
|
||||
// encode match dec
|
||||
final int matchDec = matchOff - matchRef;
|
||||
assert matchDec > 0 && matchDec < 1 << 16;
|
||||
out.writeByte((byte) matchDec);
|
||||
out.writeByte((byte) (matchDec >>> 8));
|
||||
|
||||
// encode match len
|
||||
if (matchLen >= MIN_MATCH + 0x0F) {
|
||||
encodeLen(matchLen - 0x0F - MIN_MATCH, out);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compress <code>bytes[off:off+len]</code> into <code>out</code> using
* at most 16KB of memory.
|
||||
*/
|
||||
public static void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
|
||||
|
||||
final int base = off;
|
||||
final int end = off + len;
|
||||
|
||||
int anchor = off++;
|
||||
|
||||
if (len > LAST_LITERALS + MIN_MATCH) {
|
||||
|
||||
final int limit = end - LAST_LITERALS;
|
||||
final int matchLimit = limit - MIN_MATCH;
|
||||
|
||||
final int bitsPerOffset = PackedInts.bitsRequired(len - LAST_LITERALS);
|
||||
final int bitsPerOffsetLog = 32 - Integer.numberOfLeadingZeros(bitsPerOffset - 1);
|
||||
final int hashLog = MEMORY_USAGE + 3 - bitsPerOffsetLog;
|
||||
final PackedInts.Mutable hashTable = PackedInts.getMutable(1 << hashLog, bitsPerOffset, PackedInts.DEFAULT);
|
||||
|
||||
main:
|
||||
while (off < limit) {
|
||||
// find a match
|
||||
int ref;
|
||||
while (true) {
|
||||
if (off >= matchLimit) {
|
||||
break main;
|
||||
}
|
||||
final int v = readInt(bytes, off);
|
||||
final int h = hash(v, hashLog);
|
||||
ref = base + (int) hashTable.get(h);
|
||||
assert PackedInts.bitsRequired(off - base) <= hashTable.getBitsPerValue();
|
||||
hashTable.set(h, off - base);
|
||||
if (off - ref < MAX_DISTANCE && readInt(bytes, ref) == v) {
|
||||
break;
|
||||
}
|
||||
++off;
|
||||
}
|
||||
|
||||
// compute match length
|
||||
final int matchLen = MIN_MATCH + commonBytes(bytes, ref + 4, off + 4, limit);
|
||||
|
||||
encodeSequence(bytes, anchor, ref, off, matchLen, out);
|
||||
off += matchLen;
|
||||
anchor = off;
|
||||
}
|
||||
}
|
||||
|
||||
// last literals
|
||||
final int literalLen = end - anchor;
|
||||
assert literalLen >= LAST_LITERALS || literalLen == len;
|
||||
encodeLastLiterals(bytes, anchor, end - anchor, out);
|
||||
}
|
||||
|
||||
private static class Match {
|
||||
int start, ref, len;
|
||||
|
||||
void fix(int correction) {
|
||||
start += correction;
|
||||
ref += correction;
|
||||
len -= correction;
|
||||
}
|
||||
|
||||
int end() {
|
||||
return start + len;
|
||||
}
|
||||
}
|
||||
|
||||
private static void copyTo(Match m1, Match m2) {
|
||||
m2.len = m1.len;
|
||||
m2.start = m1.start;
|
||||
m2.ref = m1.ref;
|
||||
}
|
||||
|
||||
private static class HashTable {
|
||||
static final int MAX_ATTEMPTS = 256;
|
||||
static final int MASK = MAX_DISTANCE - 1;
|
||||
int nextToUpdate;
|
||||
private final int base;
|
||||
private final int[] hashTable;
|
||||
private final short[] chainTable;
|
||||
|
||||
HashTable(int base) {
|
||||
this.base = base;
|
||||
nextToUpdate = base;
|
||||
hashTable = new int[HASH_TABLE_SIZE_HC];
|
||||
Arrays.fill(hashTable, -1);
|
||||
chainTable = new short[MAX_DISTANCE];
|
||||
}
|
||||
|
||||
private int hashPointer(byte[] bytes, int off) {
|
||||
final int v = readInt(bytes, off);
|
||||
final int h = hashHC(v);
|
||||
return base + hashTable[h];
|
||||
}
|
||||
|
||||
private int next(int off) {
|
||||
return base + off - (chainTable[off & MASK] & 0xFFFF);
|
||||
}
|
||||
|
||||
private void addHash(byte[] bytes, int off) {
|
||||
final int v = readInt(bytes, off);
|
||||
final int h = hashHC(v);
|
||||
int delta = off - hashTable[h];
|
||||
if (delta >= MAX_DISTANCE) {
|
||||
delta = MAX_DISTANCE - 1;
|
||||
}
|
||||
chainTable[off & MASK] = (short) delta;
|
||||
hashTable[h] = off - base;
|
||||
}
|
||||
|
||||
void insert(int off, byte[] bytes) {
|
||||
for (; nextToUpdate < off; ++nextToUpdate) {
|
||||
addHash(bytes, nextToUpdate);
|
||||
}
|
||||
}
|
||||
|
||||
boolean insertAndFindBestMatch(byte[] buf, int off, int matchLimit, Match match) {
|
||||
match.start = off;
|
||||
match.len = 0;
|
||||
|
||||
insert(off, buf);
|
||||
|
||||
int ref = hashPointer(buf, off);
|
||||
for (int i = 0; i < MAX_ATTEMPTS; ++i) {
|
||||
if (ref < Math.max(base, off - MAX_DISTANCE + 1)) {
|
||||
break;
|
||||
}
|
||||
if (buf[ref + match.len] == buf[off + match.len] && readIntEquals(buf, ref, off)) {
|
||||
final int matchLen = MIN_MATCH + commonBytes(buf, ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
|
||||
if (matchLen > match.len) {
|
||||
match.ref = ref;
|
||||
match.len = matchLen;
|
||||
}
|
||||
}
|
||||
ref = next(ref);
|
||||
}
|
||||
|
||||
return match.len != 0;
|
||||
}
|
||||
|
||||
boolean insertAndFindWiderMatch(byte[] buf, int off, int startLimit, int matchLimit, int minLen, Match match) {
|
||||
match.len = minLen;
|
||||
|
||||
insert(off, buf);
|
||||
|
||||
final int delta = off - startLimit;
|
||||
int ref = hashPointer(buf, off);
|
||||
for (int i = 0; i < MAX_ATTEMPTS; ++i) {
|
||||
if (ref < Math.max(base, off - MAX_DISTANCE + 1)) {
|
||||
break;
|
||||
}
|
||||
if (buf[ref - delta + match.len] == buf[startLimit + match.len]
|
||||
&& readIntEquals(buf, ref, off)) {
|
||||
final int matchLenForward = MIN_MATCH + commonBytes(buf, ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
|
||||
final int matchLenBackward = commonBytesBackward(buf, ref, off, base, startLimit);
|
||||
final int matchLen = matchLenBackward + matchLenForward;
|
||||
if (matchLen > match.len) {
|
||||
match.len = matchLen;
|
||||
match.ref = ref - matchLenBackward;
|
||||
match.start = off - matchLenBackward;
|
||||
}
|
||||
}
|
||||
ref = next(ref);
|
||||
}
|
||||
|
||||
return match.len > minLen;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static void compressHC(byte[] src, int srcOff, int srcLen, DataOutput out) throws IOException {
|
||||
|
||||
final int srcEnd = srcOff + srcLen;
|
||||
final int matchLimit = srcEnd - LAST_LITERALS;
|
||||
|
||||
int sOff = srcOff;
|
||||
int anchor = sOff++;
|
||||
|
||||
final HashTable ht = new HashTable(srcOff);
|
||||
final Match match0 = new Match();
|
||||
final Match match1 = new Match();
|
||||
final Match match2 = new Match();
|
||||
final Match match3 = new Match();
|
||||
|
||||
main:
|
||||
while (sOff < matchLimit) {
|
||||
if (!ht.insertAndFindBestMatch(src, sOff, matchLimit, match1)) {
|
||||
++sOff;
|
||||
continue;
|
||||
}
|
||||
|
||||
// saved, in case we would skip too much
|
||||
copyTo(match1, match0);
|
||||
|
||||
search2:
|
||||
while (true) {
|
||||
assert match1.start >= anchor;
|
||||
if (match1.end() >= matchLimit
|
||||
|| !ht.insertAndFindWiderMatch(src, match1.end() - 2, match1.start + 1, matchLimit, match1.len, match2)) {
|
||||
// no better match
|
||||
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
|
||||
anchor = sOff = match1.end();
|
||||
continue main;
|
||||
}
|
||||
|
||||
if (match0.start < match1.start) {
|
||||
if (match2.start < match1.start + match0.len) { // empirical
|
||||
copyTo(match0, match1);
|
||||
}
|
||||
}
|
||||
assert match2.start > match1.start;
|
||||
|
||||
if (match2.start - match1.start < 3) { // First Match too small : removed
|
||||
copyTo(match2, match1);
|
||||
continue search2;
|
||||
}
|
||||
|
||||
search3:
|
||||
while (true) {
|
||||
if (match2.start - match1.start < OPTIMAL_ML) {
|
||||
int newMatchLen = match1.len;
|
||||
if (newMatchLen > OPTIMAL_ML) {
|
||||
newMatchLen = OPTIMAL_ML;
|
||||
}
|
||||
if (match1.start + newMatchLen > match2.end() - MIN_MATCH) {
|
||||
newMatchLen = match2.start - match1.start + match2.len - MIN_MATCH;
|
||||
}
|
||||
final int correction = newMatchLen - (match2.start - match1.start);
|
||||
if (correction > 0) {
|
||||
match2.fix(correction);
|
||||
}
|
||||
}
|
||||
|
||||
if (match2.start + match2.len >= matchLimit
|
||||
|| !ht.insertAndFindWiderMatch(src, match2.end() - 3, match2.start, matchLimit, match2.len, match3)) {
|
||||
// no better match -> 2 sequences to encode
|
||||
if (match2.start < match1.end()) {
|
||||
if (match2.start - match1.start < OPTIMAL_ML) {
|
||||
if (match1.len > OPTIMAL_ML) {
|
||||
match1.len = OPTIMAL_ML;
|
||||
}
|
||||
if (match1.end() > match2.end() - MIN_MATCH) {
|
||||
match1.len = match2.end() - match1.start - MIN_MATCH;
|
||||
}
|
||||
final int correction = match1.len - (match2.start - match1.start);
|
||||
if (correction > 0) {
|
||||
match2.fix(correction);
|
||||
}
|
||||
} else {
|
||||
match1.len = match2.start - match1.start;
|
||||
}
|
||||
}
|
||||
// encode seq 1
|
||||
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
|
||||
anchor = sOff = match1.end();
|
||||
// encode seq 2
|
||||
encodeSequence(src, anchor, match2.ref, match2.start, match2.len, out);
|
||||
anchor = sOff = match2.end();
|
||||
continue main;
|
||||
}
|
||||
|
||||
if (match3.start < match1.end() + 3) { // Not enough space for match 2 : remove it
|
||||
if (match3.start >= match1.end()) { // can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1
|
||||
if (match2.start < match1.end()) {
|
||||
final int correction = match1.end() - match2.start;
|
||||
match2.fix(correction);
|
||||
if (match2.len < MIN_MATCH) {
|
||||
copyTo(match3, match2);
|
||||
}
|
||||
}
|
||||
|
||||
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
|
||||
anchor = sOff = match1.end();
|
||||
|
||||
copyTo(match3, match1);
|
||||
copyTo(match2, match0);
|
||||
|
||||
continue search2;
|
||||
}
|
||||
|
||||
copyTo(match3, match2);
|
||||
continue search3;
|
||||
}
|
||||
|
||||
// OK, now we have 3 ascending matches; let's write at least the first one
|
||||
if (match2.start < match1.end()) {
|
||||
if (match2.start - match1.start < 0x0F) {
|
||||
if (match1.len > OPTIMAL_ML) {
|
||||
match1.len = OPTIMAL_ML;
|
||||
}
|
||||
if (match1.end() > match2.end() - MIN_MATCH) {
|
||||
match1.len = match2.end() - match1.start - MIN_MATCH;
|
||||
}
|
||||
final int correction = match1.end() - match2.start;
|
||||
match2.fix(correction);
|
||||
} else {
|
||||
match1.len = match2.start - match1.start;
|
||||
}
|
||||
}
|
||||
|
||||
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
|
||||
anchor = sOff = match1.end();
|
||||
|
||||
copyTo(match2, match1);
|
||||
copyTo(match3, match2);
|
||||
|
||||
continue search3;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
encodeLastLiterals(src, anchor, srcEnd - anchor, out);
|
||||
}
|
||||
|
||||
}
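A round-trip sketch of the two entry points above (illustration only; it assumes the same package since LZ4 is package-private, and the compressed buffer size is a rough upper bound). Note that uncompress() requires the caller to know the uncompressed length up front and to leave 8 bytes of slack in the destination, as its javadoc states.

import java.io.IOException;

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;

class LZ4RoundTrip {

  static void roundTrip(byte[] data) throws IOException {
    final byte[] compressed = new byte[data.length * 2 + 16]; // rough worst-case bound
    final ByteArrayDataOutput out = new ByteArrayDataOutput(compressed);
    LZ4.compress(data, 0, data.length, out); // or LZ4.compressHC for a better ratio

    final BytesRef restored = new BytesRef(new byte[data.length + 8]); // +8 slack for the fast copy
    LZ4.uncompress(new ByteArrayDataInput(compressed, 0, out.getPosition()), data.length, restored);
    assert restored.length == data.length;
  }

}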
|
|
@ -0,0 +1,56 @@
|
|||
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* An uncompressor.
|
||||
*/
|
||||
abstract class Uncompressor implements Cloneable {
|
||||
|
||||
/**
|
||||
* Uncompress bytes. This method is free to resize <code>bytes</code> in case
|
||||
* it is too small to hold all the uncompressed data.
|
||||
*/
|
||||
public abstract void uncompress(DataInput in, BytesRef bytes) throws IOException;
|
||||
|
||||
/**
|
||||
* Method to use if you are only interested in <code>length</code>
|
||||
* uncompressed bytes starting at offset <code>offset</code>. Some compression
|
||||
* codecs might have optimizations for this special case.
|
||||
*/
|
||||
public void uncompress(DataInput in, int offset, int length, BytesRef bytes) throws IOException {
|
||||
uncompress(in, bytes);
|
||||
if (bytes.length < offset + length) {
|
||||
throw new IndexOutOfBoundsException((offset + length) + " > " + bytes.length);
|
||||
}
|
||||
bytes.offset += offset;
|
||||
bytes.length = length;
|
||||
}
|
||||
|
||||
public abstract void copyCompressedData(DataInput in, DataOutput out) throws IOException;
|
||||
|
||||
@Override
|
||||
public abstract Uncompressor clone();
|
||||
|
||||
}
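A hypothetical reader-side use of the offset/length variant described above: pull a single document's bytes out of a compressed chunk without copying the whole chunk around. The method and parameter names below are illustrative only, not APIs from this patch, and the same-package assumption applies since Uncompressor is package-private.

import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.BytesRef;

class SingleDocFetch {

  static BytesRef readDoc(Uncompressor uncompressor, DataInput chunk,
      int docStartOffset, int docLength) throws IOException {
    final BytesRef bytes = new BytesRef();
    // afterwards, bytes.offset and bytes.length delimit exactly the requested slice
    uncompressor.uncompress(chunk, docStartOffset, docLength, bytes);
    return bytes;
  }

}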
|
|
@ -0,0 +1,25 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
StoredFieldsFormat that allows cross-document and cross-field compression of stored fields.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,111 @@
|
|||
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomInts;
|
||||
|
||||
public abstract class AbstractTestCompressionMode extends LuceneTestCase {
|
||||
|
||||
CompressionMode mode;
|
||||
|
||||
static byte[] randomArray() {
|
||||
final int max = random().nextBoolean()
|
||||
? random().nextInt(4)
|
||||
: random().nextInt(256);
|
||||
final int length = random().nextBoolean()
|
||||
? random().nextInt(20)
|
||||
: random().nextInt(192 * 1024);
|
||||
final byte[] arr = new byte[length];
|
||||
for (int i = 0; i < arr.length; ++i) {
|
||||
arr[i] = (byte) RandomInts.randomIntBetween(random(), 0, max);
|
||||
}
|
||||
return arr;
|
||||
}
|
||||
|
||||
byte[] compress(byte[] uncompressed) throws IOException {
|
||||
Compressor compressor = mode.newCompressor();
|
||||
return compress(compressor, uncompressed);
|
||||
}
|
||||
|
||||
static byte[] compress(Compressor compressor, byte[] uncompressed) throws IOException {
|
||||
byte[] compressed = new byte[uncompressed.length * 2 + 16]; // should be enough
|
||||
ByteArrayDataOutput out = new ByteArrayDataOutput(compressed);
|
||||
compressor.compress(uncompressed, 0, uncompressed.length, out);
|
||||
final int compressedLen = out.getPosition();
|
||||
return Arrays.copyOf(compressed, compressedLen);
|
||||
}
|
||||
|
||||
byte[] uncompress(byte[] compressed) throws IOException {
|
||||
Uncompressor uncompressor = mode.newUncompressor();
|
||||
return uncompress(uncompressor, compressed);
|
||||
}
|
||||
|
||||
static byte[] uncompress(Uncompressor uncompressor, byte[] compressed) throws IOException {
|
||||
final BytesRef bytes = new BytesRef();
|
||||
uncompressor.uncompress(new ByteArrayDataInput(compressed), bytes);
|
||||
return Arrays.copyOfRange(bytes.bytes, bytes.offset, bytes.offset + bytes.length);
|
||||
}
|
||||
|
||||
byte[] uncompress(byte[] compressed, int offset, int length) throws IOException {
|
||||
Uncompressor uncompressor = mode.newUncompressor();
|
||||
final BytesRef bytes = new BytesRef();
|
||||
uncompressor.uncompress(new ByteArrayDataInput(compressed), offset, length, bytes);
|
||||
return Arrays.copyOfRange(bytes.bytes, bytes.offset, bytes.offset + bytes.length);
|
||||
}
|
||||
|
||||
public void testUncompress() throws IOException {
|
||||
final byte[] uncompressed = randomArray();
|
||||
final byte[] compressed = compress(uncompressed);
|
||||
final byte[] restored = uncompress(compressed);
|
||||
assertArrayEquals(uncompressed, restored);
|
||||
}
|
||||
|
||||
public void testPartialUncompress() throws IOException {
|
||||
final int iterations = atLeast(10);
|
||||
for (int i = 0; i < iterations; ++i) {
|
||||
final byte[] uncompressed = randomArray();
|
||||
final byte[] compressed = compress(uncompressed);
|
||||
final int offset, length;
|
||||
if (uncompressed.length == 0) {
|
||||
offset = length = 0;
|
||||
} else {
|
||||
offset = random().nextInt(uncompressed.length);
|
||||
length = random().nextInt(uncompressed.length - offset);
|
||||
}
|
||||
final byte[] restored = uncompress(compressed, offset, length);
|
||||
assertArrayEquals(Arrays.copyOfRange(uncompressed, offset, offset + length), restored);
|
||||
}
|
||||
}
|
||||
|
||||
public void testCopyCompressedData() throws IOException {
|
||||
final byte[] uncompressed = randomArray();
|
||||
final byte[] compressed = compress(uncompressed);
|
||||
GrowableByteArrayDataOutput out = new GrowableByteArrayDataOutput(uncompressed.length);
|
||||
mode.newUncompressor().copyCompressedData(new ByteArrayDataInput(compressed), out);
|
||||
assertArrayEquals(compressed, Arrays.copyOf(out.bytes, out.length));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,193 @@
|
|||
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.DoubleField;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.FloatField;
|
||||
import org.apache.lucene.document.IntField;
|
||||
import org.apache.lucene.document.LongField;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.StorableField;
|
||||
import org.apache.lucene.index.StoredDocument;
|
||||
import org.apache.lucene.search.NumericRangeQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomInts;
|
||||
|
||||
public class TestCompressingStoredFieldsFormat extends LuceneTestCase {
|
||||
|
||||
public void testWriteReadMerge() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwConf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
|
||||
iwConf.setMaxBufferedDocs(RandomInts.randomIntBetween(random(), 2, 30));
|
||||
iwConf.setCodec(CompressingCodec.randomInstance(random()));
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConf);
|
||||
final int docCount = atLeast(200);
|
||||
final byte[][][] data = new byte [docCount][][];
|
||||
for (int i = 0; i < docCount; ++i) {
|
||||
final int fieldCount = rarely()
|
||||
? RandomInts.randomIntBetween(random(), 1, 500)
|
||||
: RandomInts.randomIntBetween(random(), 1, 5);
|
||||
data[i] = new byte[fieldCount][];
|
||||
for (int j = 0; j < fieldCount; ++j) {
|
||||
final int length = rarely()
|
||||
? random().nextInt(1000)
|
||||
: random().nextInt(10);
|
||||
final byte[] arr = new byte[length];
|
||||
final int max = rarely() ? 256 : 2;
|
||||
for (int k = 0; k < length; ++k) {
|
||||
arr[k] = (byte) random().nextInt(max);
|
||||
}
|
||||
data[i][j] = arr;
|
||||
}
|
||||
}
|
||||
|
||||
final FieldType type = new FieldType(StringField.TYPE_STORED);
|
||||
type.setIndexed(false);
|
||||
type.freeze();
|
||||
IntField id = new IntField("id", 0, Store.YES);
|
||||
for (int i = 0; i < data.length; ++i) {
|
||||
Document doc = new Document();
|
||||
doc.add(id);
|
||||
id.setIntValue(i);
|
||||
for (int j = 0; j < data[i].length; ++j) {
|
||||
Field f = new Field("bytes" + j, data[i][j], type);
|
||||
doc.add(f);
|
||||
}
|
||||
iw.w.addDocument(doc);
|
||||
if (random().nextBoolean() && (i % (data.length / 10) == 0)) {
|
||||
iw.w.close();
|
||||
// switch codecs
|
||||
if (iwConf.getCodec() instanceof Lucene40Codec) {
|
||||
iwConf.setCodec(CompressingCodec.randomInstance(random()));
|
||||
} else {
|
||||
iwConf.setCodec(new Lucene40Codec());
|
||||
}
|
||||
iw = new RandomIndexWriter(random(), dir, iwConf);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
final int min = random().nextInt(data.length);
|
||||
final int max = min + random().nextInt(20);
|
||||
iw.deleteDocuments(NumericRangeQuery.newIntRange("id", min, max, true, false));
|
||||
}
|
||||
|
||||
iw.forceMerge(2); // force merges with deletions
|
||||
|
||||
iw.commit();
|
||||
|
||||
final DirectoryReader ir = DirectoryReader.open(dir);
|
||||
assertTrue(ir.numDocs() > 0);
|
||||
int numDocs = 0;
|
||||
for (int i = 0; i < ir.maxDoc(); ++i) {
|
||||
final StoredDocument doc = ir.document(i);
|
||||
if (doc == null) {
|
||||
continue;
|
||||
}
|
||||
++ numDocs;
|
||||
final int docId = doc.getField("id").numericValue().intValue();
|
||||
assertEquals(data[docId].length + 1, doc.getFields().size());
|
||||
for (int j = 0; j < data[docId].length; ++j) {
|
||||
final byte[] arr = data[docId][j];
|
||||
final BytesRef arr2Ref = doc.getBinaryValue("bytes" + j);
|
||||
final byte[] arr2 = Arrays.copyOfRange(arr2Ref.bytes, arr2Ref.offset, arr2Ref.offset + arr2Ref.length);
|
||||
assertArrayEquals(arr, arr2);
|
||||
}
|
||||
}
|
||||
assertTrue(ir.numDocs() <= numDocs);
|
||||
ir.close();
|
||||
|
||||
iw.deleteAll();
|
||||
iw.commit();
|
||||
iw.forceMerge(1);
|
||||
iw.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testReadSkip() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwConf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
|
||||
iwConf.setMaxBufferedDocs(RandomInts.randomIntBetween(random(), 2, 30));
|
||||
iwConf.setCodec(CompressingCodec.randomInstance(random()));
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConf);
|
||||
|
||||
FieldType ft = new FieldType();
|
||||
ft.setStored(true);
|
||||
ft.freeze();
|
||||
|
||||
final String string = _TestUtil.randomSimpleString(random(), 50);
|
||||
final byte[] bytes = string.getBytes("UTF-8");
|
||||
final long l = random().nextBoolean() ? random().nextInt(42) : random().nextLong();
|
||||
final int i = random().nextBoolean() ? random().nextInt(42) : random().nextInt();
|
||||
final float f = random().nextFloat();
|
||||
final double d = random().nextDouble();
|
||||
|
||||
List<Field> fields = Arrays.asList(
|
||||
new Field("bytes", bytes, ft),
|
||||
new Field("string", string, ft),
|
||||
new LongField("long", l, Store.YES),
|
||||
new IntField("int", i, Store.YES),
|
||||
new FloatField("float", f, Store.YES),
|
||||
new DoubleField("double", d, Store.YES)
|
||||
);
|
||||
|
||||
for (int k = 0; k < 100; ++k) {
|
||||
Document doc = new Document();
|
||||
for (Field fld : fields) {
|
||||
doc.add(fld);
|
||||
}
|
||||
iw.w.addDocument(doc);
|
||||
}
|
||||
iw.close();
|
||||
|
||||
final DirectoryReader reader = DirectoryReader.open(dir);
|
||||
final int docID = random().nextInt(100);
|
||||
for (Field fld : fields) {
|
||||
String fldName = fld.name();
|
||||
final StoredDocument sDoc = reader.document(docID, Collections.singleton(fldName));
|
||||
final StorableField sField = sDoc.getField(fldName);
|
||||
if (Field.class.equals(fld.getClass())) {
|
||||
assertEquals(fld.binaryValue(), sField.binaryValue());
|
||||
assertEquals(fld.stringValue(), sField.stringValue());
|
||||
} else {
|
||||
assertEquals(fld.numericValue(), sField.numericValue());
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
public class TestFastCompressionMode extends AbstractTestCompressionMode {
|
||||
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
mode = CompressionMode.FAST;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
public class TestFastUncompressionMode extends AbstractTestCompressionMode {
|
||||
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
mode = CompressionMode.FAST_UNCOMPRESSION;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
public class TestHighCompressionMode extends AbstractTestCompressionMode {
|
||||
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
mode = CompressionMode.HIGH_COMPRESSION;
|
||||
}
|
||||
}
|
|
@ -37,7 +37,7 @@ final class PackedReaderIterator extends PackedInts.ReaderIteratorImpl {
|
|||
this.format = format;
|
||||
bulkOperation = BulkOperation.of(format, bitsPerValue);
|
||||
iterations = bulkOperation.computeIterations(valueCount, mem);
|
||||
assert iterations > 0;
|
||||
assert valueCount == 0 || iterations > 0;
|
||||
nextBlocks = new long[iterations * bulkOperation.blockCount()];
|
||||
nextValues = new LongsRef(new long[iterations * bulkOperation.valueCount()], 0, 0);
|
||||
assert iterations * bulkOperation.valueCount() == nextValues.longs.length;
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
package org.apache.lucene.codecs.compressing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomInts;
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
||||
|
||||
/**
|
||||
* A codec that uses {@link CompressingStoredFieldsFormat} for its stored
|
||||
* fields and delegates to {@link Lucene40Codec} for everything else.
|
||||
*/
|
||||
public class CompressingCodec extends FilterCodec {
|
||||
|
||||
/**
|
||||
* Create a random instance.
|
||||
*/
|
||||
public static CompressingCodec randomInstance(Random random) {
|
||||
final CompressionMode mode = RandomPicks.randomFrom(random, CompressionMode.values());
|
||||
final int chunkSize = RandomInts.randomIntBetween(random, 1, 500);
|
||||
final CompressingStoredFieldsIndex index = RandomPicks.randomFrom(random, CompressingStoredFieldsIndex.values());
|
||||
return new CompressingCodec(mode, chunkSize, index);
|
||||
}
|
||||
|
||||
private final CompressingStoredFieldsFormat storedFieldsFormat;
|
||||
|
||||
/**
|
||||
* @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(CompressionMode, int, CompressingStoredFieldsIndex)
|
||||
*/
|
||||
public CompressingCodec(CompressionMode compressionMode, int chunkSize,
|
||||
CompressingStoredFieldsIndex storedFieldsIndexFormat) {
|
||||
super("Compressing", new Lucene40Codec());
|
||||
this.storedFieldsFormat = new CompressingStoredFieldsFormat(compressionMode, chunkSize, storedFieldsIndexFormat);
|
||||
}
|
||||
|
||||
public CompressingCodec() {
|
||||
this(CompressionMode.FAST, 1 << 14, CompressingStoredFieldsIndex.MEMORY_CHUNK);
|
||||
}
|
||||
|
||||
@Override
|
||||
public StoredFieldsFormat storedFieldsFormat() {
|
||||
return storedFieldsFormat;
|
||||
}
|
||||
}
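Plugging the codec into an IndexWriterConfig, as the tests in this patch do; this is a sketch only, the analyzer is whatever the caller already uses, and the no-argument constructor picks the defaults shown above (FAST mode, 16KB chunks, in-memory chunk index).

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.util.Version;

class CodecSetup {

  static IndexWriterConfig withCompressedStoredFields(Analyzer analyzer) {
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_40, analyzer);
    conf.setCodec(new CompressingCodec()); // FAST, 1 << 14 chunk size, MEMORY_CHUNK index
    return conf;
  }

}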
|
|
@ -0,0 +1,25 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
Support for testing {@link org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat}.
|
||||
</body>
|
||||
</html>
|
|
@ -31,6 +31,7 @@ import org.apache.lucene.codecs.Codec;
|
|||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.appending.AppendingCodec;
|
||||
import org.apache.lucene.codecs.asserting.AssertingCodec;
|
||||
import org.apache.lucene.codecs.compressing.CompressingCodec;
|
||||
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
|
||||
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
|
||||
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
|
||||
|
@ -165,6 +166,8 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
|
|||
codec = new AppendingCodec();
|
||||
} else if ("Asserting".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 7 && !shouldAvoidCodec("Asserting"))) {
|
||||
codec = new AssertingCodec();
|
||||
} else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
|
||||
codec = CompressingCodec.randomInstance(random);
|
||||
} else if (!"random".equals(TEST_CODEC)) {
|
||||
codec = Codec.forName(TEST_CODEC);
|
||||
} else if ("random".equals(TEST_POSTINGSFORMAT)) {
|
||||
|
|
|
@ -14,3 +14,4 @@
|
|||
# limitations under the License.
|
||||
|
||||
org.apache.lucene.codecs.asserting.AssertingCodec
|
||||
org.apache.lucene.codecs.compressing.CompressingCodec