LUCENE-9486: Use preset dictionaries with LZ4 for BEST_SPEED. (#1793)
commit 27aa5c5f59 (parent 22abecdbc1)
@@ -206,8 +206,8 @@ Optimizations
* LUCENE-9395: ConstantValuesSource now shares a single DoubleValues
  instance across all segments (Tony Xu)

* LUCENE-9447: BEST_COMPRESSION now provides higher compression ratios on highly
  compressible data. (Adrien Grand)
* LUCENE-9447, LUCENE-9486: Stored fields now get higher compression ratios on
  highly compressible data. (Adrien Grand)

* LUCENE-9373: FunctionMatchQuery now accepts a "matchCost" optimization hint.
  (Maxim Glazkov, David Smiley)
@@ -48,7 +48,7 @@ enum CompressionAlgorithm {

    @Override
    void read(DataInput in, byte[] out, int len) throws IOException {
      org.apache.lucene.util.compress.LZ4.decompress(in, len, out);
      org.apache.lucene.util.compress.LZ4.decompress(in, len, out, 0);
    }

  };
@@ -136,7 +136,7 @@ public abstract class CompressionMode {
      if (bytes.bytes.length < originalLength + 7) {
        bytes.bytes = new byte[ArrayUtil.oversize(originalLength + 7, 1)];
      }
      final int decompressedLength = LZ4.decompress(in, offset + length, bytes.bytes);
      final int decompressedLength = LZ4.decompress(in, offset + length, bytes.bytes, 0);
      if (decompressedLength > originalLength) {
        throw new CorruptIndexException("Corrupted: lengths mismatch: " + decompressedLength + " > " + originalLength, in);
      }
@@ -832,7 +832,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close
        }

        assert uncompressedBlockLength <= uncompressedBlock.length;
        LZ4.decompress(compressedData, uncompressedBlockLength, uncompressedBlock);
        LZ4.decompress(compressedData, uncompressedBlockLength, uncompressedBlock, 0);
      }

      uncompressedBytesRef.offset = uncompressedDocStarts[docInBlockId];
@@ -0,0 +1,227 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene87;

import java.io.IOException;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.codecs.compressing.Compressor;
import org.apache.lucene.codecs.compressing.Decompressor;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;

/**
 * A compression mode that trades speed for compression ratio. Although
 * compression and decompression might be slow, this compression mode should
 * provide a good compression ratio. This mode might be interesting if/when
 * your index size is much bigger than your OS cache.
 * @lucene.internal
 */
public final class DeflateWithPresetDictCompressionMode extends CompressionMode {

  private final int dictLength, subBlockLength;

  /** Sole constructor. */
  public DeflateWithPresetDictCompressionMode(int dictLength, int subBlockLength) {
    this.dictLength = dictLength;
    this.subBlockLength = subBlockLength;
  }

  @Override
  public Compressor newCompressor() {
    // notes:
    // 3 is the highest level that doesn't have lazy match evaluation
    // 6 is the default, higher than that is just a waste of cpu
    return new DeflateWithPresetDictCompressor(6, dictLength, subBlockLength);
  }

  @Override
  public Decompressor newDecompressor() {
    return new DeflateWithPresetDictDecompressor();
  }

  @Override
  public String toString() {
    return "BEST_COMPRESSION";
  }

  private static final class DeflateWithPresetDictDecompressor extends Decompressor {

    byte[] compressed;

    DeflateWithPresetDictDecompressor() {
      compressed = new byte[0];
    }

    private void doDecompress(DataInput in, Inflater decompressor, BytesRef bytes) throws IOException {
      final int compressedLength = in.readVInt();
      if (compressedLength == 0) {
        return;
      }
      // pad with extra "dummy byte": see javadocs for using Inflater(true)
      // we do it for compliance, but it's unnecessary for years in zlib.
      final int paddedLength = compressedLength + 1;
      compressed = ArrayUtil.grow(compressed, paddedLength);
      in.readBytes(compressed, 0, compressedLength);
      compressed[compressedLength] = 0; // explicitly set dummy byte to 0

      // extra "dummy byte"
      decompressor.setInput(compressed, 0, paddedLength);
      try {
        bytes.length += decompressor.inflate(bytes.bytes, bytes.length, bytes.bytes.length - bytes.length);
      } catch (DataFormatException e) {
        throw new IOException(e);
      }
      if (decompressor.finished() == false) {
        throw new CorruptIndexException("Invalid decoder state: needsInput=" + decompressor.needsInput()
            + ", needsDict=" + decompressor.needsDictionary(), in);
      }
    }

    @Override
    public void decompress(DataInput in, int originalLength, int offset, int length, BytesRef bytes) throws IOException {
      assert offset + length <= originalLength;
      if (length == 0) {
        bytes.length = 0;
        return;
      }
      final int dictLength = in.readVInt();
      final int blockLength = in.readVInt();
      bytes.bytes = ArrayUtil.grow(bytes.bytes, dictLength);
      bytes.offset = bytes.length = 0;

      final Inflater decompressor = new Inflater(true);
      try {
        // Read the dictionary
        doDecompress(in, decompressor, bytes);
        if (dictLength != bytes.length) {
          throw new CorruptIndexException("Unexpected dict length", in);
        }

        int offsetInBlock = dictLength;
        int offsetInBytesRef = offset;

        // Skip unneeded blocks
        while (offsetInBlock + blockLength < offset) {
          final int compressedLength = in.readVInt();
          in.skipBytes(compressedLength);
          offsetInBlock += blockLength;
          offsetInBytesRef -= blockLength;
        }

        // Read blocks that intersect with the interval we need
        while (offsetInBlock < offset + length) {
          bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + blockLength);
          decompressor.reset();
          decompressor.setDictionary(bytes.bytes, 0, dictLength);
          doDecompress(in, decompressor, bytes);
          offsetInBlock += blockLength;
        }

        bytes.offset = offsetInBytesRef;
        bytes.length = length;
        assert bytes.isValid();
      } finally {
        decompressor.end();
      }
    }

    @Override
    public Decompressor clone() {
      return new DeflateWithPresetDictDecompressor();
    }

  }

  private static class DeflateWithPresetDictCompressor extends Compressor {

    final byte[] dictBytes;
    final int blockLength;
    final Deflater compressor;
    byte[] compressed;
    boolean closed;

    DeflateWithPresetDictCompressor(int level, int dictLength, int blockLength) {
      compressor = new Deflater(level, true);
      compressed = new byte[64];
      this.dictBytes = new byte[dictLength];
      this.blockLength = blockLength;
    }

    private void doCompress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
      if (len == 0) {
        out.writeVInt(0);
        return;
      }
      compressor.setInput(bytes, off, len);
      compressor.finish();
      if (compressor.needsInput()) {
        throw new IllegalStateException();
      }

      int totalCount = 0;
      for (;;) {
        final int count = compressor.deflate(compressed, totalCount, compressed.length - totalCount);
        totalCount += count;
        assert totalCount <= compressed.length;
        if (compressor.finished()) {
          break;
        } else {
          compressed = ArrayUtil.grow(compressed);
        }
      }

      out.writeVInt(totalCount);
      out.writeBytes(compressed, totalCount);
    }

    @Override
    public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
      final int dictLength = Math.min(dictBytes.length, len);
      System.arraycopy(bytes, off, dictBytes, 0, dictLength);
      out.writeVInt(dictLength);
      out.writeVInt(blockLength);
      final int end = off + len;

      // Compress the dictionary first
      compressor.reset();
      doCompress(bytes, off, dictLength, out);

      // And then sub blocks
      for (int start = off + dictLength; start < end; start += blockLength) {
        compressor.reset();
        // NOTE: offset MUST be 0 when setting the dictionary in order to work around JDK-8252739
        compressor.setDictionary(dictBytes, 0, dictLength);
        doCompress(bytes, start, Math.min(blockLength, off + len - start), out);
      }
    }

    @Override
    public void close() throws IOException {
      if (closed == false) {
        compressor.end();
        closed = true;
      }
    }
  }
}
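A minimal round-trip sketch of the new mode through the generic Compressor/Decompressor API used by stored fields. This is not part of the patch; the dictionary and sub-block sizes, the input, and the helper classes used here (ByteBuffersDataOutput, ByteArrayDataInput, BytesRef, java.util.Random) are illustrative assumptions only.

CompressionMode mode = new DeflateWithPresetDictCompressionMode(4 * 1024, 60 * 1024);
byte[] chunk = new byte[256 * 1024];            // stand-in for a chunk of serialized documents
new java.util.Random(42).nextBytes(chunk);

ByteBuffersDataOutput out = new ByteBuffersDataOutput();
try (Compressor compressor = mode.newCompressor()) {
  compressor.compress(chunk, 0, chunk.length, out);   // writes dict VInts, block lengths, then sub blocks
}

// Reader side: only bytes [1000, 1500) of the original chunk are needed, so only
// the dictionary and the intersecting sub blocks get inflated.
BytesRef slice = new BytesRef();
mode.newDecompressor().decompress(new ByteArrayDataInput(out.toArrayCopy()), chunk.length, 1000, 500, slice);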
@@ -0,0 +1,199 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene87;

import java.io.IOException;

import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.codecs.compressing.Compressor;
import org.apache.lucene.codecs.compressing.Decompressor;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.compress.LZ4;

/**
 * A compression mode that compromises on the compression ratio to provide
 * fast compression and decompression.
 * @lucene.internal
 */
public final class LZ4WithPresetDictCompressionMode extends CompressionMode {

  private final int dictLength, subBlockLength;

  /** Sole constructor. */
  public LZ4WithPresetDictCompressionMode(int dictLength, int subBlockLength) {
    this.dictLength = dictLength;
    this.subBlockLength = subBlockLength;
  }

  @Override
  public Compressor newCompressor() {
    return new LZ4WithPresetDictCompressor(dictLength, subBlockLength);
  }

  @Override
  public Decompressor newDecompressor() {
    return new LZ4WithPresetDictDecompressor();
  }

  @Override
  public String toString() {
    return "BEST_SPEED";
  }

  private static final class LZ4WithPresetDictDecompressor extends Decompressor {

    private int[] compressedLengths;
    private byte[] buffer;

    LZ4WithPresetDictDecompressor() {
      compressedLengths = new int[0];
      buffer = new byte[0];
    }

    private int readCompressedLengths(DataInput in, int originalLength, int dictLength, int blockLength) throws IOException {
      in.readVInt(); // compressed length of the dictionary, unused
      int totalLength = dictLength;
      int i = 0;
      while (totalLength < originalLength) {
        compressedLengths = ArrayUtil.grow(compressedLengths, i+1);
        compressedLengths[i++] = in.readVInt();
        totalLength += blockLength;
      }
      return i;
    }

    @Override
    public void decompress(DataInput in, int originalLength, int offset, int length, BytesRef bytes) throws IOException {
      assert offset + length <= originalLength;

      if (length == 0) {
        bytes.length = 0;
        return;
      }

      final int dictLength = in.readVInt();
      final int blockLength = in.readVInt();

      final int numBlocks = readCompressedLengths(in, originalLength, dictLength, blockLength);

      buffer = ArrayUtil.grow(buffer, dictLength + blockLength);
      bytes.length = 0;
      // Read the dictionary
      if (LZ4.decompress(in, dictLength, buffer, 0) != dictLength) {
        throw new CorruptIndexException("Illegal dict length", in);
      }

      int offsetInBlock = dictLength;
      int offsetInBytesRef = offset;
      if (offset >= dictLength) {
        offsetInBytesRef -= dictLength;

        // Skip unneeded blocks
        int numBytesToSkip = 0;
        for (int i = 0; i < numBlocks && offsetInBlock + blockLength < offset; ++i) {
          int compressedBlockLength = compressedLengths[i];
          numBytesToSkip += compressedBlockLength;
          offsetInBlock += blockLength;
          offsetInBytesRef -= blockLength;
        }
        in.skipBytes(numBytesToSkip);
      } else {
        // The dictionary contains some bytes we need, copy its content to the BytesRef
        bytes.bytes = ArrayUtil.grow(bytes.bytes, dictLength);
        System.arraycopy(buffer, 0, bytes.bytes, 0, dictLength);
        bytes.length = dictLength;
      }

      // Read blocks that intersect with the interval we need
      while (offsetInBlock < offset + length) {
        final int bytesToDecompress = Math.min(blockLength, offset + length - offsetInBlock);
        LZ4.decompress(in, bytesToDecompress, buffer, dictLength);
        bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + bytesToDecompress);
        System.arraycopy(buffer, dictLength, bytes.bytes, bytes.length, bytesToDecompress);
        bytes.length += bytesToDecompress;
        offsetInBlock += blockLength;
      }

      bytes.offset = offsetInBytesRef;
      bytes.length = length;
      assert bytes.isValid();
    }

    @Override
    public Decompressor clone() {
      return new LZ4WithPresetDictDecompressor();
    }

  }

  private static class LZ4WithPresetDictCompressor extends Compressor {

    final int dictLength;
    final int blockLength;
    final ByteBuffersDataOutput compressed;
    final LZ4.FastCompressionHashTable hashTable;
    final byte[] buffer;

    LZ4WithPresetDictCompressor(int dictLength, int blockLength) {
      compressed = ByteBuffersDataOutput.newResettableInstance();
      hashTable = new LZ4.FastCompressionHashTable();
      this.dictLength = dictLength;
      this.blockLength = blockLength;
      buffer = new byte[dictLength + blockLength];
    }

    private void doCompress(byte[] bytes, int dictLen, int len, DataOutput out) throws IOException {
      long prevCompressedSize = compressed.size();
      LZ4.compressWithDictionary(bytes, 0, dictLen, len, compressed, hashTable);
      // Write the number of compressed bytes
      out.writeVInt(Math.toIntExact(compressed.size() - prevCompressedSize));
    }

    @Override
    public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
      final int dictLength = Math.min(this.dictLength, len);
      out.writeVInt(dictLength);
      out.writeVInt(blockLength);
      final int end = off + len;

      compressed.reset();
      // Compress the dictionary first
      System.arraycopy(bytes, off, buffer, 0, dictLength);
      doCompress(buffer, 0, dictLength, out);

      // And then sub blocks
      for (int start = off + dictLength; start < end; start += blockLength) {
        int l = Math.min(blockLength, off + len - start);
        System.arraycopy(bytes, start, buffer, dictLength, l);
        doCompress(buffer, dictLength, l, out);
      }

      // We only wrote lengths so far, now write compressed data
      compressed.copyTo(out);
    }

    @Override
    public void close() throws IOException {
      // no-op
    }
  }
}
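For orientation, a reading of the stream layout that the compressor above produces (not normative documentation, just read off the code): all the VInt lengths come first and the LZ4 payloads are appended at the end, which is exactly what readCompressedLengths() on the decompressor side relies on.

// vint dictLength                          min(configured dict length, chunk length)
// vint blockLength                         configured sub-block length
// vint compressedLength(dict)              read but otherwise unused by the decompressor
// vint compressedLength(block 1) ... vint compressedLength(block N)
// LZ4(dict) LZ4(block 1) ... LZ4(block N)  appended by compressed.copyTo(out)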
@@ -18,27 +18,17 @@ package org.apache.lucene.codecs.lucene87;

import java.io.IOException;
import java.util.Objects;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.codecs.compressing.Compressor;
import org.apache.lucene.codecs.compressing.Decompressor;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.packed.DirectMonotonicWriter;

/**
@@ -154,7 +144,7 @@ public class Lucene87StoredFieldsFormat extends StoredFieldsFormat {
  StoredFieldsFormat impl(Mode mode) {
    switch (mode) {
      case BEST_SPEED:
        return new CompressingStoredFieldsFormat("Lucene87StoredFieldsFastData", CompressionMode.FAST, 16*1024, 128, 10);
        return new CompressingStoredFieldsFormat("Lucene87StoredFieldsFastData", BEST_SPEED_MODE, BEST_SPEED_BLOCK_LENGTH, 512, 10);
      case BEST_COMPRESSION:
        return new CompressingStoredFieldsFormat("Lucene87StoredFieldsHighData", BEST_COMPRESSION_MODE, BEST_COMPRESSION_BLOCK_LENGTH, 512, 10);
      default: throw new AssertionError();
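The switch above is the only place where the two flavours differ for stored fields. A hedged selection sketch, not part of the patch; it assumes the existing public Lucene87StoredFieldsFormat(Mode) constructor, and wiring the format into an index goes through whichever codec wraps it in your setup.

StoredFieldsFormat fast  = new Lucene87StoredFieldsFormat(Lucene87StoredFieldsFormat.Mode.BEST_SPEED);        // LZ4 with preset dictionary
StoredFieldsFormat small = new Lucene87StoredFieldsFormat(Lucene87StoredFieldsFormat.Mode.BEST_COMPRESSION);  // DEFLATE with preset dictionary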
@@ -179,202 +169,17 @@ public class Lucene87StoredFieldsFormat extends StoredFieldsFormat {
  private static final int BEST_COMPRESSION_BLOCK_LENGTH = BEST_COMPRESSION_DICT_LENGTH + 10 * BEST_COMPRESSION_SUB_BLOCK_LENGTH - 8 * 1024;

  /** Compression mode for {@link Mode#BEST_COMPRESSION} */
  public static final DeflateWithPresetDict BEST_COMPRESSION_MODE = new DeflateWithPresetDict(BEST_COMPRESSION_DICT_LENGTH, BEST_COMPRESSION_SUB_BLOCK_LENGTH);
  public static final CompressionMode BEST_COMPRESSION_MODE = new DeflateWithPresetDictCompressionMode(BEST_COMPRESSION_DICT_LENGTH, BEST_COMPRESSION_SUB_BLOCK_LENGTH);

  /**
   * A compression mode that trades speed for compression ratio. Although
   * compression and decompression might be slow, this compression mode should
   * provide a good compression ratio. This mode might be interesting if/when
   * your index size is much bigger than your OS cache.
   */
  public static class DeflateWithPresetDict extends CompressionMode {
  // We need to re-initialize the hash table for every sub block with the
  // content of the dictionary, so we keep it small to not hurt indexing.
  private static final int BEST_SPEED_DICT_LENGTH = 4 * 1024;
  // 60kB so that dict_length + block_length == max window size
  private static final int BEST_SPEED_SUB_BLOCK_LENGTH = 60 * 1024;
  // shoot for 10 sub blocks in addition to the dictionary
  private static final int BEST_SPEED_BLOCK_LENGTH = BEST_SPEED_DICT_LENGTH + 10 * BEST_SPEED_SUB_BLOCK_LENGTH - 8 * 1024;

    private final int dictLength, subBlockLength;

    /** Sole constructor. */
    public DeflateWithPresetDict(int dictLength, int subBlockLength) {
      this.dictLength = dictLength;
      this.subBlockLength = subBlockLength;
    }

    @Override
    public Compressor newCompressor() {
      // notes:
      // 3 is the highest level that doesn't have lazy match evaluation
      // 6 is the default, higher than that is just a waste of cpu
      return new DeflateWithPresetDictCompressor(6, dictLength, subBlockLength);
    }

    @Override
    public Decompressor newDecompressor() {
      return new DeflateWithPresetDictDecompressor();
    }

    @Override
    public String toString() {
      return "BEST_COMPRESSION";
    }

  };

  private static final class DeflateWithPresetDictDecompressor extends Decompressor {

    byte[] compressed;

    DeflateWithPresetDictDecompressor() {
      compressed = new byte[0];
    }

    private void doDecompress(DataInput in, Inflater decompressor, BytesRef bytes) throws IOException {
      final int compressedLength = in.readVInt();
      if (compressedLength == 0) {
        return;
      }
      // pad with extra "dummy byte": see javadocs for using Inflater(true)
      // we do it for compliance, but it's unnecessary for years in zlib.
      final int paddedLength = compressedLength + 1;
      compressed = ArrayUtil.grow(compressed, paddedLength);
      in.readBytes(compressed, 0, compressedLength);
      compressed[compressedLength] = 0; // explicitly set dummy byte to 0

      // extra "dummy byte"
      decompressor.setInput(compressed, 0, paddedLength);
      try {
        bytes.length += decompressor.inflate(bytes.bytes, bytes.length, bytes.bytes.length - bytes.length);
      } catch (DataFormatException e) {
        throw new IOException(e);
      }
      if (decompressor.finished() == false) {
        throw new CorruptIndexException("Invalid decoder state: needsInput=" + decompressor.needsInput()
            + ", needsDict=" + decompressor.needsDictionary(), in);
      }
    }

    @Override
    public void decompress(DataInput in, int originalLength, int offset, int length, BytesRef bytes) throws IOException {
      assert offset + length <= originalLength;
      if (length == 0) {
        bytes.length = 0;
        return;
      }
      final int dictLength = in.readVInt();
      final int blockLength = in.readVInt();
      bytes.bytes = ArrayUtil.grow(bytes.bytes, dictLength);
      bytes.offset = bytes.length = 0;

      final Inflater decompressor = new Inflater(true);
      try {
        // Read the dictionary
        doDecompress(in, decompressor, bytes);
        if (dictLength != bytes.length) {
          throw new CorruptIndexException("Unexpected dict length", in);
        }

        int offsetInBlock = dictLength;
        int offsetInBytesRef = offset;

        // Skip unneeded blocks
        while (offsetInBlock + blockLength < offset) {
          final int compressedLength = in.readVInt();
          in.skipBytes(compressedLength);
          offsetInBlock += blockLength;
          offsetInBytesRef -= blockLength;
        }

        // Read blocks that intersect with the interval we need
        while (offsetInBlock < offset + length) {
          bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + blockLength);
          decompressor.reset();
          decompressor.setDictionary(bytes.bytes, 0, dictLength);
          doDecompress(in, decompressor, bytes);
          offsetInBlock += blockLength;
        }

        bytes.offset = offsetInBytesRef;
        bytes.length = length;
        assert bytes.isValid();
      } finally {
        decompressor.end();
      }
    }

    @Override
    public Decompressor clone() {
      return new DeflateWithPresetDictDecompressor();
    }

  }

  private static class DeflateWithPresetDictCompressor extends Compressor {

    final byte[] dictBytes;
    final int blockLength;
    final Deflater compressor;
    byte[] compressed;
    boolean closed;

    DeflateWithPresetDictCompressor(int level, int dictLength, int blockLength) {
      compressor = new Deflater(level, true);
      compressed = new byte[64];
      this.dictBytes = new byte[dictLength];
      this.blockLength = blockLength;
    }

    private void doCompress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
      if (len == 0) {
        out.writeVInt(0);
        return;
      }
      compressor.setInput(bytes, off, len);
      compressor.finish();
      if (compressor.needsInput()) {
        throw new IllegalStateException();
      }

      int totalCount = 0;
      for (;;) {
        final int count = compressor.deflate(compressed, totalCount, compressed.length - totalCount);
        totalCount += count;
        assert totalCount <= compressed.length;
        if (compressor.finished()) {
          break;
        } else {
          compressed = ArrayUtil.grow(compressed);
        }
      }

      out.writeVInt(totalCount);
      out.writeBytes(compressed, totalCount);
    }

    @Override
    public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
      final int dictLength = Math.min(dictBytes.length, len);
      System.arraycopy(bytes, off, dictBytes, 0, dictLength);
      out.writeVInt(dictLength);
      out.writeVInt(blockLength);
      final int end = off + len;

      // Compress the dictionary first
      compressor.reset();
      doCompress(bytes, off, dictLength, out);

      // And then sub blocks
      for (int start = off + dictLength; start < end; start += blockLength) {
        compressor.reset();
        // NOTE: offset MUST be 0 when setting the dictionary in order to work around JDK-8252739
        compressor.setDictionary(dictBytes, 0, dictLength);
        doCompress(bytes, start, Math.min(blockLength, off + len - start), out);
      }
    }

    @Override
    public void close() throws IOException {
      if (closed == false) {
        compressor.end();
        closed = true;
      }
    }
  }
  /** Compression mode for {@link Mode#BEST_SPEED} */
  public static final CompressionMode BEST_SPEED_MODE = new LZ4WithPresetDictCompressionMode(BEST_SPEED_DICT_LENGTH, BEST_SPEED_SUB_BLOCK_LENGTH);

}
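Worked out from the constants above (not part of the patch): the dictionary plus one sub block exactly fill the LZ4 window, and the resulting chunk size handed to CompressingStoredFieldsFormat is just under ten sub blocks plus the dictionary.

// BEST_SPEED_DICT_LENGTH + BEST_SPEED_SUB_BLOCK_LENGTH = 4 * 1024 + 60 * 1024 = 65,536 bytes (the 64kB LZ4 window)
// BEST_SPEED_BLOCK_LENGTH = 4 * 1024 + 10 * 60 * 1024 - 8 * 1024 = 610,304 bytes (~596kB chunk size)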
@@ -55,7 +55,6 @@ public final class LZ4 {
  static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals
  static final int HASH_LOG_HC = 15; // log size of the dictionary for compressHC
  static final int HASH_TABLE_SIZE_HC = 1 << HASH_LOG_HC;
  static final int OPTIMAL_ML = 0x0F + 4 - 1; // match length that doesn't require an additional byte

  private static int hash(int i, int hashBits) {

@@ -77,14 +76,15 @@
  }

  /**
   * Decompress at least <code>decompressedLen</code> bytes into
   * <code>dest[dOff:]</code>. Please note that <code>dest</code> must be large
   * Decompress at least {@code decompressedLen} bytes into
   * {@code dest[dOff:]}. Please note that {@code dest} must be large
   * enough to be able to hold <b>all</b> decompressed data (meaning that you
   * need to know the total decompressed length).
   * If the given bytes were compressed using a preset dictionary then the same
   * dictionary must be provided in {@code dest[dOff-dictLen:dOff]}.
   */
  public static int decompress(DataInput compressed, int decompressedLen, byte[] dest) throws IOException {
    int dOff = 0;
    final int destEnd = dest.length;
  public static int decompress(DataInput compressed, int decompressedLen, byte[] dest, int dOff) throws IOException {
    final int destEnd = dOff + decompressedLen;

    do {
      // literals

@@ -103,7 +103,7 @@
        dOff += literalLen;
      }

      if (dOff >= decompressedLen) {
      if (dOff >= destEnd) {
        break;
      }

@@ -133,7 +133,7 @@
        System.arraycopy(dest, dOff - matchDec, dest, dOff, fastLen);
        dOff += matchLen;
      }
    } while (dOff < decompressedLen);
    } while (dOff < destEnd);

    return dOff;
  }
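A hedged sketch (not in the patch) of the contract the new dOff parameter adds: the caller sizes dest for dOff + decompressedLen bytes and, when a preset dictionary was used at compression time, places the same dictionary bytes immediately before dOff. The helper name and arguments are illustrative only.

static byte[] decompressWithPresetDict(DataInput in, byte[] dict, int decompressedLen) throws IOException {
  byte[] dest = new byte[dict.length + decompressedLen];     // room for dictionary + payload
  System.arraycopy(dict, 0, dest, 0, dict.length);           // dictionary sits in dest[dOff - dictLen : dOff]
  LZ4.decompress(in, decompressedLen, dest, dict.length);    // fills dest[dictLen:]
  return ArrayUtil.copyOfSubArray(dest, dict.length, dest.length);
}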
@@ -190,6 +190,9 @@
    /** Reset this hash table in order to compress the given content. */
    abstract void reset(byte[] b, int off, int len);

    /** Init {@code dictLen} bytes to be used as a dictionary. */
    abstract void initDictionary(int dictLen);

    /**
     * Advance the cursor to {@code off} and return an index that stored the same
     * 4 bytes as {@code b[o:o+4)}. This may only be called on strictly

@@ -229,7 +232,6 @@
      Objects.checkFromIndexSize(off, len, bytes.length);
      this.bytes = bytes;
      this.base = off;
      this.lastOff = off - 1;
      this.end = off + len;
      final int bitsPerOffset = PackedInts.bitsRequired(len - LAST_LITERALS);
      final int bitsPerOffsetLog = 32 - Integer.numberOfLeadingZeros(bitsPerOffset - 1);

@@ -239,8 +241,18 @@
      } else {
        // Avoid calling hashTable.clear(), this makes it costly to compress many short sequences otherwise.
        // Instead, get() checks that references are less than the current offset.
        get(off); // this sets the hashTable for the first 4 bytes as a side-effect
      }
      this.lastOff = off - 1;
    }

    @Override
    void initDictionary(int dictLen) {
      for (int i = 0; i < dictLen; ++i) {
        final int v = readInt(bytes, base + i);
        final int h = hash(v, hashLog);
        hashTable.set(h, i);
      }
      lastOff += dictLen;
    }

    @Override

@@ -327,9 +339,18 @@
      this.end = off + len;
    }

    @Override
    void initDictionary(int dictLen) {
      assert next == base;
      for (int i = 0; i < dictLen; ++i) {
        addHash(base + i);
      }
      next += dictLen;
    }

    @Override
    int get(int off) {
      assert off > next;
      assert off >= next;
      assert off < end;

      for (; next < off; next++) {

@@ -389,23 +410,40 @@
  }

  /**
   * Compress <code>bytes[off:off+len]</code> into <code>out</code> using
   * at most 16KB of memory. <code>ht</code> shouldn't be shared across threads
   * but can safely be reused.
   * Compress {@code bytes[off:off+len]} into {@code out} using at most 16kB of
   * memory. {@code ht} shouldn't be shared across threads but can safely be
   * reused.
   */
  public static void compress(byte[] bytes, int off, int len, DataOutput out, HashTable ht) throws IOException {
    Objects.checkFromIndexSize(off, len, bytes.length);
    compressWithDictionary(bytes, off, 0, len, out, ht);
  }

    final int base = off;
    final int end = off + len;
  /**
   * Compress {@code bytes[dictOff+dictLen:dictOff+dictLen+len]} into
   * {@code out} using at most 16kB of memory.
   * {@code bytes[dictOff:dictOff+dictLen]} will be used as a dictionary.
   * {@code dictLen} must not be greater than 64kB, the maximum window size.
   *
   * {@code ht} shouldn't be shared across threads but can safely be reused.
   */
  public static void compressWithDictionary(byte[] bytes, int dictOff, int dictLen, int len, DataOutput out, HashTable ht) throws IOException {
    Objects.checkFromIndexSize(dictOff, dictLen, bytes.length);
    Objects.checkFromIndexSize(dictOff + dictLen, len, bytes.length);
    if (dictLen > MAX_DISTANCE) {
      throw new IllegalArgumentException("dictLen must not be greater than 64kB, but got " + dictLen);
    }

    int anchor = off++;
    final int end = dictOff + dictLen + len;

    int off = dictOff + dictLen;
    int anchor = off;

    if (len > LAST_LITERALS + MIN_MATCH) {

      final int limit = end - LAST_LITERALS;
      final int matchLimit = limit - MIN_MATCH;
      ht.reset(bytes, base, len);
      ht.reset(bytes, dictOff, dictLen + len);
      ht.initDictionary(dictLen);

      main:
      while (off <= limit) {

@@ -417,7 +455,7 @@
        }
        ref = ht.get(off);
        if (ref != -1) {
          assert ref >= base && ref < off;
          assert ref >= dictOff && ref < off;
          assert readInt(bytes, ref) == readInt(bytes, off);
          break;
        }

@@ -428,7 +466,7 @@
      int matchLen = MIN_MATCH + commonBytes(bytes, ref + MIN_MATCH, off + MIN_MATCH, limit);

      // try to find a better match
      for (int r = ht.previous(ref), min = Math.max(off - MAX_DISTANCE + 1, base); r >= min; r = ht.previous(r)) {
      for (int r = ht.previous(ref), min = Math.max(off - MAX_DISTANCE + 1, dictOff); r >= min; r = ht.previous(r)) {
        assert readInt(bytes, r) == readInt(bytes, off);
        int rMatchLen = MIN_MATCH + commonBytes(bytes, r + MIN_MATCH, off + MIN_MATCH, limit);
        if (rMatchLen > matchLen) {
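On the compression side the same contract holds: dictionary and data must be contiguous in the input array, with the dictionary no larger than the 64kB window. A hedged sketch (not in the patch), mirroring what LZ4WithPresetDictCompressor does with its reusable buffer and hash table; the helper name and arguments are illustrative only.

static void compressBlockWithDict(byte[] dict, byte[] block, DataOutput out,
                                  LZ4.FastCompressionHashTable reusedHashTable) throws IOException {
  byte[] window = new byte[dict.length + block.length];      // dictionary immediately before the data
  System.arraycopy(dict, 0, window, 0, dict.length);
  System.arraycopy(block, 0, window, dict.length, block.length);
  LZ4.compressWithDictionary(window, 0, dict.length, block.length, out, reusedHashTable);
}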
@@ -46,6 +46,12 @@ public abstract class LZ4TestCase extends LuceneTestCase {
        assertTrue(in.assertReset());
      }

      @Override
      void initDictionary(int dictLen) {
        assertTrue(in.assertReset());
        in.initDictionary(dictLen);
      }

      @Override
      int get(int off) {
        return in.get(off);

@@ -64,7 +70,7 @@ public abstract class LZ4TestCase extends LuceneTestCase {
  }

  private void doTest(byte[] data, LZ4.HashTable hashTable) throws IOException {
    int offset = random().nextBoolean()
    int offset = data.length >= (1 << 16) || random().nextBoolean()
        ? random().nextInt(10)
        : (1<<16) - data.length / 2; // this triggers special reset logic for high compression
    byte[] copy = new byte[data.length + offset + random().nextInt(10)];

@@ -135,8 +141,57 @@ public abstract class LZ4TestCase extends LuceneTestCase {

    // Now restore and compare bytes
    byte[] restored = new byte[length + random().nextInt(10)];
    LZ4.decompress(new ByteArrayDataInput(compressed), length, restored);
    LZ4.decompress(new ByteArrayDataInput(compressed), length, restored, 0);
    assertArrayEquals(ArrayUtil.copyOfSubArray(data, offset, offset+length), ArrayUtil.copyOfSubArray(restored, 0, length));

    // Now restore with an offset
    int restoreOffset = TestUtil.nextInt(random(), 1, 10);
    restored = new byte[restoreOffset + length + random().nextInt(10)];
    LZ4.decompress(new ByteArrayDataInput(compressed), length, restored, restoreOffset);
    assertArrayEquals(ArrayUtil.copyOfSubArray(data, offset, offset+length), ArrayUtil.copyOfSubArray(restored, restoreOffset, restoreOffset+length));
  }

  private void doTestWithDictionary(byte[] data, LZ4.HashTable hashTable) throws IOException {
    ByteBuffersDataOutput copy = new ByteBuffersDataOutput();
    int dictOff = TestUtil.nextInt(random(), 0, 10);
    copy.writeBytes(new byte[dictOff]);

    // Create a dictionary from substrings of the input to compress
    int dictLen = 0;
    for (int i = TestUtil.nextInt(random(), 0, data.length); i < data.length && dictLen < LZ4.MAX_DISTANCE; ) {
      int l = Math.min(data.length - i, TestUtil.nextInt(random(), 1, 32));
      l = Math.min(l, LZ4.MAX_DISTANCE - dictLen);
      copy.writeBytes(data, i, l);
      dictLen += l;
      i += l;
      i += TestUtil.nextInt(random(), 1, 32);
    }

    copy.writeBytes(data);
    copy.writeBytes(new byte[random().nextInt(10)]);

    byte[] copyBytes = copy.toArrayCopy();
    doTestWithDictionary(copyBytes, dictOff, dictLen, data.length, hashTable);
  }

  private void doTestWithDictionary(byte[] data, int dictOff, int dictLen, int length, LZ4.HashTable hashTable) throws IOException {
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    LZ4.compressWithDictionary(data, dictOff, dictLen, length, out, hashTable);
    byte[] compressed = out.toArrayCopy();

    // Compress once again with the same hash table to test reuse
    ByteBuffersDataOutput out2 = new ByteBuffersDataOutput();
    LZ4.compressWithDictionary(data, dictOff, dictLen, length, out2, hashTable);
    assertArrayEquals(compressed, out2.toArrayCopy());

    // Now restore and compare bytes
    int restoreOffset = TestUtil.nextInt(random(), 1, 10);
    byte[] restored = new byte[restoreOffset + dictLen + length + random().nextInt(10)];
    System.arraycopy(data, dictOff, restored, restoreOffset, dictLen);
    LZ4.decompress(new ByteArrayDataInput(compressed), length, restored, dictLen + restoreOffset);
    assertArrayEquals(
        ArrayUtil.copyOfSubArray(data, dictOff+dictLen, dictOff+dictLen+length),
        ArrayUtil.copyOfSubArray(restored, restoreOffset+dictLen, restoreOffset+dictLen+length));
  }

  public void testEmpty() throws IOException {

@@ -149,6 +204,7 @@ public abstract class LZ4TestCase extends LuceneTestCase {
    // literals and match lengths <= 15
    final byte[] data = "1234562345673456745678910123".getBytes(StandardCharsets.UTF_8);
    doTest(data, newHashTable());
    doTestWithDictionary(data, newHashTable());
  }

  public void testLongMatchs() throws IOException {

@@ -179,10 +235,11 @@ public abstract class LZ4TestCase extends LuceneTestCase {
    byte[] b = new byte[TestUtil.nextInt(random(), 1, 1 << 32)];
    random().nextBytes(b);
    doTest(b, newHashTable());
    doTestWithDictionary(b, newHashTable());
  }

  public void testCompressibleRandom() throws IOException {
    byte[] b = new byte[TestUtil.nextInt(random(), 1, 1 << 32)];
    byte[] b = new byte[TestUtil.nextInt(random(), 1, 1 << 18)];
    final int base = random().nextInt(256);
    final int maxDelta = 1 + random().nextInt(8);
    Random r = random();

@@ -190,6 +247,7 @@ public abstract class LZ4TestCase extends LuceneTestCase {
      b[i] = (byte) (base + r.nextInt(maxDelta));
    }
    doTest(b, newHashTable());
    doTestWithDictionary(b, newHashTable());
  }

  public void testLUCENE5201() throws IOException {

@@ -245,4 +303,21 @@ public abstract class LZ4TestCase extends LuceneTestCase {
    };
    doTest(data, 9, data.length - 9, newHashTable());
  }

  public void testUseDictionary() throws IOException {
    byte[] b = new byte[] {
        1, 2, 3, 4, 5, 6, // dictionary
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
    };
    int dictOff = 0;
    int dictLen = 6;
    int len = b.length - dictLen;

    doTestWithDictionary(b, dictOff, dictLen, len, newHashTable());
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    LZ4.compressWithDictionary(b, dictOff, dictLen, len, out, newHashTable());

    // The compressed output is smaller than the original input despite being incompressible on its own
    assertTrue(out.size() < len);
  }
}
@@ -33,14 +33,11 @@ import org.apache.lucene.index.NoDeletionPolicy;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

// See: https://github.com/DmitryKey/luke/issues/111
@LuceneTestCase.SuppressCodecs({
    "SimpleText", "DeflateWithPresetCompressingStoredFieldsData", "DummyCompressingStoredFieldsData", "HighCompressionCompressingStoredFieldsData", "FastCompressingStoredFieldsData", "FastDecompressionCompressingStoredFieldsData"
})
public class CommitsImplTest extends LuceneTestCase {

  private DirectoryReader reader;

@@ -63,7 +60,7 @@ public class CommitsImplTest extends LuceneTestCase {

    Directory dir = newFSDirectory(indexDir);

    IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
    IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random())).setCodec(TestUtil.getDefaultCodec());
    config.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
@@ -37,7 +37,7 @@ public abstract class CompressingCodec extends FilterCodec {
   * Create a random instance.
   */
  public static CompressingCodec randomInstance(Random random, int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix, int blockShift) {
    switch (random.nextInt(5)) {
    switch (random.nextInt(6)) {
      case 0:
        return new FastCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix, blockShift);
      case 1:

@@ -48,6 +48,8 @@
        return new DummyCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix, blockShift);
      case 4:
        return new DeflateWithPresetCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix, blockShift);
      case 5:
        return new LZ4WithPresetCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix, blockShift);
      default:
        throw new AssertionError();
    }
@@ -16,16 +16,16 @@
 */
package org.apache.lucene.codecs.compressing;

import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat.DeflateWithPresetDict;
import org.apache.lucene.codecs.lucene87.DeflateWithPresetDictCompressionMode;

/** CompressionCodec that uses {@link DeflateWithPresetDict}. */
/** CompressionCodec that uses {@link DeflateWithPresetDictCompressionMode}. */
public class DeflateWithPresetCompressingCodec extends CompressingCodec {

  /** Constructor that allows to configure the chunk size. */
  public DeflateWithPresetCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix, int blockSize) {
    super("DeflateWithPresetCompressingStoredFieldsData",
        withSegmentSuffix ? "DeflateWithPresetCompressingStoredFields" : "",
        new DeflateWithPresetDict(chunkSize/10, chunkSize/3+1), chunkSize, maxDocsPerChunk, blockSize);
        new DeflateWithPresetDictCompressionMode(chunkSize/10, chunkSize/3+1), chunkSize, maxDocsPerChunk, blockSize);
  }

  /** No-arg constructor. */
@@ -0,0 +1,36 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.compressing;

import org.apache.lucene.codecs.lucene87.LZ4WithPresetDictCompressionMode;

/** CompressionCodec that uses {@link LZ4WithPresetDictCompressionMode}. */
public class LZ4WithPresetCompressingCodec extends CompressingCodec {

  /** Constructor that allows to configure the chunk size. */
  public LZ4WithPresetCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix, int blockSize) {
    super("LZ4WithPresetCompressingStoredFieldsData",
        withSegmentSuffix ? "DeflateWithPresetCompressingStoredFields" : "",
        new LZ4WithPresetDictCompressionMode(chunkSize/10, chunkSize/3+1), chunkSize, maxDocsPerChunk, blockSize);
  }

  /** No-arg constructor. */
  public LZ4WithPresetCompressingCodec() {
    this(1<<18, 512, false, 10);
  }

}
@@ -19,4 +19,5 @@ org.apache.lucene.codecs.compressing.DeflateWithPresetCompressingCodec
org.apache.lucene.codecs.compressing.FastCompressingCodec
org.apache.lucene.codecs.compressing.FastDecompressionCompressingCodec
org.apache.lucene.codecs.compressing.HighCompressionCompressingCodec
org.apache.lucene.codecs.compressing.LZ4WithPresetCompressingCodec
org.apache.lucene.codecs.compressing.dummy.DummyCompressingCodec