LUCENE-4702: Terms dictionary compression. (#1126)

Compress blocks of suffixes in order to make the terms dictionary more
space-efficient. Two compression algorithms are used depending on which one is
more space-efficient:
 - LowercaseAsciiCompression, which applies when all bytes are in the
   `[0x1F,0x3F)` or `[0x5F,0x7F)` ranges, which notably include all digits,
   lowercase ASCII characters, '.', '-' and '_', and encodes 4 chars on 3 bytes.
   It is very often applicable on analyzed content and decompresses very quickly
   thanks to auto-vectorization support in the JVM.
 - LZ4, when the compression ratio is less than 0.75.

I was a bit unhappy with the complexity of the high-compression LZ4 option, so
I simplified it in order to only keep the logic that detects duplicate strings.
The logic about what to do in case overlapping matches are found, which was
responsible for most of the complexity while only yielding tiny benefits, has
been removed.
This commit is contained in:
Adrien Grand 2020-01-24 14:46:57 +01:00 committed by GitHub
parent a29a4f4aa5
commit b283b8df62
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
25 changed files with 1463 additions and 799 deletions

View File

@ -473,3 +473,35 @@ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
---
core/src/java/org/apache/lucene/util/compress/LZ4.java is a Java
implementation of the LZ4 (https://github.com/lz4/lz4/tree/dev/lib)
compression format for Lucene's DataInput/DataOutput abstractions.
LZ4 Library
Copyright (c) 2011-2016, Yann Collet
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -30,6 +30,11 @@ http://bitbucket.org/jpbarrette/moman/overview/
The class org.apache.lucene.util.WeakIdentityMap was derived from
the Apache CXF project and is Apache License 2.0.
The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4
compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed
under the 2-clause BSD license.
(https://opensource.org/licenses/bsd-license.php)
The Google Code Prettify is Apache License 2.0.
See http://code.google.com/p/google-code-prettify/

View File

@ -131,8 +131,11 @@ public final class BlockTreeTermsReader extends FieldsProducer {
/** The long[] + byte[] metadata has been replaced with a single byte[]. */
public static final int VERSION_META_LONGS_REMOVED = 4;
/** Suffixes are compressed to save space. */
public static final int VERSION_COMPRESSED_SUFFIXES = 5;
/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_META_LONGS_REMOVED;
public static final int VERSION_CURRENT = VERSION_COMPRESSED_SUFFIXES;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
@ -35,6 +36,7 @@ import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
@ -45,10 +47,12 @@ import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.compress.LZ4;
import org.apache.lucene.util.compress.LowercaseAsciiCompression;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Util;
/*
@ -635,6 +639,16 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
newBlocks.clear();
}
/**
 * Tells whether every byte of {@code b} in the range {@code [startOffset, endOffset)} is equal
 * to {@code value}. An empty range yields {@code true}.
 *
 * @param b           the array to inspect
 * @param startOffset first index to inspect (inclusive)
 * @param endOffset   index at which to stop (exclusive)
 * @param value       the byte every inspected position is compared against
 * @return {@code true} if no inspected byte differs from {@code value}
 * @throws IndexOutOfBoundsException if the range is not a valid sub-range of {@code b}
 */
private boolean allEqual(byte[] b, int startOffset, int endOffset, byte value) {
  // Validate the whole range up-front instead of failing part-way through the scan.
  Objects.checkFromToIndex(startOffset, endOffset, b.length);
  int pos = startOffset;
  while (pos < endOffset && b[pos] == value) {
    pos++;
  }
  // The scan only reaches endOffset when no mismatching byte stopped it early.
  return pos == endOffset;
}
/** Writes the specified slice (start is inclusive, end is exclusive)
* from pending stack as a new block. If isFloor is true, there
* were too many (more than maxItemsInBlock) entries sharing the
@ -703,8 +717,8 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
//}
// For leaf block we write suffix straight
suffixWriter.writeVInt(suffix);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
suffixLengthsWriter.writeVInt(suffix);
suffixWriter.append(term.termBytes, prefixLength, suffix);
assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;
// Write term stats, to separate byte[] blob:
@ -741,8 +755,8 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// it's a prefix term. Terms cannot be larger than ~32 KB
// so we won't run out of bits:
suffixWriter.writeVInt(suffix << 1);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
suffixLengthsWriter.writeVInt(suffix << 1);
suffixWriter.append(term.termBytes, prefixLength, suffix);
// Write term stats, to separate byte[] blob:
statsWriter.writeVInt(state.docFreq);
@ -772,8 +786,8 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// For non-leaf block we borrow 1 bit to record
// if entry is term or sub-block:
suffixWriter.writeVInt((suffix<<1)|1);
suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
suffixLengthsWriter.writeVInt((suffix<<1)|1);
suffixWriter.append(block.prefix.bytes, prefixLength, suffix);
//if (DEBUG2) {
// BytesRef suffixBytes = new BytesRef(suffix);
@ -785,7 +799,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
assert floorLeadLabel == -1 || (block.prefix.bytes[prefixLength] & 0xff) >= floorLeadLabel: "floorLeadLabel=" + floorLeadLabel + " suffixLead=" + (block.prefix.bytes[prefixLength] & 0xff);
assert block.fp < startFP;
suffixWriter.writeVLong(startFP - block.fp);
suffixLengthsWriter.writeVLong(startFP - block.fp);
subIndices.add(block.index);
}
}
@ -793,19 +807,69 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
assert subIndices.size() != 0;
}
// TODO: we could block-write the term suffix pointers;
// this would take more space but would enable binary
// search on lookup
// Write suffixes byte[] blob to terms dict output, either uncompressed, compressed with LZ4 or with LowercaseAsciiCompression.
CompressionAlgorithm compressionAlg = CompressionAlgorithm.NO_COMPRESSION;
// If there are 2 suffix bytes or fewer per term on average, we don't bother compressing: suffixes are then
// unlikely to be what makes the terms dictionary large. This is also frequently the case for dense IDs such
// as auto-increment IDs, so skipping compression here avoids slowing down ID lookups too much.
if (suffixWriter.length() > 2L * numEntries) {
LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable);
if (spareWriter.size() < suffixWriter.length() - (suffixWriter.length() >>> 2)) {
// LZ4 saved more than 25%, go for it
compressionAlg = CompressionAlgorithm.LZ4;
} else {
spareWriter.reset();
if (spareBytes.length < suffixWriter.length()) {
spareBytes = new byte[ArrayUtil.oversize(suffixWriter.length(), 1)];
}
if (LowercaseAsciiCompression.compress(suffixWriter.bytes(), suffixWriter.length(), spareBytes, spareWriter)) {
compressionAlg = CompressionAlgorithm.LOWERCASE_ASCII;
}
}
}
long token = ((long) suffixWriter.length()) << 3;
if (isLeafBlock) {
token |= 0x04;
}
token |= compressionAlg.code;
termsOut.writeVLong(token);
if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) {
termsOut.writeBytes(suffixWriter.bytes(), suffixWriter.length());
} else {
spareWriter.copyTo(termsOut);
}
suffixWriter.setLength(0);
spareWriter.reset();
// Write suffixes byte[] blob to terms dict output:
termsOut.writeVInt((int) (suffixWriter.size() << 1) | (isLeafBlock ? 1:0));
suffixWriter.copyTo(termsOut);
suffixWriter.reset();
// Write suffix lengths
final int numSuffixBytes = Math.toIntExact(suffixLengthsWriter.size());
spareBytes = ArrayUtil.grow(spareBytes, numSuffixBytes);
suffixLengthsWriter.copyTo(new ByteArrayDataOutput(spareBytes));
suffixLengthsWriter.reset();
if (allEqual(spareBytes, 1, numSuffixBytes, spareBytes[0])) {
// Structured fields like IDs often have most values of the same length
termsOut.writeVInt((numSuffixBytes << 1) | 1);
termsOut.writeByte(spareBytes[0]);
} else {
// Still give LZ4 a chance, there might be runs of terms with the same length
termsOut.writeVInt(numSuffixBytes << 1);
LZ4.compress(spareBytes, 0, numSuffixBytes, termsOut, compressionHashTable);
}
// Write term stats byte[] blob
termsOut.writeVInt((int) statsWriter.size());
statsWriter.copyTo(termsOut);
// Stats
final int numStatsBytes = Math.toIntExact(statsWriter.size());
spareBytes = ArrayUtil.grow(spareBytes, numStatsBytes);
statsWriter.copyTo(new ByteArrayDataOutput(spareBytes));
statsWriter.reset();
if (allEqual(spareBytes, 0, numStatsBytes, (byte) 1)) {
// ID fields would typically have blocks full of ones
// LZ4 would optimize this as well but we keep explicit specialization because the decoding logic is a bit faster
termsOut.writeVInt((numStatsBytes << 1) | 1);
} else {
// Still give LZ4 a chance otherwise, there might be runs of ones even if not all values are ones
termsOut.writeVInt(numStatsBytes << 1);
LZ4.compress(spareBytes, 0, numStatsBytes, termsOut, compressionHashTable);
}
// Write term meta data byte[] blob
termsOut.writeVInt((int) metaWriter.size());
@ -953,9 +1017,13 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
}
}
private final ByteBuffersDataOutput suffixWriter = ByteBuffersDataOutput.newResettableInstance();
private final ByteBuffersDataOutput suffixLengthsWriter = ByteBuffersDataOutput.newResettableInstance();
private final BytesRefBuilder suffixWriter = new BytesRefBuilder();
private final ByteBuffersDataOutput statsWriter = ByteBuffersDataOutput.newResettableInstance();
private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance();
private final ByteBuffersDataOutput spareWriter = ByteBuffersDataOutput.newResettableInstance();
private byte[] spareBytes = BytesRef.EMPTY_BYTES;
private final LZ4.HighCompressionHashTable compressionHashTable = new LZ4.HighCompressionHashTable();
}
private boolean closed;

View File

@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blocktree;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.compress.LowercaseAsciiCompression;
/**
 * The encodings that may be applied to the suffix bytes of a block of terms. Every constant
 * carries the numeric {@link #code} that identifies it and knows how to decode its own format
 * through {@link #read(DataInput, byte[], int)}.
 */
enum CompressionAlgorithm {

  /** Bytes are copied from the input as-is. */
  NO_COMPRESSION(0x00) {
    @Override
    void read(DataInput in, byte[] out, int len) throws IOException {
      in.readBytes(out, 0, len);
    }
  },

  /** Bytes are decoded by {@link LowercaseAsciiCompression}. */
  LOWERCASE_ASCII(0x01) {
    @Override
    void read(DataInput in, byte[] out, int len) throws IOException {
      LowercaseAsciiCompression.decompress(in, out, len);
    }
  },

  /** Bytes are decoded by {@link org.apache.lucene.util.compress.LZ4}. */
  LZ4(0x02) {
    @Override
    void read(DataInput in, byte[] out, int len) throws IOException {
      // Fully qualified on purpose: inside this enum the simple name LZ4 denotes this constant.
      org.apache.lucene.util.compress.LZ4.decompress(in, len, out, 0);
    }
  };

  /** Lookup table indexed by {@link #code}; filled once when the enum is initialized. */
  private static final CompressionAlgorithm[] BY_CODE = new CompressionAlgorithm[3];

  static {
    for (CompressionAlgorithm algorithm : values()) {
      BY_CODE[algorithm.code] = algorithm;
    }
  }

  /** Numeric identifier of this algorithm. */
  public final int code;

  private CompressionAlgorithm(int code) {
    this.code = code;
  }

  /**
   * Look up a {@link CompressionAlgorithm} by its {@link CompressionAlgorithm#code}.
   *
   * @throws IllegalArgumentException if {@code code} is negative or not smaller than the size of
   *         the lookup table
   */
  static final CompressionAlgorithm byCode(int code) {
    if (code >= 0 && code < BY_CODE.length) {
      return BY_CODE[code];
    }
    throw new IllegalArgumentException("Illegal code for a compression algorithm: " + code);
  }

  /**
   * Decodes data from {@code in} into {@code out}, starting at index 0 of {@code out}.
   *
   * @param in  the input to read the encoded bytes from
   * @param out the destination array
   * @param len the length handed to the underlying decoder (for {@link #NO_COMPRESSION}, the
   *            number of bytes copied)
   */
  abstract void read(DataInput in, byte[] out, int len) throws IOException;
}

View File

@ -252,6 +252,7 @@ final class IntersectTermsEnum extends BaseTermsEnum {
while (true) {
final int savNextEnt = currentFrame.nextEnt;
final int savePos = currentFrame.suffixesReader.getPosition();
final int saveLengthPos = currentFrame.suffixLengthsReader.getPosition();
final int saveStartBytePos = currentFrame.startBytePos;
final int saveSuffix = currentFrame.suffix;
final long saveLastSubFP = currentFrame.lastSubFP;
@ -294,6 +295,7 @@ final class IntersectTermsEnum extends BaseTermsEnum {
currentFrame.startBytePos = saveStartBytePos;
currentFrame.suffix = saveSuffix;
currentFrame.suffixesReader.setPosition(savePos);
currentFrame.suffixLengthsReader.setPosition(saveLengthPos);
currentFrame.termState.termBlockOrd = saveTermBlockOrd;
System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
term.length = currentFrame.prefix + currentFrame.suffix;

View File

@ -18,13 +18,16 @@ package org.apache.lucene.codecs.blocktree;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.compress.LZ4;
import org.apache.lucene.util.fst.FST;
// TODO: can we share this with the frame in STE?
@ -48,6 +51,9 @@ final class IntersectTermsEnumFrame {
byte[] suffixBytes = new byte[128];
final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();
byte[] suffixLengthBytes;
final ByteArrayDataInput suffixLengthsReader;
byte[] statBytes = new byte[64];
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
@ -93,12 +99,21 @@ final class IntersectTermsEnumFrame {
int suffix;
private final IntersectTermsEnum ite;
private final int version;
public IntersectTermsEnumFrame(IntersectTermsEnum ite, int ord) throws IOException {
this.ite = ite;
this.ord = ord;
this.termState = ite.fr.parent.postingsReader.newTermState();
this.termState.totalTermFreq = -1;
this.version = ite.fr.parent.version;
if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
suffixLengthBytes = new byte[32];
suffixLengthsReader = new ByteArrayDataInput();
} else {
suffixLengthBytes = null;
suffixLengthsReader = suffixesReader;
}
}
void loadNextFloorBlock() throws IOException {
@ -170,6 +185,35 @@ final class IntersectTermsEnumFrame {
isLastInFloor = (code & 1) != 0;
// term suffixes:
if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
final long codeL = ite.in.readVLong();
isLeafBlock = (codeL & 0x04) != 0;
final int numSuffixBytes = (int) (codeL >>> 3);
if (suffixBytes.length < numSuffixBytes) {
suffixBytes = new byte[ArrayUtil.oversize(numSuffixBytes, 1)];
}
final CompressionAlgorithm compressionAlg;
try {
compressionAlg = CompressionAlgorithm.byCode((int) codeL & 0x03);
} catch (IllegalArgumentException e) {
throw new CorruptIndexException(e.getMessage(), ite.in, e);
}
compressionAlg.read(ite.in, suffixBytes, numSuffixBytes);
suffixesReader.reset(suffixBytes, 0, numSuffixBytes);
int numSuffixLengthBytes = ite.in.readVInt();
final boolean allEqual = (numSuffixLengthBytes & 0x01) != 0;
numSuffixLengthBytes >>>= 1;
if (suffixLengthBytes.length < numSuffixLengthBytes) {
suffixLengthBytes = new byte[ArrayUtil.oversize(numSuffixLengthBytes, 1)];
}
if (allEqual) {
Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ite.in.readByte());
} else {
LZ4.decompress(ite.in, numSuffixLengthBytes, suffixLengthBytes, 0);
}
suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes);
} else {
code = ite.in.readVInt();
isLeafBlock = (code & 1) != 0;
int numBytes = code >>> 1;
@ -178,13 +222,27 @@ final class IntersectTermsEnumFrame {
}
ite.in.readBytes(suffixBytes, 0, numBytes);
suffixesReader.reset(suffixBytes, 0, numBytes);
}
// stats
numBytes = ite.in.readVInt();
int numBytes = ite.in.readVInt();
if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
final boolean allOnes = (numBytes & 0x01) != 0;
numBytes >>>= 1;
if (statBytes.length < numBytes) {
statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
if (allOnes) {
Arrays.fill(statBytes, 0, numBytes, (byte) 1);
} else {
LZ4.decompress(ite.in, numBytes, statBytes, 0);
}
} else {
if (statBytes.length < numBytes) {
statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ite.in.readBytes(statBytes, 0, numBytes);
}
statsReader.reset(statBytes, 0, numBytes);
metaDataUpto = 0;
@ -221,7 +279,7 @@ final class IntersectTermsEnumFrame {
public void nextLeaf() {
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
suffix = suffixesReader.readVInt();
suffix = suffixLengthsReader.readVInt();
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
}
@ -229,7 +287,7 @@ final class IntersectTermsEnumFrame {
public boolean nextNonLeaf() {
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
final int code = suffixesReader.readVInt();
final int code = suffixLengthsReader.readVInt();
suffix = code >>> 1;
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
@ -239,7 +297,7 @@ final class IntersectTermsEnumFrame {
return false;
} else {
// A sub-block; make sub-FP absolute:
lastSubFP = fp - suffixesReader.readVLong();
lastSubFP = fp - suffixLengthsReader.readVLong();
return true;
}
}

View File

@ -21,11 +21,13 @@ import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.compress.LZ4;
import org.apache.lucene.util.fst.FST;
final class SegmentTermsEnumFrame {
@ -44,10 +46,14 @@ final class SegmentTermsEnumFrame {
long fp;
long fpOrig;
long fpEnd;
long totalSuffixBytes, totalStatsBytes; // for stats
byte[] suffixBytes = new byte[128];
final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();
byte[] suffixLengthBytes;
final ByteArrayDataInput suffixLengthsReader;
byte[] statBytes = new byte[64];
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
@ -90,12 +96,21 @@ final class SegmentTermsEnumFrame {
final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
private final SegmentTermsEnum ste;
private final int version;
public SegmentTermsEnumFrame(SegmentTermsEnum ste, int ord) throws IOException {
this.ste = ste;
this.ord = ord;
this.state = ste.fr.parent.postingsReader.newTermState();
this.state.totalTermFreq = -1;
this.version = ste.fr.parent.version;
if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
suffixLengthBytes = new byte[32];
suffixLengthsReader = new ByteArrayDataInput();
} else {
suffixLengthBytes = null;
suffixLengthsReader = suffixesReader;
}
}
public void setFloorData(ByteArrayDataInput in, BytesRef source) {
@ -161,7 +176,36 @@ final class SegmentTermsEnumFrame {
// instead of linear scan to find target term; eg
// we could have simple array of offsets
final long startSuffixFP = ste.in.getFilePointer();
// term suffixes:
if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
final long codeL = ste.in.readVLong();
isLeafBlock = (codeL & 0x04) != 0;
final int numSuffixBytes = (int) (codeL >>> 3);
if (suffixBytes.length < numSuffixBytes) {
suffixBytes = new byte[ArrayUtil.oversize(numSuffixBytes, 1)];
}
try {
compressionAlg = CompressionAlgorithm.byCode((int) codeL & 0x03);
} catch (IllegalArgumentException e) {
throw new CorruptIndexException(e.getMessage(), ste.in, e);
}
compressionAlg.read(ste.in, suffixBytes, numSuffixBytes);
suffixesReader.reset(suffixBytes, 0, numSuffixBytes);
int numSuffixLengthBytes = ste.in.readVInt();
final boolean allEqual = (numSuffixLengthBytes & 0x01) != 0;
numSuffixLengthBytes >>>= 1;
if (suffixLengthBytes.length < numSuffixLengthBytes) {
suffixLengthBytes = new byte[ArrayUtil.oversize(numSuffixLengthBytes, 1)];
}
if (allEqual) {
Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ste.in.readByte());
} else {
LZ4.decompress(ste.in, numSuffixLengthBytes, suffixLengthBytes, 0);
}
suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes);
} else {
code = ste.in.readVInt();
isLeafBlock = (code & 1) != 0;
int numBytes = code >>> 1;
@ -170,6 +214,8 @@ final class SegmentTermsEnumFrame {
}
ste.in.readBytes(suffixBytes, 0, numBytes);
suffixesReader.reset(suffixBytes, 0, numBytes);
}
totalSuffixBytes = ste.in.getFilePointer() - startSuffixFP;
/*if (DEBUG) {
if (arc == null) {
@ -180,11 +226,26 @@ final class SegmentTermsEnumFrame {
}*/
// stats
numBytes = ste.in.readVInt();
final long startStatsFP = ste.in.getFilePointer();
int numBytes = ste.in.readVInt();
if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
final boolean allOnes = (numBytes & 0x01) != 0;
numBytes >>>= 1;
if (statBytes.length < numBytes) {
statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
if (allOnes) {
Arrays.fill(statBytes, 0, numBytes, (byte) 1);
} else {
LZ4.decompress(ste.in, numBytes, statBytes, 0);
}
} else {
if (statBytes.length < numBytes) {
statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ste.in.readBytes(statBytes, 0, numBytes);
}
totalStatsBytes = ste.in.getFilePointer() - startStatsFP;
statsReader.reset(statBytes, 0, numBytes);
metaDataUpto = 0;
@ -272,7 +333,7 @@ final class SegmentTermsEnumFrame {
//if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
suffix = suffixesReader.readVInt();
suffix = suffixLengthsReader.readVInt();
startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix);
ste.term.grow(ste.term.length());
@ -296,7 +357,7 @@ final class SegmentTermsEnumFrame {
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
final int code = suffixesReader.readVInt();
final int code = suffixLengthsReader.readVInt();
suffix = code >>> 1;
startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix);
@ -311,7 +372,7 @@ final class SegmentTermsEnumFrame {
} else {
// A sub-block; make sub-FP absolute:
ste.termExists = false;
subCode = suffixesReader.readVLong();
subCode = suffixLengthsReader.readVLong();
lastSubFP = fp - subCode;
//if (DEBUG) {
//System.out.println(" lastSubFP=" + lastSubFP);
@ -458,10 +519,10 @@ final class SegmentTermsEnumFrame {
while(true) {
assert nextEnt < entCount;
nextEnt++;
final int code = suffixesReader.readVInt();
final int code = suffixLengthsReader.readVInt();
suffixesReader.skipBytes(code >>> 1);
if ((code & 1) != 0) {
final long subCode = suffixesReader.readVLong();
final long subCode = suffixLengthsReader.readVLong();
if (targetSubCode == subCode) {
//if (DEBUG) System.out.println(" match!");
lastSubFP = subFP;
@ -481,6 +542,7 @@ final class SegmentTermsEnumFrame {
private int startBytePos;
private int suffix;
private long subCode;
CompressionAlgorithm compressionAlg = CompressionAlgorithm.NO_COMPRESSION;
// for debugging
/*
@ -517,11 +579,13 @@ final class SegmentTermsEnumFrame {
assert prefixMatches(target);
// TODO: binary search when all terms have the same length, which is common for ID fields,
// which are also the most sensitive to lookup performance?
// Loop over each entry (term or sub-block) in this block:
do {
nextEnt++;
suffix = suffixesReader.readVInt();
suffix = suffixLengthsReader.readVInt();
// if (DEBUG) {
// BytesRef suffixBytesRef = new BytesRef();
@ -606,7 +670,7 @@ final class SegmentTermsEnumFrame {
nextEnt++;
final int code = suffixesReader.readVInt();
final int code = suffixLengthsReader.readVInt();
suffix = code >>> 1;
//if (DEBUG) {
@ -625,7 +689,7 @@ final class SegmentTermsEnumFrame {
state.termBlockOrd++;
subCode = 0;
} else {
subCode = suffixesReader.readVLong();
subCode = suffixLengthsReader.readVLong();
lastSubFP = fp - subCode;
}

View File

@ -75,11 +75,25 @@ public class Stats {
/** Total number of bytes used to store term suffixes. */
public long totalBlockSuffixBytes;
/**
* Number of times each compression method has been used.
* 0 = uncompressed
* 1 = lowercase_ascii
* 2 = LZ4
*/
public final long[] compressionAlgorithms = new long[3];
/** Total number of suffix bytes before compression. */
public long totalUncompressedBlockSuffixBytes;
/** Total number of bytes used to store term stats (not
* including what the {@link PostingsReaderBase}
* stores). */
public long totalBlockStatsBytes;
/** Total number of stats bytes before compression. */
public long totalUncompressedBlockStatsBytes;
/** Total bytes stored by the {@link PostingsReaderBase},
* plus the other few vInts stored in the frame. */
public long totalBlockOtherBytes;
@ -111,8 +125,14 @@ public class Stats {
}
blockCountByPrefixLen[frame.prefix]++;
startBlockCount++;
totalBlockSuffixBytes += frame.suffixesReader.length();
totalBlockStatsBytes += frame.statsReader.length();
totalBlockSuffixBytes += frame.totalSuffixBytes;
totalUncompressedBlockSuffixBytes += frame.suffixesReader.length();
if (frame.suffixesReader != frame.suffixLengthsReader) {
totalUncompressedBlockSuffixBytes += frame.suffixLengthsReader.length();
}
totalBlockStatsBytes += frame.totalStatsBytes;
compressionAlgorithms[frame.compressionAlg.code]++;
totalUncompressedBlockStatsBytes += frame.statsReader.length();
}
void endBlock(SegmentTermsEnumFrame frame) {
@ -129,7 +149,7 @@ public class Stats {
throw new IllegalStateException();
}
endBlockCount++;
final long otherBytes = frame.fpEnd - frame.fp - frame.suffixesReader.length() - frame.statsReader.length();
final long otherBytes = frame.fpEnd - frame.fp - frame.totalSuffixBytes - frame.totalStatsBytes;
assert otherBytes > 0 : "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd;
totalBlockOtherBytes += otherBytes;
}
@ -167,8 +187,23 @@ public class Stats {
out.println(" " + floorBlockCount + " floor blocks");
out.println(" " + (totalBlockCount-floorSubBlockCount) + " non-floor blocks");
out.println(" " + floorSubBlockCount + " floor sub-blocks");
out.println(" " + totalBlockSuffixBytes + " term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockSuffixBytes)/totalBlockCount) + " suffix-bytes/block)" : ""));
out.println(" " + totalBlockStatsBytes + " term stats bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : ""));
out.println(" " + totalUncompressedBlockSuffixBytes + " term suffix bytes before compression" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockSuffixBytes)/totalBlockCount) + " suffix-bytes/block)" : ""));
StringBuilder compressionCounts = new StringBuilder();
for (int code = 0; code < compressionAlgorithms.length; ++code) {
if (compressionAlgorithms[code] == 0) {
continue;
}
if (compressionCounts.length() > 0) {
compressionCounts.append(", ");
}
compressionCounts.append(CompressionAlgorithm.byCode(code));
compressionCounts.append(": ");
compressionCounts.append(compressionAlgorithms[code]);
}
out.println(" " + totalBlockSuffixBytes + " compressed term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.2f", ((double) totalBlockSuffixBytes)/totalUncompressedBlockSuffixBytes) +
" compression ratio - compression count by algorithm: " + compressionCounts : "") + ")");
out.println(" " + totalUncompressedBlockStatsBytes + " term stats bytes before compression" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalUncompressedBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : ""));
out.println(" " + totalBlockStatsBytes + " compressed term stats bytes (" + String.format(Locale.ROOT, "%.2f", (double)totalBlockStatsBytes / totalUncompressedBlockStatsBytes) + " compression ratio)");
out.println(" " + totalBlockOtherBytes + " other bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockOtherBytes)/totalBlockCount) + " other-bytes/block)" : ""));
if (totalBlockCount != 0) {
out.println(" by prefix length:");

View File

@ -27,6 +27,7 @@ import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.compress.LZ4;
/**
* A compression mode. Tells how much effort should be spent on compression and
@ -152,10 +153,10 @@ public abstract class CompressionMode {
private static final class LZ4FastCompressor extends Compressor {
private final LZ4.HashTable ht;
private final LZ4.FastCompressionHashTable ht;
LZ4FastCompressor() {
ht = new LZ4.HashTable();
ht = new LZ4.FastCompressionHashTable();
}
@Override
@ -172,16 +173,16 @@ public abstract class CompressionMode {
private static final class LZ4HighCompressor extends Compressor {
private final LZ4.HCHashTable ht;
private final LZ4.HighCompressionHashTable ht;
LZ4HighCompressor() {
ht = new LZ4.HCHashTable();
ht = new LZ4.HighCompressionHashTable();
}
@Override
public void compress(byte[] bytes, int off, int len, DataOutput out)
throws IOException {
LZ4.compressHC(bytes, off, len, out, ht);
LZ4.compress(bytes, off, len, out, ht);
}
@Override

View File

@ -1,544 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.compressing;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.packed.PackedInts;
/**
* LZ4 compression and decompression routines.
*
* http://code.google.com/p/lz4/
* http://fastcompression.blogspot.fr/p/lz4.html
*/
final class LZ4 {
// Static utility class: not instantiable.
private LZ4() {}
// log2 of the memory budget, in bytes, of the hash table used by compress():
// HashTable.reset() sizes the table so that it holds at most 2^(MEMORY_USAGE + 3) bits.
static final int MEMORY_USAGE = 14;
static final int MIN_MATCH = 4; // minimum length of a match
static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference
static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals
static final int HASH_LOG_HC = 15; // log size of the dictionary for compressHC
static final int HASH_TABLE_SIZE_HC = 1 << HASH_LOG_HC; // number of entries of HCHashTable.hashTable
static final int OPTIMAL_ML = 0x0F + 4 - 1; // match length that doesn't require an additional byte
/**
 * Multiplicative hash of a 4-byte sequence packed into an int. Only the top
 * {@code hashBits} bits of the product are kept.
 */
private static int hash(int i, int hashBits) {
  final int scrambled = i * -1640531535;
  final int shift = 32 - hashBits;
  return scrambled >>> shift;
}
/** Hash used by the high-compression table: {@link #hash} restricted to {@code HASH_LOG_HC} bits. */
private static int hashHC(int i) {
  final int hashBits = HASH_LOG_HC;
  return hash(i, hashBits);
}
/** Reads the 4 bytes at {@code buf[i:i+4)} as a big-endian int. */
private static int readInt(byte[] buf, int i) {
  final int b0 = buf[i] & 0xFF;
  final int b1 = buf[i + 1] & 0xFF;
  final int b2 = buf[i + 2] & 0xFF;
  final int b3 = buf[i + 3] & 0xFF;
  return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
}
/** Returns whether the 4 bytes starting at {@code i} equal the 4 bytes starting at {@code j}. */
private static boolean readIntEquals(byte[] buf, int i, int j) {
  final int left = readInt(buf, i);
  final int right = readInt(buf, j);
  return left == right;
}
/**
 * Returns the number of leading bytes shared by {@code b[o1:limit)} and
 * {@code b[o2:limit)}. {@code o1} must be strictly less than {@code o2}.
 */
private static int commonBytes(byte[] b, int o1, int o2, int limit) {
  assert o1 < o2;
  // Both ranges end at the same index but start at different ones, so their
  // lengths differ and mismatch() can never return -1.
  final int firstDifference = Arrays.mismatch(b, o1, limit, b, o2, limit);
  return firstDifference;
}
/**
 * Counts how many bytes immediately before {@code o1} and {@code o2} are equal,
 * walking backwards and never reading below {@code l1} (resp. {@code l2}).
 */
private static int commonBytesBackward(byte[] b, int o1, int o2, int l1, int l2) {
  int matched = 0;
  for (int i = o1 - 1, j = o2 - 1; i >= l1 && j >= l2 && b[i] == b[j]; --i, --j) {
    matched++;
  }
  return matched;
}
/**
 * Decompress at least <code>decompressedLen</code> bytes into
 * <code>dest[dOff:]</code>. Please note that <code>dest</code> must be large
 * enough to be able to hold <b>all</b> decompressed data (meaning that you
 * need to know the total decompressed length).
 *
 * @return the offset in <code>dest</code> right after the last written byte
 */
public static int decompress(DataInput compressed, int decompressedLen, byte[] dest, int dOff) throws IOException {
  final int destEnd = dest.length;

  do {
    // token: the high nibble is the literal length, the low nibble the match length minus MIN_MATCH
    final int token = compressed.readByte() & 0xFF;

    // literals
    int literalLen = token >>> 4;
    if (literalLen != 0) {
      if (literalLen == 0x0F) {
        // the length continues on the following bytes, 0xFF meaning "keep reading"
        int extra;
        do {
          extra = compressed.readByte() & 0xFF;
          literalLen += extra;
        } while (extra == 0xFF);
      }
      compressed.readBytes(dest, dOff, literalLen);
      dOff += literalLen;
    }

    if (dOff >= decompressedLen) {
      break;
    }

    // matches: 2-byte little-endian distance, then the optional extended length
    final int decLow = compressed.readByte() & 0xFF;
    final int decHigh = compressed.readByte() & 0xFF;
    final int matchDec = decLow | (decHigh << 8);
    assert matchDec > 0;

    int matchLen = token & 0x0F;
    if (matchLen == 0x0F) {
      int extra;
      do {
        extra = compressed.readByte() & 0xFF;
        matchLen += extra;
      } while (extra == 0xFF);
    }
    matchLen += MIN_MATCH;

    // copying a multiple of 8 bytes can make decompression from 5% to 10% faster
    final int fastLen = (matchLen + 7) & 0xFFFFFFF8;
    final boolean bulkCopy = matchDec >= matchLen && dOff + fastLen <= destEnd;
    if (bulkCopy) {
      // no overlap -> arraycopy
      System.arraycopy(dest, dOff - matchDec, dest, dOff, fastLen);
      dOff += matchLen;
    } else {
      // overlap -> naive incremental copy
      final int matchEnd = dOff + matchLen;
      for (int ref = dOff - matchDec; dOff < matchEnd; ++ref, ++dOff) {
        dest[dOff] = dest[ref];
      }
    }
  } while (dOff < decompressedLen);

  return dOff;
}
/**
 * Writes {@code l} as a run of {@code 0xFF} bytes (each worth 255) followed by
 * one final byte holding the remainder.
 */
private static void encodeLen(int l, DataOutput out) throws IOException {
  int remaining = l;
  for (; remaining >= 0xFF; remaining -= 0xFF) {
    out.writeByte((byte) 0xFF);
  }
  out.writeByte((byte) remaining);
}
/**
 * Writes the token byte, the extended literal length if it does not fit in the
 * token nibble, then the {@code literalLen} literal bytes starting at {@code anchor}.
 */
private static void encodeLiterals(byte[] bytes, int token, int anchor, int literalLen, DataOutput out) throws IOException {
  out.writeByte((byte) token);

  // encode literal length
  final boolean lengthOverflowsToken = literalLen >= 0x0F;
  if (lengthOverflowsToken) {
    encodeLen(literalLen - 0x0F, out);
  }

  // encode literals
  out.writeBytes(bytes, anchor, literalLen);
}
/** Writes the trailing literal-only sequence: its token carries no match length. */
private static void encodeLastLiterals(byte[] bytes, int anchor, int literalLen, DataOutput out) throws IOException {
  final int literalNibble = literalLen < 0x0F ? literalLen : 0x0F;
  encodeLiterals(bytes, literalNibble << 4, anchor, literalLen, out);
}
/**
 * Writes one full sequence: the literals in {@code bytes[anchor:matchOff)}
 * followed by a match of {@code matchLen} bytes at {@code matchOff} that refers
 * back to {@code matchRef}.
 */
private static void encodeSequence(byte[] bytes, int anchor, int matchRef, int matchOff, int matchLen, DataOutput out) throws IOException {
  assert matchLen >= MIN_MATCH;
  final int literalLen = matchOff - anchor;

  // encode token
  final int literalNibble = Math.min(literalLen, 0x0F);
  final int matchNibble = Math.min(matchLen - MIN_MATCH, 0x0F);
  encodeLiterals(bytes, (literalNibble << 4) | matchNibble, anchor, literalLen, out);

  // encode match dec, low byte first
  final int matchDec = matchOff - matchRef;
  assert matchDec > 0 && matchDec < 1 << 16;
  out.writeByte((byte) matchDec);
  out.writeByte((byte) (matchDec >>> 8));

  // encode match len
  final boolean lengthOverflowsToken = matchLen >= MIN_MATCH + 0x0F;
  if (lengthOverflowsToken) {
    encodeLen(matchLen - 0x0F - MIN_MATCH, out);
  }
}
/**
 * Hash table used by {@code compress}: a packed array mapping the hash of a
 * 4-byte sequence to an offset relative to the start of the input.
 */
static final class HashTable {
  private int hashLog;
  private PackedInts.Mutable hashTable;

  /** Prepares the table for an input of {@code len} bytes, reusing the current array when it is large enough. */
  void reset(int len) {
    final int bitsPerOffset = PackedInts.bitsRequired(len - LAST_LITERALS);
    final int bitsPerOffsetLog = 32 - Integer.numberOfLeadingZeros(bitsPerOffset - 1);
    hashLog = MEMORY_USAGE + 3 - bitsPerOffsetLog;
    final int requiredSize = 1 << hashLog;
    final boolean reusable = hashTable != null
        && hashTable.size() >= requiredSize
        && hashTable.getBitsPerValue() >= bitsPerOffset;
    if (reusable) {
      hashTable.clear();
    } else {
      hashTable = PackedInts.getMutable(requiredSize, bitsPerOffset, PackedInts.DEFAULT);
    }
  }
}
/**
* Compress <code>bytes[off:off+len]</code> into <code>out</code> using
* at most 16KB of memory. <code>ht</code> shouldn't be shared across threads
* but can safely be reused.
*/
public static void compress(byte[] bytes, int off, int len, DataOutput out, HashTable ht) throws IOException {
final int base = off;
final int end = off + len;
// anchor is the start of the pending literals; the search starts one byte after it
int anchor = off++;
// inputs that are too short to hold a match are written as a single run of literals
if (len > LAST_LITERALS + MIN_MATCH) {
final int limit = end - LAST_LITERALS;
final int matchLimit = limit - MIN_MATCH;
ht.reset(len);
final int hashLog = ht.hashLog;
final PackedInts.Mutable hashTable = ht.hashTable;
main:
while (off <= limit) {
// find a match
int ref;
while (true) {
if (off >= matchLimit) {
break main;
}
final int v = readInt(bytes, off);
final int h = hash(v, hashLog);
// the table stores offsets relative to base; read the candidate before overwriting the slot
ref = base + (int) hashTable.get(h);
assert PackedInts.bitsRequired(off - base) <= hashTable.getBitsPerValue();
hashTable.set(h, off - base);
// the table is lossy: check the distance and that the 4 bytes really are equal
if (off - ref < MAX_DISTANCE && readInt(bytes, ref) == v) {
break;
}
++off;
}
// compute match length
final int matchLen = MIN_MATCH + commonBytes(bytes, ref + MIN_MATCH, off + MIN_MATCH, limit);
encodeSequence(bytes, anchor, ref, off, matchLen, out);
off += matchLen;
anchor = off;
}
}
// last literals
final int literalLen = end - anchor;
assert literalLen >= LAST_LITERALS || literalLen == len;
encodeLastLiterals(bytes, anchor, end - anchor, out);
}
/** Mutable description of a match: where it starts, what it refers to and how long it is. */
private static class Match {
  int start, ref, len;

  /** Drops the first {@code correction} bytes of this match. */
  void fix(int correction) {
    len -= correction;
    ref += correction;
    start += correction;
  }

  /** Returns the exclusive end offset of this match. */
  int end() {
    final int exclusiveEnd = start + len;
    return exclusiveEnd;
  }
}
/** Copies every field of {@code m1} into {@code m2}. */
private static void copyTo(Match m1, Match m2) {
  m2.start = m1.start;
  m2.ref = m1.ref;
  m2.len = m1.len;
}
/**
* Hash table used by compressHC. For every hash it remembers the last offset
* that was inserted, and for every offset (modulo MAX_DISTANCE) the distance to
* the previously inserted offset that had the same hash, which makes it possible
* to walk back through several candidates for a given 4-byte sequence.
*/
static final class HCHashTable {
// maximum number of candidates examined by a single search
static final int MAX_ATTEMPTS = 256;
// used to map an offset to its slot in chainTable
static final int MASK = MAX_DISTANCE - 1;
// first offset that has not been inserted into the table yet
int nextToUpdate;
// start offset of the input, candidates below it are ignored
private int base;
// hash -> last inserted offset with this hash, -1 after reset()
private final int[] hashTable;
// offset & MASK -> distance to the previous offset with the same hash, read as an unsigned short
private final short[] chainTable;
HCHashTable() {
hashTable = new int[HASH_TABLE_SIZE_HC];
chainTable = new short[MAX_DISTANCE];
}
// Clears both tables and restarts insertion at the given offset.
private void reset(int base) {
this.base = base;
nextToUpdate = base;
Arrays.fill(hashTable, -1);
Arrays.fill(chainTable, (short) 0);
}
// Returns the last inserted offset whose hash equals the hash of the 4 bytes at off.
private int hashPointer(byte[] bytes, int off) {
final int v = readInt(bytes, off);
final int h = hashHC(v);
return hashTable[h];
}
// Returns the offset that precedes off in its chain.
private int next(int off) {
return off - (chainTable[off & MASK] & 0xFFFF);
}
// Inserts off at the head of the chain of its hash.
private void addHash(byte[] bytes, int off) {
final int v = readInt(bytes, off);
final int h = hashHC(v);
int delta = off - hashTable[h];
assert delta > 0 : delta;
// distances are stored on 16 bits, so larger ones are capped
if (delta >= MAX_DISTANCE) {
delta = MAX_DISTANCE - 1;
}
chainTable[off & MASK] = (short) delta;
hashTable[h] = off;
}
// Inserts every offset from nextToUpdate (inclusive) to off (exclusive).
void insert(int off, byte[] bytes) {
for (; nextToUpdate < off; ++nextToUpdate) {
addHash(bytes, nextToUpdate);
}
}
// Inserts all offsets before off, then looks for the longest match starting at off
// among at most MAX_ATTEMPTS candidates. The result is stored into match; returns
// false if no match was found.
boolean insertAndFindBestMatch(byte[] buf, int off, int matchLimit, Match match) {
match.start = off;
match.len = 0;
int delta = 0;
int repl = 0;
insert(off, buf);
int ref = hashPointer(buf, off);
if (ref >= off - 4 && ref <= off && ref >= base) { // potential repetition
if (readIntEquals(buf, ref, off)) { // confirmed
delta = off - ref;
repl = match.len = MIN_MATCH + commonBytes(buf, ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
match.ref = ref;
}
ref = next(ref);
}
for (int i = 0; i < MAX_ATTEMPTS; ++i) {
// stop when the candidate is out of the window or before the start of the input
if (ref < Math.max(base, off - MAX_DISTANCE + 1) || ref > off) {
break;
}
// cheap filter: a longer match must agree on the byte at index match.len
if (buf[ref + match.len] == buf[off + match.len] && readIntEquals(buf, ref, off)) {
final int matchLen = MIN_MATCH + commonBytes(buf, ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
if (matchLen > match.len) {
match.ref = ref;
match.len = matchLen;
}
}
ref = next(ref);
}
// a short-distance repetition was found: fill in the tables for the repeated region directly
if (repl != 0) {
int ptr = off;
final int end = off + repl - (MIN_MATCH - 1);
while (ptr < end - delta) {
chainTable[ptr & MASK] = (short) delta; // pre load
++ptr;
}
do {
chainTable[ptr & MASK] = (short) delta;
hashTable[hashHC(readInt(buf, ptr))] = ptr;
++ptr;
} while (ptr < end);
nextToUpdate = end;
}
return match.len != 0;
}
// Inserts all offsets before off, then looks for a match that contains off, may start
// as early as startLimit and is longer than minLen. Candidates are extended both
// forward and backward. The result is stored into match; returns false if no
// match longer than minLen was found.
boolean insertAndFindWiderMatch(byte[] buf, int off, int startLimit, int matchLimit, int minLen, Match match) {
match.len = minLen;
insert(off, buf);
final int delta = off - startLimit;
int ref = hashPointer(buf, off);
for (int i = 0; i < MAX_ATTEMPTS; ++i) {
if (ref < Math.max(base, off - MAX_DISTANCE + 1) || ref > off) {
break;
}
if (buf[ref - delta + match.len] == buf[startLimit + match.len]
&& readIntEquals(buf, ref, off)) {
final int matchLenForward = MIN_MATCH + commonBytes(buf, ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
final int matchLenBackward = commonBytesBackward(buf, ref, off, base, startLimit);
final int matchLen = matchLenBackward + matchLenForward;
if (matchLen > match.len) {
match.len = matchLen;
match.ref = ref - matchLenBackward;
match.start = off - matchLenBackward;
}
}
ref = next(ref);
}
return match.len > minLen;
}
}
/**
* Compress <code>bytes[off:off+len]</code> into <code>out</code>. Compared to
* {@link LZ4#compress(byte[], int, int, DataOutput, HashTable)}, this method
* is slower and uses more memory (~ 256KB per thread) but should provide
* better compression ratios (especially on large inputs) because it chooses
* the best match among up to 256 candidates and then performs trade-offs to
* fix overlapping matches. <code>ht</code> shouldn't be shared across threads
* but can safely be reused.
*/
public static void compressHC(byte[] src, int srcOff, int srcLen, DataOutput out, HCHashTable ht) throws IOException {
final int srcEnd = srcOff + srcLen;
final int matchLimit = srcEnd - LAST_LITERALS;
final int mfLimit = matchLimit - MIN_MATCH;
int sOff = srcOff;
// anchor is the start of the pending literals; the search starts one byte after it
int anchor = sOff++;
ht.reset(srcOff);
// match1, match2 and match3 are up to three ascending candidate matches,
// match0 is a backup of the first match that was found
final Match match0 = new Match();
final Match match1 = new Match();
final Match match2 = new Match();
final Match match3 = new Match();
main:
while (sOff <= mfLimit) {
if (!ht.insertAndFindBestMatch(src, sOff, matchLimit, match1)) {
++sOff;
continue;
}
// saved, in case we would skip too much
copyTo(match1, match0);
search2:
while (true) {
assert match1.start >= anchor;
if (match1.end() >= mfLimit
|| !ht.insertAndFindWiderMatch(src, match1.end() - 2, match1.start + 1, matchLimit, match1.len, match2)) {
// no better match
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
anchor = sOff = match1.end();
continue main;
}
if (match0.start < match1.start) {
if (match2.start < match1.start + match0.len) { // empirical
copyTo(match0, match1);
}
}
assert match2.start > match1.start;
if (match2.start - match1.start < 3) { // First Match too small : removed
copyTo(match2, match1);
continue search2;
}
search3:
while (true) {
// match2 starts close to match1: shorten match1 and move the start of match2 accordingly
if (match2.start - match1.start < OPTIMAL_ML) {
int newMatchLen = match1.len;
if (newMatchLen > OPTIMAL_ML) {
newMatchLen = OPTIMAL_ML;
}
if (match1.start + newMatchLen > match2.end() - MIN_MATCH) {
newMatchLen = match2.start - match1.start + match2.len - MIN_MATCH;
}
final int correction = newMatchLen - (match2.start - match1.start);
if (correction > 0) {
match2.fix(correction);
}
}
if (match2.start + match2.len >= mfLimit
|| !ht.insertAndFindWiderMatch(src, match2.end() - 3, match2.start, matchLimit, match2.len, match3)) {
// no better match -> 2 sequences to encode
if (match2.start < match1.end()) {
match1.len = match2.start - match1.start;
}
// encode seq 1
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
anchor = sOff = match1.end();
// encode seq 2
encodeSequence(src, anchor, match2.ref, match2.start, match2.len, out);
anchor = sOff = match2.end();
continue main;
}
if (match3.start < match1.end() + 3) { // Not enough space for match 2 : remove it
if (match3.start >= match1.end()) { // can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1
if (match2.start < match1.end()) {
final int correction = match1.end() - match2.start;
match2.fix(correction);
if (match2.len < MIN_MATCH) {
copyTo(match3, match2);
}
}
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
anchor = sOff = match1.end();
copyTo(match3, match1);
copyTo(match2, match0);
continue search2;
}
copyTo(match3, match2);
continue search3;
}
// OK, now we have 3 ascending matches; let's write at least the first one
if (match2.start < match1.end()) {
if (match2.start - match1.start < 0x0F) {
if (match1.len > OPTIMAL_ML) {
match1.len = OPTIMAL_ML;
}
if (match1.end() > match2.end() - MIN_MATCH) {
match1.len = match2.end() - match1.start - MIN_MATCH;
}
final int correction = match1.end() - match2.start;
match2.fix(correction);
} else {
match1.len = match2.start - match1.start;
}
}
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
anchor = sOff = match1.end();
// shift the candidates: match2 becomes match1 and match3 becomes match2
copyTo(match2, match1);
copyTo(match3, match2);
continue search3;
}
}
}
// last literals
encodeLastLiterals(src, anchor, srcEnd - anchor, out);
}
}

View File

@ -386,7 +386,9 @@ public final class Lucene84PostingsFormat extends PostingsFormat {
// Increment version to change it
final static int VERSION_START = 0;
final static int VERSION_CURRENT = VERSION_START;
// Better compression of the terms dictionary in case most terms have a docFreq of 1
final static int VERSION_COMPRESSED_TERMS_DICT_IDS = 1;
final static int VERSION_CURRENT = VERSION_COMPRESSED_TERMS_DICT_IDS;
private final int minTermBlockSize;
private final int maxTermBlockSize;

View File

@ -23,6 +23,7 @@ import static org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat.MAX_SKIP_
import static org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat.VERSION_COMPRESSED_TERMS_DICT_IDS;
import static org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat.VERSION_START;
@ -44,6 +45,7 @@ import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
@ -179,18 +181,37 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {
termState.payStartFP = 0;
}
if (version >= VERSION_COMPRESSED_TERMS_DICT_IDS) {
final long l = in.readVLong();
if ((l & 0x01) == 0) {
termState.docStartFP += l >>> 1;
if (termState.docFreq == 1) {
termState.singletonDocID = in.readVInt();
} else {
termState.singletonDocID = -1;
}
} else {
assert absolute == false;
assert termState.singletonDocID != -1;
termState.singletonDocID += BitUtil.zigZagDecode(l >>> 1);
}
} else {
termState.docStartFP += in.readVLong();
}
if (fieldHasPositions) {
termState.posStartFP += in.readVLong();
if (fieldHasOffsets || fieldHasPayloads) {
termState.payStartFP += in.readVLong();
}
}
if (version < VERSION_COMPRESSED_TERMS_DICT_IDS) {
if (termState.docFreq == 1) {
termState.singletonDocID = in.readVInt();
} else {
termState.singletonDocID = -1;
}
}
if (fieldHasPositions) {
if (termState.totalTermFreq > BLOCK_SIZE) {
termState.lastPosBlockOffset = in.readVLong();
@ -203,6 +224,7 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {
} else {
termState.skipOffset = -1;
}
}
@Override

View File

@ -41,6 +41,7 @@ import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
@ -461,17 +462,28 @@ public final class Lucene84PostingsWriter extends PushPostingsWriterBase {
IntBlockTermState state = (IntBlockTermState)_state;
if (absolute) {
lastState = emptyState;
assert lastState.docStartFP == 0;
}
out.writeVLong(state.docStartFP - lastState.docStartFP);
if (lastState.singletonDocID != -1 && state.singletonDocID != -1 && state.docStartFP == lastState.docStartFP) {
// With runs of rare values such as ID fields, the increment of pointers in the docs file is often 0.
// Furthermore some ID schemes like auto-increment IDs or Flake IDs are monotonic, so we encode the delta
// between consecutive doc IDs to save space.
final long delta = (long) state.singletonDocID - lastState.singletonDocID;
out.writeVLong((BitUtil.zigZagEncode(delta) << 1) | 0x01);
} else {
out.writeVLong((state.docStartFP - lastState.docStartFP) << 1);
if (state.singletonDocID != -1) {
out.writeVInt(state.singletonDocID);
}
}
if (writePositions) {
out.writeVLong(state.posStartFP - lastState.posStartFP);
if (writePayloads || writeOffsets) {
out.writeVLong(state.payStartFP - lastState.payStartFP);
}
}
if (state.singletonDocID != -1) {
out.writeVInt(state.singletonDocID);
}
if (writePositions) {
if (state.lastPosBlockOffset != -1) {
out.writeVLong(state.lastPosBlockOffset);

View File

@ -0,0 +1,443 @@
/*
* LZ4 Library
* Copyright (c) 2011-2016, Yann Collet
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.apache.lucene.util.compress;
import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.packed.PackedInts;
/**
* LZ4 compression and decompression routines.
*
* https://github.com/lz4/lz4/tree/dev/lib
* http://fastcompression.blogspot.fr/p/lz4.html
*
* The high-compression option is a simpler version of the one in the original
* algorithm: it only retains a better hash table that remembers more
* occurrences of previous 4-byte sequences, and drops all the logic that
* handles overlapping matches.
*/
public final class LZ4 {
// Static utility class: not instantiable.
private LZ4() {}
// log2 of the memory budget, in bytes, of FastCompressionHashTable: its reset()
// sizes the table so that it holds at most 2^(MEMORY_USAGE + 3) bits.
static final int MEMORY_USAGE = 14;
static final int MIN_MATCH = 4; // minimum length of a match
static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference
static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals
static final int HASH_LOG_HC = 15; // log size of the dictionary for HighCompressionHashTable
static final int HASH_TABLE_SIZE_HC = 1 << HASH_LOG_HC; // number of entries of HighCompressionHashTable.hashTable
// NOTE(review): no longer referenced inside this class - confirm whether other classes of the package still use it
static final int OPTIMAL_ML = 0x0F + 4 - 1; // match length that doesn't require an additional byte
/**
 * Multiplicative hash of a 4-byte sequence packed into an int. Only the top
 * {@code hashBits} bits of the product are kept.
 */
private static int hash(int i, int hashBits) {
  final int scrambled = i * -1640531535;
  final int shift = 32 - hashBits;
  return scrambled >>> shift;
}
/** Hash used by the high-compression table: {@link #hash} restricted to {@code HASH_LOG_HC} bits. */
private static int hashHC(int i) {
  final int hashBits = HASH_LOG_HC;
  return hash(i, hashBits);
}
/** Reads the 4 bytes at {@code buf[i:i+4)} as a big-endian int. */
private static int readInt(byte[] buf, int i) {
  final int b0 = buf[i] & 0xFF;
  final int b1 = buf[i + 1] & 0xFF;
  final int b2 = buf[i + 2] & 0xFF;
  final int b3 = buf[i + 3] & 0xFF;
  return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
}
/**
 * Returns the number of leading bytes shared by {@code b[o1:limit)} and
 * {@code b[o2:limit)}. {@code o1} must be strictly less than {@code o2}.
 */
private static int commonBytes(byte[] b, int o1, int o2, int limit) {
  assert o1 < o2;
  // Both ranges end at the same index but start at different ones, so their
  // lengths differ and mismatch() can never return -1.
  final int firstDifference = Arrays.mismatch(b, o1, limit, b, o2, limit);
  return firstDifference;
}
/**
 * Decompress at least <code>decompressedLen</code> bytes into
 * <code>dest[dOff:]</code>. Please note that <code>dest</code> must be large
 * enough to be able to hold <b>all</b> decompressed data (meaning that you
 * need to know the total decompressed length).
 *
 * @return the offset in <code>dest</code> right after the last written byte
 */
public static int decompress(DataInput compressed, int decompressedLen, byte[] dest, int dOff) throws IOException {
  final int destEnd = dest.length;

  do {
    // token: the high nibble is the literal length, the low nibble the match length minus MIN_MATCH
    final int token = compressed.readByte() & 0xFF;

    // literals
    int literalLen = token >>> 4;
    if (literalLen != 0) {
      if (literalLen == 0x0F) {
        // the length continues on the following bytes, 0xFF meaning "keep reading"
        int extra;
        do {
          extra = compressed.readByte() & 0xFF;
          literalLen += extra;
        } while (extra == 0xFF);
      }
      compressed.readBytes(dest, dOff, literalLen);
      dOff += literalLen;
    }

    if (dOff >= decompressedLen) {
      break;
    }

    // matches: 2-byte little-endian distance, then the optional extended length
    final int decLow = compressed.readByte() & 0xFF;
    final int decHigh = compressed.readByte() & 0xFF;
    final int matchDec = decLow | (decHigh << 8);
    assert matchDec > 0;

    int matchLen = token & 0x0F;
    if (matchLen == 0x0F) {
      int extra;
      do {
        extra = compressed.readByte() & 0xFF;
        matchLen += extra;
      } while (extra == 0xFF);
    }
    matchLen += MIN_MATCH;

    // copying a multiple of 8 bytes can make decompression from 5% to 10% faster
    final int fastLen = (matchLen + 7) & 0xFFFFFFF8;
    final boolean bulkCopy = matchDec >= matchLen && dOff + fastLen <= destEnd;
    if (bulkCopy) {
      // no overlap -> arraycopy
      System.arraycopy(dest, dOff - matchDec, dest, dOff, fastLen);
      dOff += matchLen;
    } else {
      // overlap -> naive incremental copy
      final int matchEnd = dOff + matchLen;
      for (int ref = dOff - matchDec; dOff < matchEnd; ++ref, ++dOff) {
        dest[dOff] = dest[ref];
      }
    }
  } while (dOff < decompressedLen);

  return dOff;
}
/**
 * Writes {@code l} as a run of {@code 0xFF} bytes (each worth 255) followed by
 * one final byte holding the remainder.
 */
private static void encodeLen(int l, DataOutput out) throws IOException {
  int remaining = l;
  for (; remaining >= 0xFF; remaining -= 0xFF) {
    out.writeByte((byte) 0xFF);
  }
  out.writeByte((byte) remaining);
}
/**
 * Writes the token byte, the extended literal length if it does not fit in the
 * token nibble, then the {@code literalLen} literal bytes starting at {@code anchor}.
 */
private static void encodeLiterals(byte[] bytes, int token, int anchor, int literalLen, DataOutput out) throws IOException {
  out.writeByte((byte) token);

  // encode literal length
  final boolean lengthOverflowsToken = literalLen >= 0x0F;
  if (lengthOverflowsToken) {
    encodeLen(literalLen - 0x0F, out);
  }

  // encode literals
  out.writeBytes(bytes, anchor, literalLen);
}
/** Writes the trailing literal-only sequence: its token carries no match length. */
private static void encodeLastLiterals(byte[] bytes, int anchor, int literalLen, DataOutput out) throws IOException {
  final int literalNibble = literalLen < 0x0F ? literalLen : 0x0F;
  encodeLiterals(bytes, literalNibble << 4, anchor, literalLen, out);
}
/**
 * Writes one full sequence: the literals in {@code bytes[anchor:matchOff)}
 * followed by a match of {@code matchLen} bytes at {@code matchOff} that refers
 * back to {@code matchRef}.
 */
private static void encodeSequence(byte[] bytes, int anchor, int matchRef, int matchOff, int matchLen, DataOutput out) throws IOException {
  assert matchLen >= MIN_MATCH;
  final int literalLen = matchOff - anchor;

  // encode token
  final int literalNibble = Math.min(literalLen, 0x0F);
  final int matchNibble = Math.min(matchLen - MIN_MATCH, 0x0F);
  encodeLiterals(bytes, (literalNibble << 4) | matchNibble, anchor, literalLen, out);

  // encode match dec, low byte first
  final int matchDec = matchOff - matchRef;
  assert matchDec > 0 && matchDec < 1 << 16;
  out.writeByte((byte) matchDec);
  out.writeByte((byte) (matchDec >>> 8));

  // encode match len
  final boolean lengthOverflowsToken = matchLen >= MIN_MATCH + 0x0F;
  if (lengthOverflowsToken) {
    encodeLen(matchLen - 0x0F - MIN_MATCH, out);
  }
}
/**
* A record of previous occurrences of sequences of 4 bytes.
*/
static abstract class HashTable {
/**
* Reset this hash table in order to compress the {@code len} bytes of
* {@code b} that start at offset {@code off}.
*/
abstract void reset(byte[] b, int off, int len);
/**
* Advance the cursor to {@code off} and return an index that stores the same
* 4 bytes as {@code b[off:off+4)}. This may only be called on strictly
* increasing sequences of offsets. A return value of {@code -1} indicates
* that no other index could be found. */
abstract int get(int off);
/**
* Return an index that is less than {@code off} and stores the same 4
* bytes. Unlike {@link #get}, it doesn't need to be called on increasing
* offsets. A return value of {@code -1} indicates that no other index could
* be found. */
abstract int previous(int off);
}
/**
 * Simple lossy {@link HashTable} that only stores the last occurrence for
 * each hash on {@code 2^14} bytes of memory.
 */
public static final class FastCompressionHashTable extends HashTable {

  private byte[] bytes;
  private int base;
  private int lastOff;
  private int end;
  private int hashLog;
  private PackedInts.Mutable hashTable;

  /** Sole constructor */
  public FastCompressionHashTable() {}

  @Override
  void reset(byte[] bytes, int off, int len) {
    Objects.checkFromIndexSize(off, len, bytes.length);
    this.bytes = bytes;
    this.base = off;
    this.lastOff = off - 1;
    this.end = off + len;

    final int bitsPerOffset = PackedInts.bitsRequired(len - LAST_LITERALS);
    final int bitsPerOffsetLog = 32 - Integer.numberOfLeadingZeros(bitsPerOffset - 1);
    hashLog = MEMORY_USAGE + 3 - bitsPerOffsetLog;

    final int requiredSize = 1 << hashLog;
    final boolean reusable = hashTable != null
        && hashTable.size() >= requiredSize
        && hashTable.getBitsPerValue() >= bitsPerOffset;
    if (reusable == false) {
      hashTable = PackedInts.getMutable(requiredSize, bitsPerOffset, PackedInts.DEFAULT);
      return;
    }
    // Avoid calling hashTable.clear(), this makes it costly to compress many short sequences otherwise.
    // Instead, get() checks that references are less than the current offset.
    get(off); // this sets the hashTable for the first 4 bytes as a side-effect
  }

  @Override
  int get(int off) {
    assert off > lastOff;
    assert off < end;

    final int v = readInt(bytes, off);
    final int h = hash(v, hashLog);

    // read the previous occurrence before recording the current offset in the same slot
    final int candidate = base + (int) hashTable.get(h);
    hashTable.set(h, off - base);
    lastOff = off;

    // stale or colliding entries are filtered out here rather than by clearing the table
    final boolean usable = candidate < off
        && off - candidate < MAX_DISTANCE
        && readInt(bytes, candidate) == v;
    return usable ? candidate : -1;
  }

  /** Only the last occurrence is recorded, so there is never an earlier one to return. */
  @Override
  public int previous(int off) {
    return -1;
  }
}
/**
* A higher-precision {@link HashTable}. It stores up to 256 occurrences of
* 4-bytes sequences in the last {@code 2^16} bytes, which makes it much more
* likely to find matches than {@link FastCompressionHashTable}.
*/
public static final class HighCompressionHashTable extends HashTable {
// maximum number of chain entries visited by a get() and the previous() calls that follow it
private static final int MAX_ATTEMPTS = 256;
// used to map an offset to its slot in chainTable
private static final int MASK = MAX_DISTANCE - 1;
private byte[] bytes;
private int base;
// first offset that has not been added to the tables yet
private int next;
private int end;
// hash -> last added offset with this hash, -1 when never set
private final int[] hashTable;
// offset & MASK -> distance to the previous offset with the same hash, read as an unsigned short
private final short[] chainTable;
// number of chain entries visited since the last call to get()
private int attempts = 0;
/** Sole constructor */
public HighCompressionHashTable() {
hashTable = new int[HASH_TABLE_SIZE_HC];
Arrays.fill(hashTable, -1);
chainTable = new short[MAX_DISTANCE];
Arrays.fill(chainTable, (short) 0xFFFF);
}
// Only meant to be called from an assert: checks that every chain entry is back to its initial value.
private boolean assertReset() {
for (int i = 0; i < chainTable.length; ++i) {
assert chainTable[i] == (short) 0xFFFF : i;
}
return true;
}
@Override
void reset(byte[] bytes, int off, int len) {
Objects.checkFromIndexSize(off, len, bytes.length);
// base and end still describe the previous input at this point
if (end - base < chainTable.length) {
// The last call to compress was done on less than 64kB, let's not reset
// the hashTable and only reset the relevant parts of the chainTable.
// This helps avoid slowing down calling compress() many times on short
// inputs.
int startOffset = base & MASK;
int endOffset = end == 0 ? 0 : ((end - 1) & MASK) + 1;
if (startOffset < endOffset) {
Arrays.fill(chainTable, startOffset, endOffset, (short) 0xFFFF);
} else {
// the range of slots wraps around the end of the table
Arrays.fill(chainTable, 0, endOffset, (short) 0xFFFF);
Arrays.fill(chainTable, startOffset, chainTable.length, (short) 0xFFFF);
}
} else {
// The last call to compress was done on a large enough amount of data
// that it's fine to reset both tables
Arrays.fill(hashTable, -1);
Arrays.fill(chainTable, (short) 0xFFFF);
}
assert assertReset();
this.bytes = bytes;
this.base = off;
this.next = off;
this.end = off + len;
}
@Override
int get(int off) {
assert off > next;
assert off < end;
// add every offset that was skipped since the last call
for (; next < off; next++) {
addHash(next);
}
final int v = readInt(bytes, off);
final int h = hashHC(v);
attempts = 0;
int ref = hashTable[h];
if (ref >= off) {
// remainder from a previous call to compress()
return -1;
}
// walk the chain until a candidate holds the same 4 bytes, leaves the window or the budget is spent
for (int min = Math.max(base, off - MAX_DISTANCE + 1);
ref >= min && attempts < MAX_ATTEMPTS;
ref -= chainTable[ref & MASK] & 0xFFFF, attempts++) {
if (readInt(bytes, ref) == v) {
return ref;
}
}
return -1;
}
// Adds off at the head of the chain of its hash.
private void addHash(int off) {
final int v = readInt(bytes, off);
final int h = hashHC(v);
int delta = off - hashTable[h];
// distances are stored on 16 bits; entries left over from a previous input may also give a non-positive delta
if (delta <= 0 || delta >= MAX_DISTANCE) {
delta = MAX_DISTANCE - 1;
}
chainTable[off & MASK] = (short) delta;
hashTable[h] = off;
}
@Override
int previous(int off) {
final int v = readInt(bytes, off);
// attempts is not reset here, so the budget is shared with the preceding get()
for (int ref = off - (chainTable[off & MASK] & 0xFFFF);
ref >= base && attempts < MAX_ATTEMPTS;
ref -= chainTable[ref & MASK] & 0xFFFF, attempts++ ) {
if (readInt(bytes, ref) == v) {
return ref;
}
}
return -1;
}
}
/**
* Compress <code>bytes[off:off+len]</code> into <code>out</code>. Memory usage
* is determined by the {@link HashTable} implementation that is passed:
* {@link FastCompressionHashTable} is documented to use {@code 2^14} bytes while
* {@link HighCompressionHashTable} allocates an {@code int[2^15]} and a
* {@code short[2^16]}. <code>ht</code> shouldn't be shared across threads
* but can safely be reused.
*/
public static void compress(byte[] bytes, int off, int len, DataOutput out, HashTable ht) throws IOException {
Objects.checkFromIndexSize(off, len, bytes.length);
final int base = off;
final int end = off + len;
// anchor is the start of the pending literals; the search starts one byte after it
int anchor = off++;
// inputs that are too short to hold a match are written as a single run of literals
if (len > LAST_LITERALS + MIN_MATCH) {
final int limit = end - LAST_LITERALS;
final int matchLimit = limit - MIN_MATCH;
ht.reset(bytes, base, len);
main:
while (off <= limit) {
// find a match
int ref;
while (true) {
if (off >= matchLimit) {
break main;
}
ref = ht.get(off);
if (ref != -1) {
assert ref >= base && ref < off;
assert readInt(bytes, ref) == readInt(bytes, off);
break;
}
++off;
}
// compute match length
int matchLen = MIN_MATCH + commonBytes(bytes, ref + MIN_MATCH, off + MIN_MATCH, limit);
// try to find a better match
// (walks earlier occurrences that are still within MAX_DISTANCE; a table whose previous() always returns -1 skips this loop)
for (int r = ht.previous(ref), min = Math.max(off - MAX_DISTANCE + 1, base); r >= min; r = ht.previous(r)) {
assert readInt(bytes, r) == readInt(bytes, off);
int rMatchLen = MIN_MATCH + commonBytes(bytes, r + MIN_MATCH, off + MIN_MATCH, limit);
if (rMatchLen > matchLen) {
ref = r;
matchLen = rMatchLen;
}
}
encodeSequence(bytes, anchor, ref, off, matchLen, out);
off += matchLen;
anchor = off;
}
}
// last literals
final int literalLen = end - anchor;
assert literalLen >= LAST_LITERALS || literalLen == len;
encodeLastLiterals(bytes, anchor, end - anchor, out);
}
}

View File

@ -0,0 +1,152 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.compress;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
/**
 * Static helpers that shrink byte arrays made almost exclusively of bytes in
 * the {@code [0x1F,0x3F)} or {@code [0x5F,0x7F)} ranges (digits, lowercase
 * ASCII letters, '.', '-', '_', ...) by storing 4 such bytes in 3 bytes.
 * Bytes outside of these ranges are recorded separately as "exceptions".
 */
public final class LowercaseAsciiCompression {

  private LowercaseAsciiCompression() {}

  /**
   * Tells whether {@code b} (an unsigned byte value) lies in one of the two
   * ranges that can be represented on 6 bits.
   */
  private static boolean isCompressible(int b) {
    final int shifted = b + 1;
    final int upperBits = shifted & ~0x1F;
    return upperBits == 0x20 || upperBits == 0x60;
  }

  /**
   * Counts the exceptions that encoding {@code in[0:len]} would require,
   * including the artificial ones that are inserted when two exceptions are
   * more than 0xFF positions apart.
   *
   * @return the number of exceptions, or {@code -1} as soon as it exceeds {@code maxExceptions}
   */
  private static int countExceptions(byte[] in, int len, int maxExceptions) {
    int lastIndex = 0;
    int count = 0;
    for (int i = 0; i < len; ++i) {
      if (isCompressible(in[i] & 0xFF)) {
        continue;
      }
      while (i - lastIndex > 0xFF) {
        ++count;
        lastIndex += 0xFF;
      }
      if (++count > maxExceptions) {
        return -1;
      }
      lastIndex = i;
    }
    return count;
  }

  /**
   * Writes every exception of {@code in[0:len]} to {@code out} as a
   * (delta from previous exception, original byte) pair.
   *
   * @return the number of pairs written
   */
  private static int writeExceptions(byte[] in, int len, DataOutput out) throws IOException {
    int lastIndex = 0;
    int written = 0;
    for (int i = 0; i < len; ++i) {
      final int value = in[i] & 0xFF;
      if (isCompressible(value)) {
        continue;
      }
      while (i - lastIndex > 0xFF) {
        // Deltas are stored on a single byte, so gaps wider than 0xFF are
        // bridged with "artificial" exceptions that simply restate the byte
        // found 0xFF positions further.
        out.writeByte((byte) 0xFF);
        lastIndex += 0xFF;
        out.writeByte(in[lastIndex]);
        written++;
      }
      out.writeByte((byte) (i - lastIndex));
      lastIndex = i;
      out.writeByte((byte) value);
      written++;
    }
    return written;
  }

  /**
   * Compress {@code in[0:len]} into {@code out}.
   * Returns {@code false}, without writing anything, when the content cannot
   * be compressed: fewer than 8 bytes, or more than {@code len/32} exceptions.
   * Otherwise the number of bytes written is guaranteed to be less than {@code len}.
   *
   * @param in bytes to compress
   * @param len number of bytes of {@code in} to compress
   * @param tmp scratch buffer of at least {@code len} bytes; its content is overwritten
   * @param out destination of the compressed bytes
   * @return whether the data has been compressed into {@code out}
   */
  public static boolean compress(byte[] in, int len, byte[] tmp, DataOutput out) throws IOException {
    if (len < 8) {
      return false;
    }

    // 1. Count exceptions and give up if there are too many of them.
    final int maxExceptions = len >>> 5;
    final int numExceptions = countExceptions(in, len, maxExceptions);
    if (numExceptions < 0) {
      return false;
    }
    assert numExceptions <= maxExceptions;

    // 2. Map every byte to a 6-bit value. This loop gets auto-vectorized on JDK13+.
    final int packedLen = len - (len >>> 2); // ignores exceptions
    assert packedLen < len;
    for (int i = 0; i < len; ++i) {
      final int shifted = (in[i] & 0xFF) + 1;
      tmp[i] = (byte) ((shifted & 0x1F) | ((shifted & 0x40) >>> 1));
    }

    // 3. Spread the 6 bits of each trailing value over the 2 free high bits
    // of three leading values, so that 4 chars end up stored in 3 bytes.
    int dst = 0;
    for (int src = packedLen; src < len; ++src, ++dst) {
      tmp[dst] |= (tmp[src] & 0x30) << 2; // bits 4-5
    }
    for (int src = packedLen; src < len; ++src, ++dst) {
      tmp[dst] |= (tmp[src] & 0x0C) << 4; // bits 2-3
    }
    for (int src = packedLen; src < len; ++src, ++dst) {
      tmp[dst] |= (tmp[src] & 0x03) << 6; // bits 0-1
    }
    assert dst <= packedLen;
    out.writeBytes(tmp, 0, packedLen);

    // 4. Record exceptions after the packed bytes.
    out.writeVInt(numExceptions);
    if (numExceptions > 0) {
      final int numExceptions2 = writeExceptions(in, len, out);
      if (numExceptions != numExceptions2) {
        throw new IllegalStateException("" + numExceptions + " <> " + numExceptions2 + " " + new BytesRef(in, 0, len).utf8ToString());
      }
    }
    return true;
  }

  /**
   * Decompress data that has been compressed with {@link #compress(byte[], int, byte[], DataOutput)}.
   * {@code len} must be the original length, not the compressed length.
   *
   * @param in source of the compressed bytes
   * @param out buffer receiving the original bytes in {@code out[0:len]}
   * @param len number of bytes that were originally compressed
   */
  public static void decompress(DataInput in, byte[] out, int len) throws IOException {
    final int quarter = len >>> 2;
    final int packedLen = len - quarter;

    // 1. Read the packed bytes.
    in.readBytes(out, 0, packedLen);

    // 2. Rebuild the trailing values from the 2 high bits of the packed bytes.
    for (int i = 0; i < quarter; ++i) {
      final int bits45 = (out[i] & 0xC0) >>> 2;
      final int bits23 = (out[quarter + i] & 0xC0) >>> 4;
      final int bits01 = (out[(quarter << 1) + i] & 0xC0) >>> 6;
      out[packedLen + i] = (byte) (bits45 | bits23 | bits01);
    }

    // 3. Map 6-bit values back to the original ranges. This loop gets auto-vectorized on JDK13+.
    for (int i = 0; i < len; ++i) {
      final byte packed = out[i];
      out[i] = (byte) (((packed & 0x1F) | 0x20 | ((packed & 0x20) << 1)) - 1);
    }

    // 4. Overwrite the positions that were recorded as exceptions.
    final int numExceptions = in.readVInt();
    int position = 0;
    for (int remaining = numExceptions; remaining > 0; --remaining) {
      position += in.readByte() & 0xFF;
      out[position] = in.readByte();
    }
  }
}

View File

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Compression utilities.
*/
package org.apache.lucene.util.compress;

View File

@ -144,58 +144,4 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase {
test(decompressed);
}
/**
 * Regression test named after LUCENE-5201: runs {@code test} on the fixed byte
 * sequence below, skipping its first 9 bytes. The bytes are a captured fixture
 * and must be kept exactly as they are.
 */
public void testLUCENE5201() throws IOException {
byte[] data = new byte[]{
14, 72, 14, 85, 3, 72, 14, 85, 3, 72, 14, 72, 14, 72, 14, 85, 3, 72, 14, 72, 14, 72, 14, 72, 14, 72, 14, 72, 14, 85, 3, 72,
14, 85, 3, 72, 14, 85, 3, 72, 14, 85, 3, 72, 14, 85, 3, 72, 14, 85, 3, 72, 14, 50, 64, 0, 46, -1, 0, 0, 0, 29, 3, 85,
8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3,
0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113,
0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113,
0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 50, 64, 0, 47, -105, 0, 0, 0, 30, 3, -97, 6, 0, 68, -113,
0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, 85,
8, -113, 0, 68, -97, 3, 0, 2, -97, 6, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97,
6, 0, 68, -113, 0, 120, 64, 0, 48, 4, 0, 0, 0, 31, 34, 72, 29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72,
33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72,
43, 72, 19, 72, 34, 72, 29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72,
28, 72, 42, 72, 24, 72, 27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72, 43, 72, 19, 72, 34, 72, 29, 72, 37, 72,
35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72,
41, 72, 32, 72, 18, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 39, 24, 32, 34, 124, 0, 120, 64, 0, 48, 80, 0, 0, 0, 31, 30, 72, 22, 72, 31, 72, 43, 72, 19, 72, 34, 72, 29, 72, 37, 72,
35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72,
41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72, 43, 72, 19, 72, 34, 72, 29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72,
40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72,
31, 72, 43, 72, 19, 72, 34, 72, 29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72,
26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72, 43, 72, 19, 72, 34, 72, 29, 72,
37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72,
36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72, 43, 72, 19, 72, 34, 72, 29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72,
20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72,
22, 72, 31, 72, 43, 72, 19, 72, 34, 72, 29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72,
38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72, 43, 72, 19, 72, 34, 72,
29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72,
27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72, 43, 72, 19, 50, 64, 0, 49, 20, 0, 0, 0, 32, 3, -97, 6, 0,
68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97,
6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2,
3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2,
3, -97, 6, 0, 50, 64, 0, 50, 53, 0, 0, 0, 34, 3, -97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -113, 0, 2, 3, -97,
6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3,
-97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97,
3, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3,
85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0,
2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3,
-97, 6, 0, 50, 64, 0, 51, 85, 0, 0, 0, 36, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97,
6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, -97, 5, 0, 2, 3, 85, 8, -113, 0, 68,
-97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0,
68, -113, 0, 2, 3, -97, 6, 0, 50, -64, 0, 51, -45, 0, 0, 0, 37, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6,
0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -113, 0, 2, 3, -97,
6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 120, 64, 0, 52, -88, 0, 0,
0, 39, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72,
13, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 72, 13, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85,
5, 72, 13, 85, 5, 72, 13, 72, 13, 72, 13, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85,
5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85,
5, 72, 13, 85, 5, 72, 13, 72, 13, 72, 13, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 72, 13, 85, 5, 72, 13, 72,
13, 85, 5, 72, 13, 72, 13, 85, 5, 72, 13, -19, -24, -101, -35
};
// Only data[9:] is exercised; the leading 9 bytes are passed over via the offset.
test(data, 9, data.length - 9);
}
}

View File

@ -1,112 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.compressing;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
/**
 * Adds structural checks of the compressed output on top of the round-trip
 * checks of {@link AbstractTestCompressionMode}: the output is parsed as a
 * succession of (token, literals, match) sequences and each one is validated.
 */
public abstract class AbstractTestLZ4CompressionMode extends AbstractTestCompressionMode {

  /**
   * Runs the inherited round-trip test, then walks through the compressed
   * bytes to verify that the stream is well formed and not wasteful.
   */
  @Override
  public byte[] test(byte[] decompressed) throws IOException {
    final byte[] compressed = super.test(decompressed);
    int in = 0; // read position in the compressed stream
    int produced = 0; // number of decompressed bytes accounted for so far
    while (true) {
      final int token = compressed[in++] & 0xFF;

      // High 4 bits of the token: literal length, extended with extra bytes when it is 15.
      int literalLen = token >>> 4;
      if (literalLen == 0x0F) {
        for (; compressed[in] == (byte) 0xFF; ++in) {
          literalLen += 0xFF;
        }
        literalLen += compressed[in++] & 0xFF;
      }
      // skip literals
      in += literalLen;
      produced += literalLen;

      // the stream must end with literals: at least LAST_LITERALS of them,
      // unless the whole input has been written as literals
      if (in == compressed.length) {
        assertEquals(decompressed.length, produced);
        assertTrue("lastLiterals=" + literalLen + ", bytes=" + decompressed.length,
            literalLen >= LZ4.LAST_LITERALS || literalLen == decompressed.length);
        break;
      }

      // Match offset, stored as 2 bytes, least significant byte first.
      final int lowByte = compressed[in++] & 0xFF;
      final int highByte = compressed[in++] & 0xFF;
      final int matchDec = lowByte | (highByte << 8);
      // check that match dec is not 0
      assertTrue(matchDec + " " + produced, matchDec > 0 && matchDec <= produced);

      // Low 4 bits of the token: match length, extended the same way as literals.
      int matchLen = token & 0x0F;
      if (matchLen == 0x0F) {
        for (; compressed[in] == (byte) 0xFF; ++in) {
          matchLen += 0xFF;
        }
        matchLen += compressed[in++] & 0xFF;
      }
      matchLen += LZ4.MIN_MATCH;

      // if the match could have been longer, the next sequence must not start
      // with literals, otherwise space is being wasted
      if (produced + matchLen < decompressed.length - LZ4.LAST_LITERALS) {
        final boolean moreCommonBytes = decompressed[produced + matchLen] == decompressed[produced - matchDec + matchLen];
        final boolean nextSequenceHasLiterals = ((compressed[in] & 0xFF) >>> 4) != 0;
        assertTrue(!(moreCommonBytes && nextSequenceHasLiterals));
      }

      produced += matchLen;
    }
    assertEquals(decompressed.length, produced);
    return compressed;
  }

  /** Input whose literal runs and matches all have a length of at most 15. */
  public void testShortLiteralsAndMatchs() throws IOException {
    final String text = "1234562345673456745678910123";
    test(text.getBytes(StandardCharsets.UTF_8));
  }

  /** Input that yields matches of 20 bytes or more. */
  public void testLongMatchs() throws IOException {
    final int size = RandomNumbers.randomIntBetween(random(), 300, 1024);
    final byte[] decompressed = new byte[size];
    int i = 0;
    while (i < decompressed.length) {
      decompressed[i] = (byte) i;
      ++i;
    }
    test(decompressed);
  }

  /** Input with a long run of literals (16 or more) that is followed by a match. */
  public void testLongLiterals() throws IOException {
    final int size = RandomNumbers.randomIntBetween(random(), 400, 1024);
    final byte[] decompressed = randomArray(size, 256);
    final int matchRef = random().nextInt(30);
    final int matchOff = RandomNumbers.randomIntBetween(random(), decompressed.length - 40, decompressed.length - 20);
    final int matchLength = RandomNumbers.randomIntBetween(random(), 4, 10);
    // Copy a few leading bytes near the end so that exactly one far-away match exists.
    System.arraycopy(decompressed, matchRef, decompressed, matchOff, matchLength);
    test(decompressed);
  }

  /** Input whose last match is directly followed by the trailing literals. */
  public void testMatchRightBeforeLastLiterals() throws IOException {
    final byte[] decompressed = {1,2,3,4, 1,2,3,4, 1,2,3,4,5};
    test(decompressed);
  }
}

View File

@ -17,7 +17,7 @@
package org.apache.lucene.codecs.compressing;
public class TestFastCompressionMode extends AbstractTestLZ4CompressionMode {
public class TestFastCompressionMode extends AbstractTestCompressionMode {
@Override
public void setUp() throws Exception {

View File

@ -16,10 +16,7 @@
*/
package org.apache.lucene.codecs.compressing;
import java.io.IOException;
public class TestFastDecompressionMode extends AbstractTestLZ4CompressionMode {
public class TestFastDecompressionMode extends AbstractTestCompressionMode {
@Override
public void setUp() throws Exception {
@ -27,14 +24,4 @@ public class TestFastDecompressionMode extends AbstractTestLZ4CompressionMode {
mode = CompressionMode.FAST_DECOMPRESSION;
}
/**
 * Runs the inherited checks, then compresses the same slice with
 * {@code CompressionMode.FAST} and verifies that this mode's output is not larger.
 */
@Override
public byte[] test(byte[] decompressed, int off, int len) throws IOException {
  final byte[] ownOutput = super.test(decompressed, off, len);
  final byte[] fastOutput = compress(CompressionMode.FAST.newCompressor(), decompressed, off, len);
  // because of the way this compression mode works, its output is necessarily
  // smaller than the output of CompressionMode.FAST
  final boolean notLarger = ownOutput.length <= fastOutput.length;
  assertTrue(notLarger);
  return ownOutput;
}
}

View File

@ -0,0 +1,210 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.compress;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Random;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
/**
 * Base class for tests of {@link LZ4#compress} with a given hash table
 * implementation. Every test compresses some input, parses the resulting
 * stream sequence by sequence to validate its structure, then compresses the
 * same input again with the same hash table to make sure that hash tables can
 * be reused.
 */
public abstract class LZ4TestCase extends LuceneTestCase {

  /** Creates the hash table implementation under test. */
  protected abstract LZ4.HashTable newHashTable();

  /**
   * Copies {@code data} at a random offset of a larger array, so that
   * compression is exercised with a non-zero start offset and trailing bytes,
   * and runs the checks on that slice.
   */
  private void doTest(byte[] data, LZ4.HashTable hashTable) throws IOException {
    // The large offset is only used for inputs below 64kB: it would become
    // negative once data.length / 2 exceeds 1 << 16.
    int offset = data.length >= (1 << 16) || random().nextBoolean()
        ? random().nextInt(10)
        : (1<<16) - data.length / 2; // this triggers special reset logic for high compression
    byte[] copy = new byte[data.length + offset + random().nextInt(10)];
    System.arraycopy(data, 0, copy, offset, data.length);
    doTest(copy, offset, data.length, hashTable);
  }

  /**
   * Compresses {@code data[offset:offset+length]} and validates the structure
   * of the compressed stream.
   */
  private void doTest(byte[] data, int offset, int length, LZ4.HashTable hashTable) throws IOException {
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    LZ4.compress(data, offset, length, out, hashTable);
    byte[] compressed = out.toArrayCopy();

    int off = 0;
    int decompressedOff = 0;
    for (;;) {
      final int token = compressed[off++] & 0xFF;
      // High 4 bits of the token: literal length, extended with extra bytes when it is 15.
      int literalLen = token >>> 4;
      if (literalLen == 0x0F) {
        while (compressed[off] == (byte) 0xFF) {
          literalLen += 0xFF;
          ++off;
        }
        literalLen += compressed[off++] & 0xFF;
      }
      // skip literals
      off += literalLen;
      decompressedOff += literalLen;

      // check that the stream ends with literals and that there are at least
      // 5 of them
      if (off == compressed.length) {
        assertEquals(length, decompressedOff);
        assertTrue("lastLiterals=" + literalLen + ", bytes=" + length,
            literalLen >= LZ4.LAST_LITERALS || literalLen == length);
        break;
      }

      // Match offset, stored as 2 bytes, least significant byte first.
      final int matchDec = (compressed[off++] & 0xFF) | ((compressed[off++] & 0xFF) << 8);
      // check that match dec is not 0
      assertTrue(matchDec + " " + decompressedOff, matchDec > 0 && matchDec <= decompressedOff);

      // Low 4 bits of the token: match length, extended the same way as literals.
      int matchLen = token & 0x0F;
      if (matchLen == 0x0F) {
        while (compressed[off] == (byte) 0xFF) {
          matchLen += 0xFF;
          ++off;
        }
        matchLen += compressed[off++] & 0xFF;
      }
      matchLen += LZ4.MIN_MATCH;

      // if the match ends prematurely, the next sequence should not have
      // literals or this means we are wasting space
      if (decompressedOff + matchLen < length - LZ4.LAST_LITERALS) {
        final boolean moreCommonBytes = data[offset + decompressedOff + matchLen] == data[offset + decompressedOff - matchDec + matchLen];
        final boolean nextSequenceHasLiterals = ((compressed[off] & 0xFF) >>> 4) != 0;
        assertTrue(moreCommonBytes == false || nextSequenceHasLiterals == false);
      }

      decompressedOff += matchLen;
    }
    assertEquals(length, decompressedOff);

    // Compress once again with the same hash table to test reuse
    ByteBuffersDataOutput out2 = new ByteBuffersDataOutput();
    LZ4.compress(data, offset, length, out2, hashTable);
    assertArrayEquals(compressed, out2.toArrayCopy());
  }

  public void testEmpty() throws IOException {
    // empty input: the stream only consists of an empty run of last literals
    final byte[] data = "".getBytes(StandardCharsets.UTF_8);
    doTest(data, newHashTable());
  }

  public void testShortLiteralsAndMatchs() throws IOException {
    // literals and matchs lengths <= 15
    final byte[] data = "1234562345673456745678910123".getBytes(StandardCharsets.UTF_8);
    doTest(data, newHashTable());
  }

  public void testLongMatchs() throws IOException {
    // match length >= 20
    final byte[] data = new byte[RandomNumbers.randomIntBetween(random(), 300, 1024)];
    for (int i = 0; i < data.length; ++i) {
      data[i] = (byte) i;
    }
    doTest(data, newHashTable());
  }

  public void testLongLiterals() throws IOException {
    // long literals (length >= 16) which are not the last literals
    final byte[] data = new byte[RandomNumbers.randomIntBetween(random(), 400, 1024)];
    random().nextBytes(data);
    final int matchRef = random().nextInt(30);
    final int matchOff = RandomNumbers.randomIntBetween(random(), data.length - 40, data.length - 20);
    final int matchLength = RandomNumbers.randomIntBetween(random(), 4, 10);
    System.arraycopy(data, matchRef, data, matchOff, matchLength);
    doTest(data, newHashTable());
  }

  public void testMatchRightBeforeLastLiterals() throws IOException {
    doTest(new byte[] {1,2,3,4, 1,2,3,4, 1,2,3,4,5}, newHashTable());
  }

  public void testIncompressibleRandom() throws IOException {
    // NOTE: the bound must be a shift by less than 32. Java only uses the 5
    // low bits of an int shift distance, so "1 << 32" is 1 and the input
    // would always be a single byte.
    byte[] b = new byte[TestUtil.nextInt(random(), 1, 1 << 18)];
    random().nextBytes(b);
    doTest(b, newHashTable());
  }

  public void testCompressibleRandom() throws IOException {
    // See testIncompressibleRandom for why the bound is not "1 << 32".
    byte[] b = new byte[TestUtil.nextInt(random(), 1, 1 << 18)];
    final int base = random().nextInt(256);
    final int maxDelta = 1 + random().nextInt(8);
    Random r = random();
    // Few distinct values: repetitions, hence matches, are very likely.
    for (int i = 0; i < b.length; ++i) {
      b[i] = (byte) (base + r.nextInt(maxDelta));
    }
    doTest(b, newHashTable());
  }

  /**
   * Regression test named after LUCENE-5201: compresses the fixed byte
   * sequence below, skipping its first 9 bytes.
   */
  public void testLUCENE5201() throws IOException {
    byte[] data = new byte[]{
        14, 72, 14, 85, 3, 72, 14, 85, 3, 72, 14, 72, 14, 72, 14, 85, 3, 72, 14, 72, 14, 72, 14, 72, 14, 72, 14, 72, 14, 85, 3, 72,
        14, 85, 3, 72, 14, 85, 3, 72, 14, 85, 3, 72, 14, 85, 3, 72, 14, 85, 3, 72, 14, 50, 64, 0, 46, -1, 0, 0, 0, 29, 3, 85,
        8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3,
        0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113,
        0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113,
        0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 50, 64, 0, 47, -105, 0, 0, 0, 30, 3, -97, 6, 0, 68, -113,
        0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, 85,
        8, -113, 0, 68, -97, 3, 0, 2, -97, 6, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97,
        6, 0, 68, -113, 0, 120, 64, 0, 48, 4, 0, 0, 0, 31, 34, 72, 29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72,
        33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72,
        43, 72, 19, 72, 34, 72, 29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72,
        28, 72, 42, 72, 24, 72, 27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72, 43, 72, 19, 72, 34, 72, 29, 72, 37, 72,
        35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72,
        41, 72, 32, 72, 18, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 39, 24, 32, 34, 124, 0, 120, 64, 0, 48, 80, 0, 0, 0, 31, 30, 72, 22, 72, 31, 72, 43, 72, 19, 72, 34, 72, 29, 72, 37, 72,
        35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72,
        41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72, 43, 72, 19, 72, 34, 72, 29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72,
        40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72,
        31, 72, 43, 72, 19, 72, 34, 72, 29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72,
        26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72, 43, 72, 19, 72, 34, 72, 29, 72,
        37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72,
        36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72, 43, 72, 19, 72, 34, 72, 29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72,
        20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72,
        22, 72, 31, 72, 43, 72, 19, 72, 34, 72, 29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72,
        38, 72, 26, 72, 28, 72, 42, 72, 24, 72, 27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72, 43, 72, 19, 72, 34, 72,
        29, 72, 37, 72, 35, 72, 45, 72, 23, 72, 46, 72, 20, 72, 40, 72, 33, 72, 25, 72, 39, 72, 38, 72, 26, 72, 28, 72, 42, 72, 24, 72,
        27, 72, 36, 72, 41, 72, 32, 72, 18, 72, 30, 72, 22, 72, 31, 72, 43, 72, 19, 50, 64, 0, 49, 20, 0, 0, 0, 32, 3, -97, 6, 0,
        68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97,
        6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2,
        3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2,
        3, -97, 6, 0, 50, 64, 0, 50, 53, 0, 0, 0, 34, 3, -97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -113, 0, 2, 3, -97,
        6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3,
        -97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97,
        3, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3,
        85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0,
        2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3,
        -97, 6, 0, 50, 64, 0, 51, 85, 0, 0, 0, 36, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97,
        6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, -97, 5, 0, 2, 3, 85, 8, -113, 0, 68,
        -97, 3, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0,
        68, -113, 0, 2, 3, -97, 6, 0, 50, -64, 0, 51, -45, 0, 0, 0, 37, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6,
        0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, -97, 6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -113, 0, 2, 3, -97,
        6, 0, 68, -113, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 2, 3, 85, 8, -113, 0, 68, -97, 3, 0, 120, 64, 0, 52, -88, 0, 0,
        0, 39, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72,
        13, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 72, 13, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85,
        5, 72, 13, 85, 5, 72, 13, 72, 13, 72, 13, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85,
        5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85,
        5, 72, 13, 85, 5, 72, 13, 72, 13, 72, 13, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 85, 5, 72, 13, 72, 13, 85, 5, 72, 13, 72,
        13, 85, 5, 72, 13, 72, 13, 85, 5, 72, 13, -19, -24, -101, -35
    };
    doTest(data, 9, data.length - 9, newHashTable());
  }
}

View File

@ -0,0 +1,28 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.compress;
import org.apache.lucene.util.compress.LZ4.HashTable;
/** Runs the {@link LZ4TestCase} tests against {@code LZ4.FastCompressionHashTable}. */
public class TestFastLZ4 extends LZ4TestCase {

  /** Returns a new {@code LZ4.FastCompressionHashTable}. */
  @Override
  protected HashTable newHashTable() {
    final HashTable fastTable = new LZ4.FastCompressionHashTable();
    return fastTable;
  }

}

View File

@ -0,0 +1,28 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.compress;
import org.apache.lucene.util.compress.LZ4.HashTable;
/** Runs the {@link LZ4TestCase} tests against {@code LZ4.HighCompressionHashTable}. */
public class TestHighLZ4 extends LZ4TestCase {

  /** Returns a new {@code LZ4.HighCompressionHashTable}. */
  @Override
  protected HashTable newHashTable() {
    final HashTable highTable = new LZ4.HighCompressionHashTable();
    return highTable;
  }

}

View File

@ -0,0 +1,120 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.compress;
import java.io.IOException;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestLowercaseAsciiCompression extends LuceneTestCase {
/** Round-trips the whole of {@code bytes}; returns whether it could be compressed. */
private boolean doTestCompress(byte[] bytes) throws IOException {
  final int wholeLength = bytes.length;
  return doTestCompress(bytes, wholeLength);
}
private boolean doTestCompress(byte[] bytes, int len) throws IOException {
ByteBuffersDataOutput compressed = new ByteBuffersDataOutput();
byte[] tmp = new byte[len + random().nextInt(10)];
random().nextBytes(tmp);
if (LowercaseAsciiCompression.compress(bytes, len, tmp, compressed)) {
assertTrue(compressed.size() < len);
byte[] restored = new byte[len + random().nextInt(10)];
LowercaseAsciiCompression.decompress(compressed.toDataInput(), restored, len);
assertArrayEquals(ArrayUtil.copyOfSubArray(bytes, 0, len), ArrayUtil.copyOfSubArray(restored, 0, len));
return true;
} else {
return false;
}
}
public void testSimple() throws Exception {
assertFalse(doTestCompress("".getBytes("UTF-8"))); // too short
assertFalse(doTestCompress("ab1".getBytes("UTF-8"))); // too short
assertFalse(doTestCompress("ab1cdef".getBytes("UTF-8"))); // too short
assertTrue(doTestCompress("ab1cdefg".getBytes("UTF-8")));
assertFalse(doTestCompress("ab1cdEfg".getBytes("UTF-8"))); // too many exceptions
assertTrue(doTestCompress("ab1cdefg".getBytes("UTF-8")));
// 1 exception, but enough chars to be worth encoding an exception
assertTrue(doTestCompress("ab1.dEfg427hiogchio:'nwm un!94twxz".getBytes("UTF-8")));
}
public void testFarAwayExceptions() throws Exception {
String s = "01W" + IntStream.range(0, 300).mapToObj(i -> "a").collect(Collectors.joining()) + "W.";
assertTrue(doTestCompress(s.getBytes("UTF-8")));
}
public void testRandomAscii() throws IOException {
for (int iter = 0; iter < 1000; ++iter) {
int len = random().nextInt(1000);
byte[] bytes = new byte[len + random().nextInt(10)];
for (int i = 0; i < bytes.length; ++i) {
bytes[i] = (byte) TestUtil.nextInt(random(), ' ', '~');
}
doTestCompress(bytes, len);
}
}
public void testRandomCompressibleAscii() throws IOException {
for (int iter = 0; iter < 1000; ++iter) {
int len = TestUtil.nextInt(random(), 8, 1000);
byte[] bytes = new byte[len + random().nextInt(10)];
for (int i = 0; i < bytes.length; ++i) {
// only use always compressible bytes
int b = random().nextInt(32);
b = b | 0x20 | ((b & 0x20) << 1);
b -= 1;
bytes[i] = (byte) b;
}
assertTrue(doTestCompress(bytes, len));
}
}
public void testRandomCompressibleAsciiWithExceptions() throws IOException {
for (int iter = 0; iter < 1000; ++iter) {
int len = TestUtil.nextInt(random(), 8, 1000);
int exceptions = 0;
int maxExceptions = len >>> 5;
byte[] bytes = new byte[len + random().nextInt(10)];
for (int i = 0; i < bytes.length; ++i) {
if (exceptions == maxExceptions || random().nextInt(100) != 0) {
int b = random().nextInt(32);
b = b | 0x20 | ((b & 0x20) << 1);
b -= 1;
bytes[i] = (byte) b;
} else {
exceptions++;
bytes[i] = (byte) random().nextInt(256);
}
}
assertTrue(doTestCompress(bytes, len));
}
}
public void testRandom() throws IOException {
for (int iter = 0; iter < 1000; ++iter) {
int len = random().nextInt(1000);
byte[] bytes = new byte[len + random().nextInt(10)];
random().nextBytes(bytes);
doTestCompress(bytes, len);
}
}
}