Fix compression bug on highly compressible inputs with LZ4.compressHC.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1520060 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2013-09-04 16:11:36 +00:00
parent 7e7cf0961f
commit 5fa6cd3fec
3 changed files with 48 additions and 22 deletions
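
For orientation only (not part of the commit): a minimal sketch of how a highly compressible input reaches LZ4.compressHC, assuming the org.apache.lucene.codecs.compressing API on this branch, where CompressionMode.FAST_DECOMPRESSION is the LZ4-HC backed mode. The class name, buffer sizes, and fill byte are illustrative; the commit's own regression tests follow in the two test files below.

    // Hedged sketch, not committed code: feed a constant (highly compressible) buffer
    // through the LZ4-HC backed compression mode, which calls LZ4.compressHC internally.
    import java.util.Arrays;

    import org.apache.lucene.codecs.compressing.CompressionMode;
    import org.apache.lucene.codecs.compressing.Compressor;
    import org.apache.lucene.store.ByteArrayDataOutput;

    public class HighlyCompressibleSketch {
      public static void main(String[] args) throws java.io.IOException {
        byte[] data = new byte[4096];
        Arrays.fill(data, (byte) 42);              // constant input, as in the testConstant case added below
        Compressor hc = CompressionMode.FAST_DECOMPRESSION.newCompressor();
        ByteArrayDataOutput out = new ByteArrayDataOutput(new byte[8192]);
        hc.compress(data, 0, data.length, out);    // exercises the compressHC code path
        System.out.println("compressed to " + out.getPosition() + " bytes");
      }
    }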

LZ4.java

@@ -219,7 +219,7 @@ final class LZ4 {
       final PackedInts.Mutable hashTable = ht.hashTable;
 
       main:
-      while (off < limit) {
+      while (off <= limit) {
         // find a match
         int ref;
         while (true) {
@@ -299,7 +299,7 @@ final class LZ4 {
     }
 
     private int next(int off) {
-      return base + off - (chainTable[off & MASK] & 0xFFFF);
+      return off - (chainTable[off & MASK] & 0xFFFF);
     }
 
     private void addHash(byte[] bytes, int off) {
@@ -310,7 +310,7 @@ final class LZ4 {
         delta = MAX_DISTANCE - 1;
       }
       chainTable[off & MASK] = (short) delta;
-      hashTable[h] = off - base;
+      hashTable[h] = off;
     }
 
     void insert(int off, byte[] bytes) {
@@ -322,12 +322,24 @@ final class LZ4 {
     boolean insertAndFindBestMatch(byte[] buf, int off, int matchLimit, Match match) {
       match.start = off;
       match.len = 0;
+      int delta = 0;
+      int repl = 0;
 
       insert(off, buf);
 
       int ref = hashPointer(buf, off);
+
+      if (ref >= off - 4 && ref <= off && ref >= base) { // potential repetition
+        if (readIntEquals(buf, ref, off)) { // confirmed
+          delta = off - ref;
+          repl = match.len = MIN_MATCH + commonBytes(buf, ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
+          match.ref = ref;
+        }
+        ref = next(ref);
+      }
+
       for (int i = 0; i < MAX_ATTEMPTS; ++i) {
-        if (ref < Math.max(base, off - MAX_DISTANCE + 1)) {
+        if (ref < Math.max(base, off - MAX_DISTANCE + 1) || ref > off) {
           break;
         }
         if (buf[ref + match.len] == buf[off + match.len] && readIntEquals(buf, ref, off)) {
@@ -340,6 +352,21 @@ final class LZ4 {
         ref = next(ref);
       }
 
+      if (repl != 0) {
+        int ptr = off;
+        final int end = off + repl - (MIN_MATCH - 1);
+        while (ptr < end - delta) {
+          chainTable[ptr & MASK] = (short) delta; // pre load
+          ++ptr;
+        }
+        do {
+          chainTable[ptr & MASK] = (short) delta;
+          hashTable[hashHC(readInt(buf, ptr))] = ptr;
+          ++ptr;
+        } while (ptr < end);
+        nextToUpdate = end;
+      }
+
       return match.len != 0;
     }
 
@@ -351,7 +378,7 @@ final class LZ4 {
       final int delta = off - startLimit;
       int ref = hashPointer(buf, off);
       for (int i = 0; i < MAX_ATTEMPTS; ++i) {
-        if (ref < Math.max(base, off - MAX_DISTANCE + 1)) {
+        if (ref < Math.max(base, off - MAX_DISTANCE + 1) || ref > off) {
          break;
        }
        if (buf[ref - delta + match.len] == buf[startLimit + match.len]
@@ -386,6 +413,7 @@ final class LZ4 {
     final int srcEnd = srcOff + srcLen;
     final int matchLimit = srcEnd - LAST_LITERALS;
+    final int mfLimit = matchLimit - MIN_MATCH;
     int sOff = srcOff;
 
     int anchor = sOff++;
@@ -397,7 +425,7 @@ final class LZ4 {
     final Match match3 = new Match();
 
     main:
-    while (sOff < matchLimit) {
+    while (sOff <= mfLimit) {
       if (!ht.insertAndFindBestMatch(src, sOff, matchLimit, match1)) {
         ++sOff;
         continue;
@@ -409,7 +437,7 @@ final class LZ4 {
       search2:
       while (true) {
         assert match1.start >= anchor;
-        if (match1.end() >= matchLimit
+        if (match1.end() >= mfLimit
            || !ht.insertAndFindWiderMatch(src, match1.end() - 2, match1.start + 1, matchLimit, match1.len, match2)) {
          // no better match
          encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
@@ -445,25 +473,12 @@ final class LZ4 {
           }
         }
 
-        if (match2.start + match2.len >= matchLimit
+        if (match2.start + match2.len >= mfLimit
            || !ht.insertAndFindWiderMatch(src, match2.end() - 3, match2.start, matchLimit, match2.len, match3)) {
          // no better match -> 2 sequences to encode
          if (match2.start < match1.end()) {
-            if (match2.start - match1.start < OPTIMAL_ML) {
-              if (match1.len > OPTIMAL_ML) {
-                match1.len = OPTIMAL_ML;
-              }
-              if (match1.end() > match2.end() - MIN_MATCH) {
-                match1.len = match2.end() - match1.start - MIN_MATCH;
-              }
-              final int correction = match1.len - (match2.start - match1.start);
-              if (correction > 0) {
-                match2.fix(correction);
-              }
-            } else {
-              match1.len = match2.start - match1.start;
-            }
+            match1.len = match2.start - match1.start;
          }
          // encode seq 1
          encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
          anchor = sOff = match1.end();

AbstractTestCompressionMode.java

@@ -24,6 +24,7 @@ import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
 
 import com.carrotsearch.randomizedtesting.generators.RandomInts;
@@ -130,4 +131,10 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase {
     test(decompressed);
   }
 
+  public void testConstant() throws IOException {
+    final byte[] decompressed = new byte[_TestUtil.nextInt(random(), 1, 10000)];
+    Arrays.fill(decompressed, (byte) random().nextInt());
+    test(decompressed);
+  }
+
 }

AbstractTestLZ4CompressionMode.java

@@ -104,4 +104,8 @@ public abstract class AbstractTestLZ4CompressionMode extends AbstractTestCompres
     test(decompressed);
   }
 
+  public void testMatchRightBeforeLastLiterals() throws IOException {
+    test(new byte[] {1,2,3,4, 1,2,3,4, 1,2,3,4,5});
+  }
+
 }