mirror of https://github.com/apache/lucene.git
Fix compression bug on highly compressible inputs with LZ4.compressHC.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1520060 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7e7cf0961f
commit
5fa6cd3fec
|
@ -219,7 +219,7 @@ final class LZ4 {
|
|||
final PackedInts.Mutable hashTable = ht.hashTable;
|
||||
|
||||
main:
|
||||
while (off < limit) {
|
||||
while (off <= limit) {
|
||||
// find a match
|
||||
int ref;
|
||||
while (true) {
|
||||
|
@ -299,7 +299,7 @@ final class LZ4 {
|
|||
}
|
||||
|
||||
private int next(int off) {
|
||||
return base + off - (chainTable[off & MASK] & 0xFFFF);
|
||||
return off - (chainTable[off & MASK] & 0xFFFF);
|
||||
}
|
||||
|
||||
private void addHash(byte[] bytes, int off) {
|
||||
|
@ -310,7 +310,7 @@ final class LZ4 {
|
|||
delta = MAX_DISTANCE - 1;
|
||||
}
|
||||
chainTable[off & MASK] = (short) delta;
|
||||
hashTable[h] = off - base;
|
||||
hashTable[h] = off;
|
||||
}
|
||||
|
||||
void insert(int off, byte[] bytes) {
|
||||
|
@ -322,12 +322,24 @@ final class LZ4 {
|
|||
boolean insertAndFindBestMatch(byte[] buf, int off, int matchLimit, Match match) {
|
||||
match.start = off;
|
||||
match.len = 0;
|
||||
int delta = 0;
|
||||
int repl = 0;
|
||||
|
||||
insert(off, buf);
|
||||
|
||||
int ref = hashPointer(buf, off);
|
||||
|
||||
if (ref >= off - 4 && ref <= off && ref >= base) { // potential repetition
|
||||
if (readIntEquals(buf, ref, off)) { // confirmed
|
||||
delta = off - ref;
|
||||
repl = match.len = MIN_MATCH + commonBytes(buf, ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
|
||||
match.ref = ref;
|
||||
}
|
||||
ref = next(ref);
|
||||
}
|
||||
|
||||
for (int i = 0; i < MAX_ATTEMPTS; ++i) {
|
||||
if (ref < Math.max(base, off - MAX_DISTANCE + 1)) {
|
||||
if (ref < Math.max(base, off - MAX_DISTANCE + 1) || ref > off) {
|
||||
break;
|
||||
}
|
||||
if (buf[ref + match.len] == buf[off + match.len] && readIntEquals(buf, ref, off)) {
|
||||
|
@ -340,6 +352,21 @@ final class LZ4 {
|
|||
ref = next(ref);
|
||||
}
|
||||
|
||||
if (repl != 0) {
|
||||
int ptr = off;
|
||||
final int end = off + repl - (MIN_MATCH - 1);
|
||||
while (ptr < end - delta) {
|
||||
chainTable[ptr & MASK] = (short) delta; // pre load
|
||||
++ptr;
|
||||
}
|
||||
do {
|
||||
chainTable[ptr & MASK] = (short) delta;
|
||||
hashTable[hashHC(readInt(buf, ptr))] = ptr;
|
||||
++ptr;
|
||||
} while (ptr < end);
|
||||
nextToUpdate = end;
|
||||
}
|
||||
|
||||
return match.len != 0;
|
||||
}
|
||||
|
||||
|
@ -351,7 +378,7 @@ final class LZ4 {
|
|||
final int delta = off - startLimit;
|
||||
int ref = hashPointer(buf, off);
|
||||
for (int i = 0; i < MAX_ATTEMPTS; ++i) {
|
||||
if (ref < Math.max(base, off - MAX_DISTANCE + 1)) {
|
||||
if (ref < Math.max(base, off - MAX_DISTANCE + 1) || ref > off) {
|
||||
break;
|
||||
}
|
||||
if (buf[ref - delta + match.len] == buf[startLimit + match.len]
|
||||
|
@ -386,6 +413,7 @@ final class LZ4 {
|
|||
|
||||
final int srcEnd = srcOff + srcLen;
|
||||
final int matchLimit = srcEnd - LAST_LITERALS;
|
||||
final int mfLimit = matchLimit - MIN_MATCH;
|
||||
|
||||
int sOff = srcOff;
|
||||
int anchor = sOff++;
|
||||
|
@ -397,7 +425,7 @@ final class LZ4 {
|
|||
final Match match3 = new Match();
|
||||
|
||||
main:
|
||||
while (sOff < matchLimit) {
|
||||
while (sOff <= mfLimit) {
|
||||
if (!ht.insertAndFindBestMatch(src, sOff, matchLimit, match1)) {
|
||||
++sOff;
|
||||
continue;
|
||||
|
@ -409,7 +437,7 @@ final class LZ4 {
|
|||
search2:
|
||||
while (true) {
|
||||
assert match1.start >= anchor;
|
||||
if (match1.end() >= matchLimit
|
||||
if (match1.end() >= mfLimit
|
||||
|| !ht.insertAndFindWiderMatch(src, match1.end() - 2, match1.start + 1, matchLimit, match1.len, match2)) {
|
||||
// no better match
|
||||
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
|
||||
|
@ -445,25 +473,12 @@ final class LZ4 {
|
|||
}
|
||||
}
|
||||
|
||||
if (match2.start + match2.len >= matchLimit
|
||||
if (match2.start + match2.len >= mfLimit
|
||||
|| !ht.insertAndFindWiderMatch(src, match2.end() - 3, match2.start, matchLimit, match2.len, match3)) {
|
||||
// no better match -> 2 sequences to encode
|
||||
if (match2.start < match1.end()) {
|
||||
if (match2.start - match1.start < OPTIMAL_ML) {
|
||||
if (match1.len > OPTIMAL_ML) {
|
||||
match1.len = OPTIMAL_ML;
|
||||
}
|
||||
if (match1.end() > match2.end() - MIN_MATCH) {
|
||||
match1.len = match2.end() - match1.start - MIN_MATCH;
|
||||
}
|
||||
final int correction = match1.len - (match2.start - match1.start);
|
||||
if (correction > 0) {
|
||||
match2.fix(correction);
|
||||
}
|
||||
} else {
|
||||
match1.len = match2.start - match1.start;
|
||||
}
|
||||
}
|
||||
// encode seq 1
|
||||
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
|
||||
anchor = sOff = match1.end();
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.store.ByteArrayDataInput;
|
|||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomInts;
|
||||
|
||||
|
@ -130,4 +131,10 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase {
|
|||
test(decompressed);
|
||||
}
|
||||
|
||||
public void testConstant() throws IOException {
|
||||
final byte[] decompressed = new byte[_TestUtil.nextInt(random(), 1, 10000)];
|
||||
Arrays.fill(decompressed, (byte) random().nextInt());
|
||||
test(decompressed);
|
||||
}
|
||||
|
||||
}
|
|
@ -104,4 +104,8 @@ public abstract class AbstractTestLZ4CompressionMode extends AbstractTestCompres
|
|||
test(decompressed);
|
||||
}
|
||||
|
||||
public void testMatchRightBeforeLastLiterals() throws IOException {
|
||||
test(new byte[] {1,2,3,4, 1,2,3,4, 1,2,3,4,5});
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue