LUCENE-5675: zig-zag encode the versions (loses 1 bit); check the min/max version

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5675@1596974 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2014-05-22 21:47:52 +00:00
parent 8a136e3113
commit da6a93f1ca
5 changed files with 87 additions and 14 deletions

View File

@ -58,6 +58,7 @@ final class FreqProxTermsWriter extends TermsHash {
if (termsEnum != null && termsEnum.seekExact(deleteTerm.bytes())) {
docsEnum = termsEnum.docs(null, docsEnum, 0);
int delDocLimit = segDeletes.get(deleteTerm);
assert delDocLimit < DocsEnum.NO_MORE_DOCS;
while (true) {
int doc = docsEnum.nextDoc();
if (doc < delDocLimit) {
@ -96,7 +97,7 @@ final class FreqProxTermsWriter extends TermsHash {
// Sort by field name
CollectionUtil.introSort(allFields);
FreqProxFields fields = new FreqProxFields(allFields);
Fields fields = new FreqProxFields(allFields);
applyDeletes(state, fields);

View File

@ -58,6 +58,14 @@ import org.apache.lucene.util.IOUtils;
public class IDVersionPostingsFormat extends PostingsFormat {
/** version must be >= this. */
public static final long MIN_VERSION = 0;
// TODO: we could delta encode instead, and keep the last bit:
/** version must be <= this, because we encode with ZigZag. */
public static final long MAX_VERSION = 0x3fffffffffffffffL;
private final int minTermsInBlock;
private final int maxTermsInBlock;
@ -121,6 +129,9 @@ public class IDVersionPostingsFormat extends PostingsFormat {
}
public static void longToBytes(long v, BytesRef bytes) {
if (v > MAX_VERSION || v < MIN_VERSION) {
throw new IllegalArgumentException("version must be >= MIN_VERSION=" + MIN_VERSION + " and <= MAX_VERSION=" + MAX_VERSION + " (got: " + v + ")");
}
bytes.offset = 0;
bytes.length = 8;
bytes.bytes[0] = (byte) (v >> 56);

View File

@ -27,6 +27,7 @@ import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.Bits;
final class IDVersionPostingsReader extends PostingsReaderBase {
@ -54,7 +55,11 @@ final class IDVersionPostingsReader extends PostingsReaderBase {
throws IOException {
final IDVersionTermState termState = (IDVersionTermState) _termState;
termState.docID = in.readVInt();
termState.idVersion = in.readVLong();
if (absolute) {
termState.idVersion = in.readVLong();
} else {
termState.idVersion += BitUtil.zigZagDecode(in.readVLong());
}
}
@Override

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
final class IDVersionPostingsWriter extends PushPostingsWriterBase {
@ -115,8 +116,11 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase {
}
lastVersion = IDVersionPostingsFormat.bytesToLong(payload);
if (lastVersion < 0) {
throw new IllegalArgumentException("version must be >= 0 (got: " + lastVersion + "; payload=" + payload + ")");
if (lastVersion < IDVersionPostingsFormat.MIN_VERSION) {
throw new IllegalArgumentException("version must be >= MIN_VERSION=" + IDVersionPostingsFormat.MIN_VERSION + " (got: " + lastVersion + "; payload=" + payload + ")");
}
if (lastVersion > IDVersionPostingsFormat.MAX_VERSION) {
throw new IllegalArgumentException("version must be <= MAX_VERSION=" + IDVersionPostingsFormat.MAX_VERSION + " (got: " + lastVersion + "; payload=" + payload + ")");
}
}
@ -143,12 +147,20 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase {
state.docID = lastDocID;
state.idVersion = lastVersion;
}
private long lastEncodedVersion;
@Override
public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
IDVersionTermState state = (IDVersionTermState) _state;
out.writeVInt(state.docID);
out.writeVLong(state.idVersion);
if (absolute) {
out.writeVLong(state.idVersion);
} else {
long delta = state.idVersion - lastEncodedVersion;
out.writeVLong(BitUtil.zigZagEncode(delta));
}
lastEncodedVersion = state.idVersion;
}
@Override

View File

@ -29,7 +29,11 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.codecs.idversion.StringAndPayloadField.SingleTokenWithPayloadTokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -155,7 +159,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
int upto;
@Override
public String next() {
return Long.toString(random().nextLong() & 0x7ffffffffffffffL, radix);
return Long.toString(random().nextLong() & 0x3ffffffffffffffL, radix);
}
};
break;
@ -170,7 +174,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
int upto;
@Override
public String next() {
return Long.toString(random().nextLong() & 0x7ffffffffffffffL, radix);
return Long.toString(random().nextLong() & 0x3ffffffffffffffL, radix);
}
};
break;
@ -225,7 +229,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
if (useMonotonicVersion) {
version += TestUtil.nextInt(random(), 1, 10);
} else {
version = random().nextLong() & 0x7fffffffffffffffL;
version = random().nextLong() & 0x3fffffffffffffffL;
}
idValues.put(idValue, version);
if (VERBOSE) {
@ -243,7 +247,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
if (useMonotonicVersion) {
version += TestUtil.nextInt(random(), 1, 10);
} else {
version = random().nextLong() & 0x7fffffffffffffffL;
version = random().nextLong() & 0x3fffffffffffffffL;
}
doc = new Document();
doc.add(makeIDField(idValue, version));
@ -362,8 +366,8 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
w.addDocument(doc);
doc = new Document();
doc.add(makeIDField("id", 17));
w.addDocument(doc);
try {
w.addDocument(doc);
w.commit();
fail("didn't hit expected exception");
} catch (IllegalArgumentException iae) {
@ -445,7 +449,18 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
public void testMissingPayload() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
// MockAnalyzer minus maybePayload else it sometimes stuffs in an 8-byte payload!
Analyzer a = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName) {
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true, 100);
tokenizer.setEnableChecks(true);
MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
return new TokenStreamComponents(tokenizer, filt);
}
};
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, a);
iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat()));
RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
Document doc = new Document();
@ -567,6 +582,36 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
dir.close();
}
public void testInvalidVersions() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat()));
RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
Document doc = new Document();
// -1
doc.add(new StringAndPayloadField("id", "id", new BytesRef(new byte[] {(byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff})));
try {
w.addDocument(doc);
w.commit();
fail("didn't hit expected exception");
} catch (IllegalArgumentException iae) {
// expected
}
doc = new Document();
// Long.MAX_VALUE:
doc.add(new StringAndPayloadField("id", "id", new BytesRef(new byte[] {(byte)0x7f, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff})));
try {
w.addDocument(doc);
w.commit();
fail("didn't hit expected exception");
} catch (IllegalArgumentException iae) {
// expected
}
w.close();
dir.close();
}
// Simulates optimistic concurrency in a distributed indexing app and confirms the latest version always wins:
public void testGlobalVersions() throws Exception {
Directory dir = newDirectory();
@ -576,7 +621,6 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
IDSource idsSource = getRandomIDs();
int numIDs = atLeast(100);
System.out.println("ids=" + numIDs);
if (VERBOSE) {
System.out.println("TEST: " + numIDs + " ids");
}
@ -649,7 +693,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
long newVersion;
if (versionType == 0) {
// Random:
newVersion = random().nextLong() & 0x7fffffffffffffffL;
newVersion = random().nextLong() & 0x3fffffffffffffffL;
} else if (versionType == 1) {
// Monotonic
newVersion = nextVersion.getAndIncrement();