From da6a93f1ca9088b7bdda5c73d4f8a7fd139204f2 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Thu, 22 May 2014 21:47:52 +0000 Subject: [PATCH] LUCENE-5675: zig-zag encode the versions (loses 1 bit); check the min/max version git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5675@1596974 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/index/FreqProxTermsWriter.java | 3 +- .../idversion/IDVersionPostingsFormat.java | 11 ++++ .../idversion/IDVersionPostingsReader.java | 7 ++- .../idversion/IDVersionPostingsWriter.java | 20 +++++-- .../TestIDVersionPostingsFormat.java | 60 ++++++++++++++++--- 5 files changed, 87 insertions(+), 14 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java index fc3d777dd8f..a060ee00ea8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java @@ -58,6 +58,7 @@ final class FreqProxTermsWriter extends TermsHash { if (termsEnum != null && termsEnum.seekExact(deleteTerm.bytes())) { docsEnum = termsEnum.docs(null, docsEnum, 0); int delDocLimit = segDeletes.get(deleteTerm); + assert delDocLimit < DocsEnum.NO_MORE_DOCS; while (true) { int doc = docsEnum.nextDoc(); if (doc < delDocLimit) { @@ -96,7 +97,7 @@ final class FreqProxTermsWriter extends TermsHash { // Sort by field name CollectionUtil.introSort(allFields); - FreqProxFields fields = new FreqProxFields(allFields); + Fields fields = new FreqProxFields(allFields); applyDeletes(state, fields); diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsFormat.java index ec54677636e..0ddfdb086f2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsFormat.java @@ -58,6 +58,14 @@ import org.apache.lucene.util.IOUtils; public class IDVersionPostingsFormat extends PostingsFormat { + /** version must be >= this. */ + public static final long MIN_VERSION = 0; + + // TODO: we could delta encode instead, and keep the last bit: + + /** version must be <= this, because we encode with ZigZag. */ + public static final long MAX_VERSION = 0x3fffffffffffffffL; + private final int minTermsInBlock; private final int maxTermsInBlock; @@ -121,6 +129,9 @@ public class IDVersionPostingsFormat extends PostingsFormat { } public static void longToBytes(long v, BytesRef bytes) { + if (v > MAX_VERSION || v < MIN_VERSION) { + throw new IllegalArgumentException("version must be >= MIN_VERSION=" + MIN_VERSION + " and <= MAX_VERSION=" + MAX_VERSION + " (got: " + v + ")"); + } bytes.offset = 0; bytes.length = 8; bytes.bytes[0] = (byte) (v >> 56); diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java index 3a1ba6c6934..54887c5610a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java @@ -27,6 +27,7 @@ import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.Bits; final class IDVersionPostingsReader extends PostingsReaderBase { @@ -54,7 +55,11 @@ final class IDVersionPostingsReader extends PostingsReaderBase { throws IOException { final IDVersionTermState termState = (IDVersionTermState) _termState; termState.docID = in.readVInt(); - termState.idVersion = in.readVLong(); + if (absolute) { + termState.idVersion = in.readVLong(); + } else { + termState.idVersion += BitUtil.zigZagDecode(in.readVLong()); + } } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java index 4184069f9b7..9dc44143cb6 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java @@ -26,6 +26,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BytesRef; final class IDVersionPostingsWriter extends PushPostingsWriterBase { @@ -115,8 +116,11 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase { } lastVersion = IDVersionPostingsFormat.bytesToLong(payload); - if (lastVersion < 0) { - throw new IllegalArgumentException("version must be >= 0 (got: " + lastVersion + "; payload=" + payload + ")"); + if (lastVersion < IDVersionPostingsFormat.MIN_VERSION) { + throw new IllegalArgumentException("version must be >= MIN_VERSION=" + IDVersionPostingsFormat.MIN_VERSION + " (got: " + lastVersion + "; payload=" + payload + ")"); + } + if (lastVersion > IDVersionPostingsFormat.MAX_VERSION) { + throw new IllegalArgumentException("version must be <= MAX_VERSION=" + IDVersionPostingsFormat.MAX_VERSION + " (got: " + lastVersion + "; payload=" + payload + ")"); } } @@ -143,12 +147,20 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase { state.docID = lastDocID; state.idVersion = lastVersion; } - + + private long lastEncodedVersion; + @Override public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { IDVersionTermState state = (IDVersionTermState) _state; out.writeVInt(state.docID); - out.writeVLong(state.idVersion); + if (absolute) { + out.writeVLong(state.idVersion); + } else { + long delta = state.idVersion - lastEncodedVersion; + out.writeVLong(BitUtil.zigZagEncode(delta)); + } + lastEncodedVersion = state.idVersion; } @Override diff --git a/lucene/sandbox/src/test/org/apache/lucene/codecs/idversion/TestIDVersionPostingsFormat.java b/lucene/sandbox/src/test/org/apache/lucene/codecs/idversion/TestIDVersionPostingsFormat.java index 1f8f7d2d664..9ff8acef959 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/codecs/idversion/TestIDVersionPostingsFormat.java +++ b/lucene/sandbox/src/test/org/apache/lucene/codecs/idversion/TestIDVersionPostingsFormat.java @@ -29,7 +29,11 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CountDownLatch; import java.util.concurrent.atomic.AtomicLong; +import org.apache.lucene.analysis.Analyzer.TokenStreamComponents; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.codecs.idversion.StringAndPayloadField.SingleTokenWithPayloadTokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -155,7 +159,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase { int upto; @Override public String next() { - return Long.toString(random().nextLong() & 0x7ffffffffffffffL, radix); + return Long.toString(random().nextLong() & 0x3ffffffffffffffL, radix); } }; break; @@ -170,7 +174,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase { int upto; @Override public String next() { - return Long.toString(random().nextLong() & 0x7ffffffffffffffL, radix); + return Long.toString(random().nextLong() & 0x3ffffffffffffffL, radix); } }; break; @@ -225,7 +229,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase { if (useMonotonicVersion) { version += TestUtil.nextInt(random(), 1, 10); } else { - version = random().nextLong() & 0x7fffffffffffffffL; + version = random().nextLong() & 0x3fffffffffffffffL; } idValues.put(idValue, version); if (VERBOSE) { @@ -243,7 +247,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase { if (useMonotonicVersion) { version += TestUtil.nextInt(random(), 1, 10); } else { - version = random().nextLong() & 0x7fffffffffffffffL; + version = random().nextLong() & 0x3fffffffffffffffL; } doc = new Document(); doc.add(makeIDField(idValue, version)); @@ -362,8 +366,8 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase { w.addDocument(doc); doc = new Document(); doc.add(makeIDField("id", 17)); - w.addDocument(doc); try { + w.addDocument(doc); w.commit(); fail("didn't hit expected exception"); } catch (IllegalArgumentException iae) { @@ -445,7 +449,18 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase { public void testMissingPayload() throws Exception { Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + + // MockAnalyzer minus maybePayload else it sometimes stuffs in an 8-byte payload! + Analyzer a = new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName) { + MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true, 100); + tokenizer.setEnableChecks(true); + MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET); + return new TokenStreamComponents(tokenizer, filt); + } + }; + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, a); iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat())); RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); Document doc = new Document(); @@ -567,6 +582,36 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase { dir.close(); } + public void testInvalidVersions() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat())); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + Document doc = new Document(); + // -1 + doc.add(new StringAndPayloadField("id", "id", new BytesRef(new byte[] {(byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff}))); + try { + w.addDocument(doc); + w.commit(); + fail("didn't hit expected exception"); + } catch (IllegalArgumentException iae) { + // expected + } + + doc = new Document(); + // Long.MAX_VALUE: + doc.add(new StringAndPayloadField("id", "id", new BytesRef(new byte[] {(byte)0x7f, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff}))); + try { + w.addDocument(doc); + w.commit(); + fail("didn't hit expected exception"); + } catch (IllegalArgumentException iae) { + // expected + } + w.close(); + dir.close(); + } + // Simulates optimistic concurrency in a distributed indexing app and confirms the latest version always wins: public void testGlobalVersions() throws Exception { Directory dir = newDirectory(); @@ -576,7 +621,6 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase { IDSource idsSource = getRandomIDs(); int numIDs = atLeast(100); - System.out.println("ids=" + numIDs); if (VERBOSE) { System.out.println("TEST: " + numIDs + " ids"); } @@ -649,7 +693,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase { long newVersion; if (versionType == 0) { // Random: - newVersion = random().nextLong() & 0x7fffffffffffffffL; + newVersion = random().nextLong() & 0x3fffffffffffffffL; } else if (versionType == 1) { // Monotonic newVersion = nextVersion.getAndIncrement();