diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java
index d25ef234599..8b6fd4e46d0 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java
@@ -80,6 +80,7 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase {
     lastDocID = docID;
     lastPosition = -1;
+    lastVersion = -1;
   }
 
   @Override
@@ -94,7 +95,7 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase {
     if (payload.length != 8) {
       throw new IllegalArgumentException("payload.length != 8 (got " + payload.length + ")");
     }
-    
+
     lastVersion = IDVersionPostingsFormat.bytesToLong(payload);
     if (lastVersion < 0) {
       throw new IllegalArgumentException("version must be >= 0 (got: " + lastVersion + "; payload=" + payload + ")");
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java
index fffbffbbf69..86e4158c500 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java
@@ -230,15 +230,13 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
     return ((IDVersionTermState) currentFrame.state).idVersion;
   }
 
-  /** Returns false if the term deos not exist, or it exists but its version is too old (< minIDVersion). */
+  /** Returns false if the term does not exist, or it exists but its version is too old (< minIDVersion). */
   public boolean seekExact(final BytesRef target, long minIDVersion) throws IOException {
 
     if (fr.index == null) {
       throw new IllegalStateException("terms index was not loaded");
     }
 
-    // nocommit would be nice if somehow on doing deletes we didn't have to double-lookup again...
-
     if (term.bytes.length <= target.length) {
       term.bytes = ArrayUtil.grow(term.bytes, 1+target.length);
     }
@@ -260,7 +258,7 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
 
     boolean changed = false;
 
-    // nocommit we could stop earlier w/ the version check, every time we traverse an index arc we can check?
+    // TODO: we could stop earlier w/ the version check, every time we traverse an index arc we can check?
 
     if (currentFrame != staticFrame) {
@@ -380,7 +378,7 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
           }
           return false;
         }
-        System.out.println("  term version=" + ((IDVersionTermState) currentFrame.state).idVersion + " frame version=" + currentFrame.maxIDVersion + " frame ord=" + currentFrame.ord);
+        // System.out.println("  term version=" + ((IDVersionTermState) currentFrame.state).idVersion + " frame version=" + currentFrame.maxIDVersion + " frame ord=" + currentFrame.ord);
 
         if (DEBUG) {
           System.out.println("  target is same as current; return true");
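For illustration, a caller of the version-aware seekExact might look like the sketch below. The names are hypothetical: it assumes the enum comes from a field indexed with IDVersionPostingsFormat, and that getVersion() is the accessor returning idVersion shown above (name assumed).

    // Sketch: optimistic-concurrency lookup of a primary key with a minimum version.
    // "terms" is a hypothetical Terms instance for the "id" field.
    IDVersionSegmentTermsEnum te = (IDVersionSegmentTermsEnum) terms.iterator(null);
    BytesRef id = new BytesRef("user-17");   // hypothetical id term
    long minIDVersion = 42;                  // only accept versions >= 42
    if (te.seekExact(id, minIDVersion)) {
      long version = te.getVersion();        // version of the matched term (accessor name assumed)
      // id exists and is new enough
    } else {
      // id is absent, or every copy of it is older than minIDVersion
    }
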
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java
index 18ff0236b55..cbf73841719 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java
@@ -220,11 +220,10 @@ final class IDVersionSegmentTermsEnumFrame {
   }
 
   void rewind() {
+    // Force reload:
     fp = fpOrig;
     nextEnt = -1;
-    // nocommit move to BT too?
-    //state.termBlockOrd = 0;
     hasTerms = hasTermsOrig;
     if (isFloor) {
       floorDataReader.rewind();
@@ -390,8 +389,7 @@ final class IDVersionSegmentTermsEnumFrame {
 
   public void decodeMetaData() throws IOException {
 
-    //if (DEBUG) System.out.println("\nBTTR.decodeMetadata seg=" + segment + " mdUpto=" + metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd);
-    System.out.println("\nBTTR.decodeMetadata seg=" + ste.fr.parent.segment + " mdUpto=" + metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd);
+    //if (DEBUG) System.out.println("\nBTTR.decodeMetadata seg=" + ste.fr.parent.segment + " mdUpto=" + metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd);
 
     assert nextEnt >= 0;
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
index 6417f5a6bfb..6ce8e5c9cd4 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
@@ -136,9 +136,7 @@ final class VersionBlockTreeTermsReader extends FieldsProducer {
       }
 
       // verify
-      if (version >= VersionBlockTreeTermsWriter.VERSION_CHECKSUM) {
-        CodecUtil.checksumEntireFile(indexIn);
-      }
+      CodecUtil.checksumEntireFile(indexIn);
 
       // Have PostingsReader init itself
       postingsReader.init(in);
@@ -167,15 +165,10 @@ final class VersionBlockTreeTermsReader extends FieldsProducer {
         final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
         final long sumDocFreq = in.readVLong();
         final int docCount = in.readVInt();
-        final int longsSize = version >= VersionBlockTreeTermsWriter.VERSION_META_ARRAY ? in.readVInt() : 0;
+        final int longsSize = in.readVInt();
 
-        BytesRef minTerm, maxTerm;
-        if (version >= VersionBlockTreeTermsWriter.VERSION_MIN_MAX_TERMS) {
-          minTerm = readBytesRef(in);
-          maxTerm = readBytesRef(in);
-        } else {
-          minTerm = maxTerm = null;
-        }
+        BytesRef minTerm = readBytesRef(in);
+        BytesRef maxTerm = readBytesRef(in);
         if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs
           throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
         }
@@ -217,9 +210,6 @@ final class VersionBlockTreeTermsReader extends FieldsProducer {
     int version = CodecUtil.checkHeader(input, VersionBlockTreeTermsWriter.TERMS_CODEC_NAME, VersionBlockTreeTermsWriter.VERSION_START, VersionBlockTreeTermsWriter.VERSION_CURRENT);
-    if (version < VersionBlockTreeTermsWriter.VERSION_APPEND_ONLY) {
-      dirOffset = input.readLong();
-    }
     return version;
   }
 
@@ -228,22 +218,14 @@ final class VersionBlockTreeTermsReader extends FieldsProducer {
     int version = CodecUtil.checkHeader(input, VersionBlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME, VersionBlockTreeTermsWriter.VERSION_START, VersionBlockTreeTermsWriter.VERSION_CURRENT);
-    if (version < VersionBlockTreeTermsWriter.VERSION_APPEND_ONLY) {
-      indexDirOffset = input.readLong();
-    }
     return version;
   }
 
   /** Seek {@code input} to the directory offset. */
   private void seekDir(IndexInput input, long dirOffset) throws IOException {
-    if (version >= VersionBlockTreeTermsWriter.VERSION_CHECKSUM) {
-      input.seek(input.length() - CodecUtil.footerLength() - 8);
-      dirOffset = input.readLong();
-    } else if (version >= VersionBlockTreeTermsWriter.VERSION_APPEND_ONLY) {
-      input.seek(input.length() - 8);
-      dirOffset = input.readLong();
-    }
+    input.seek(input.length() - CodecUtil.footerLength() - 8);
+    dirOffset = input.readLong();
     input.seek(dirOffset);
   }
 
@@ -306,12 +288,10 @@ final class VersionBlockTreeTermsReader extends FieldsProducer {
 
   @Override
   public void checkIntegrity() throws IOException {
-    if (version >= VersionBlockTreeTermsWriter.VERSION_CHECKSUM) {
-      // term dictionary
-      CodecUtil.checksumEntireFile(in);
+    // term dictionary
+    CodecUtil.checksumEntireFile(in);
 
-      // postings
-      postingsReader.checkIntegrity();
-    }
+    // postings
+    postingsReader.checkIntegrity();
   }
 }
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java
index b1e66491e36..9c515319bb1 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java
@@ -84,118 +84,19 @@ import org.apache.lucene.util.packed.PackedInts;
  */
 
 /**
- * Block-based terms index and dictionary writer.
- *
- * Writes terms dict and index, block-encoding (column
- * stride) each term's metadata for each set of terms
- * between two index terms.
- *
- * Files:
- *
- * Term Dictionary
- *
+ * This is just like {@link BlockTreeTermsWriter}, except it also stores a version per term, and adds a method to its TermsEnum
+ * implementation to seekExact only if the version is >= the specified version.  The version is added to the terms index to avoid seeking if
+ * no term in the block has a high enough version.  The term blocks file is .tiv and the terms index extension is .tipv.
  *
- * The .tim file contains the list of terms in each
- * field along with per-term statistics (such as docfreq)
- * and per-term metadata (typically pointers to the postings list
- * for that term in the inverted index).
- *
- * The .tim is arranged in blocks: with blocks containing
- * a variable number of entries (by default 25-48), where
- * each entry is either a term or a reference to a
- * sub-block.
- *
- * NOTE: The term dictionary can plug into different postings implementations:
- * the postings writer/reader are actually responsible for encoding
- * and decoding the Postings Metadata and Term Metadata sections.
- *
- * Notes:
- *
- * Term Index
- *
- * The .tip file contains an index into the term dictionary, so that it can be
- * accessed randomly.  The index is also used to determine
- * when a given term cannot exist on disk (in the .tim file), saving a disk seek.
- *
- * Notes:
- *
- * @see BlockTreeTermsReader
  * @lucene.experimental
  */
-// nocommit fix jdocs
+
 final class VersionBlockTreeTermsWriter extends FieldsConsumer {
 
+  private static boolean DEBUG = IDVersionSegmentTermsEnum.DEBUG;
+
   static final PairOutputs<BytesRef,Long> FST_OUTPUTS = new PairOutputs<>(ByteSequenceOutputs.getSingleton(),
-                                                                          PositiveIntOutputs.getSingleton());
+                                                                         PositiveIntOutputs.getSingleton());
 
   static final Pair<BytesRef,Long> NO_OUTPUT = FST_OUTPUTS.getNoOutput();
@@ -224,25 +125,11 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
 
   /** Initial terms format. */
   public static final int VERSION_START = 0;
 
-  // nocommit nuke all these old versions
-
-  /** Append-only */
-  public static final int VERSION_APPEND_ONLY = 1;
-
-  /** Meta data as array */
-  public static final int VERSION_META_ARRAY = 2;
-
-  /** checksums */
-  public static final int VERSION_CHECKSUM = 3;
-
-  /** min/max term */
-  public static final int VERSION_MIN_MAX_TERMS = 4;
-
   /** Current terms format. */
-  public static final int VERSION_CURRENT = VERSION_MIN_MAX_TERMS;
+  public static final int VERSION_CURRENT = VERSION_START;
 
   /** Extension of terms index file */
-  static final String TERMS_INDEX_EXTENSION = "tip";
+  static final String TERMS_INDEX_EXTENSION = "tipv";
 
   final static String TERMS_INDEX_CODEC_NAME = "VERSION_BLOCK_TREE_TERMS_INDEX";
 
   private final IndexOutput out;
@@ -297,7 +184,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
                                     int maxItemsInBlock)
     throws IOException
   {
-    System.out.println("VBTTW minItemsInBlock=" + minItemsInBlock + " maxItemsInBlock=" + maxItemsInBlock);
     if (minItemsInBlock <= 1) {
       throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock);
     }
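The FST_OUTPUTS pair above is what threads versions through the terms index: the BytesRef half is the ordinary index output, and the Long half carries the maximum version below that index node, which is what lets the version-aware seekExact give up without visiting the block. A small sketch with made-up values (not code from this patch):

    // Sketch: an index output pairing block bytes with the max version in that block.
    PairOutputs<BytesRef,Long> outputs =
        new PairOutputs<>(ByteSequenceOutputs.getSingleton(), PositiveIntOutputs.getSingleton());
    PairOutputs.Pair<BytesRef,Long> indexOutput =
        outputs.newPair(new BytesRef("block-data"), 42L);  // 42 = max version under this node
    // A reader comparing minIDVersion against indexOutput.output2 can stop early
    // when no term in the block can have a high enough version.
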
@@ -626,8 +512,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
     // following floor blocks:
     void writeBlocks(IntsRef prevTerm, int prefixLength, int count) throws IOException {
-      // nocommit why can't we do floor blocks for root frame?
-      if (prefixLength == 0 || count <= maxItemsInBlock) {
+      if (count <= maxItemsInBlock) {
         // Easy case: not floor block.  Eg, prefix is "foo",
         // and we found 30 terms/sub-blocks starting w/ that
         // prefix, and minItemsInBlock <= 30 <=
@@ -645,7 +530,7 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
 
       // TODO: we could store min & max suffix start byte
      // in each block, to make floor blocks authoritative
 
-      //if (DEBUG) {
+      if (DEBUG) {
         final BytesRef prefix = new BytesRef(prefixLength);
         for(int m=0;m<prefixLength;m++) {
diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/idversion/TestIDVersionPostingsFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/idversion/TestIDVersionPostingsFormat.java
--- a/lucene/codecs/src/test/org/apache/lucene/codecs/idversion/TestIDVersionPostingsFormat.java
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/idversion/TestIDVersionPostingsFormat.java
+          if (VERBOSE) {
+            System.out.println("  update " + idValue + " -> " + version);
+          }
+          w.updateDocument(new Term("id", idValue), doc);
+          idValues.put(idValue, version);
+        } else {
+          if (VERBOSE) {
+            System.out.println("  delete " + idValue);
+          }
+          w.deleteDocuments(new Term("id", idValue));
+          idValues.remove(idValue);
+        }
+      }
+      docUpto++;
     }
 
-    //IndexReader r = w.getReader();
-    IndexReader r = DirectoryReader.open(w, true);
+    IndexReader r = w.getReader();
+    //IndexReader r = DirectoryReader.open(w, true);
 
     PerThreadVersionPKLookup lookup = new PerThreadVersionPKLookup(r, "id");
 
     List<Map.Entry<String,Long>> idValuesList = new ArrayList<>(idValues.entrySet());
@@ -242,7 +271,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
 
       String idValue;
       if (random().nextBoolean()) {
-        idValue = idValuesList.get(random().nextInt(numDocs)).getKey();
+        idValue = idValuesList.get(random().nextInt(idValuesList.size())).getKey();
       } else if (random().nextBoolean()) {
         idValue = ids.next();
       } else {
@@ -318,14 +347,6 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
     payload.length = 8;
     IDVersionPostingsFormat.longToBytes(version, payload);
     return new StringAndPayloadField("id", id, payload);
-
-    /*
-    Field field = newTextField("id", "", Field.Store.NO);
-    Token token = new Token(id, 0, id.length());
-    token.setPayload(payload);
-    field.setTokenStream(new CannedTokenStream(token));
-    return field;
-    */
   }
 
   public void testMoreThanOneDocPerIDOneSegment() throws Exception {
@@ -353,6 +374,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
     Directory dir = newDirectory();
     IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
     iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat()));
+    iwc.setMergePolicy(new TieredMergePolicy());
     MergeScheduler ms = iwc.getMergeScheduler();
     if (ms instanceof ConcurrentMergeScheduler) {
       iwc.setMergeScheduler(new ConcurrentMergeScheduler() {
@@ -362,7 +384,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
         }
       });
     }
-    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+    IndexWriter w = new IndexWriter(dir, iwc);
     Document doc = new Document();
     doc.add(makeIDField("id", 17));
     w.addDocument(doc);
@@ -380,7 +402,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase {
       // expected
       assertTrue(ioe.getCause() instanceof IllegalArgumentException);
     }
-    w.w.close();
+    w.close();
     dir.close();
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java
index f656450de4e..8cd194f1141 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java
@@ -37,7 +37,7 @@ import org.apache.lucene.util.Bits;
  * time.
  * @lucene.experimental */
-// nocommit mv under blocktree? but ... it's used by others (e.g. block terms)
+// TODO: maybe move under blocktree? but it's used by other terms dicts (e.g. Block)
 
 // TODO: find a better name; this defines the API that the
 // terms dict impls use to talk to a postings impl.
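The 8-byte payload that IDVersionPostingsWriter validates is produced by the longToBytes/bytesToLong helpers that makeIDField uses above; a minimal round-trip sketch:

    // Sketch: encode a version into the fixed 8-byte payload and decode it back.
    BytesRef payload = new BytesRef(8);
    payload.length = 8;                                    // the writer rejects payload.length != 8
    IDVersionPostingsFormat.longToBytes(17L, payload);
    long version = IDVersionPostingsFormat.bytesToLong(payload);
    assert version == 17L;                                 // the writer also rejects version < 0
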
diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
index 10682263269..b9a995fa632 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -1640,22 +1640,15 @@ public class CheckIndex {
         // Again, with the one doc deleted:
         checkFields(tfv, onlyDocIsDeleted, 1, fieldInfos, false, true, infoStream, verbose);
 
-        // Only agg stats if the doc is live:
-        final boolean doStats = liveDocs == null || liveDocs.get(j);
-
-        if (doStats == false) {
-          // nocommit is it OK to stop verifying deleted docs?
+        if (liveDocs != null && liveDocs.get(j) == false) {
+          // Only check live docs
           continue;
         }
 
-        if (doStats) {
-          status.docCount++;
-        }
+        status.docCount++;
 
         for(String field : tfv) {
-          if (doStats) {
-            status.totVectors++;
-          }
+          status.totVectors++;
 
           // Make sure FieldInfo thinks this field is vector'd:
           final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
index 89f54420898..4d97749937e 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
@@ -94,6 +94,8 @@ final class DefaultIndexingChain extends DocConsumer {
     // aborting on any exception from this method
 
     int numDocs = state.segmentInfo.getDocCount();
+
+    // TODO: we could set liveDocs earlier and then fix DVs to also not write deleted docs:
     writeNorms(state);
     writeDocValues(state);
diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
index edd519386d1..5ba48d272bf 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
@@ -34,7 +34,6 @@ final class FreqProxTermsWriter extends TermsHash {
   }
 
   private void applyDeletes(SegmentWriteState state, Fields fields) throws IOException {
-    System.out.println("applyDeletes segUpdates=" + state.segUpdates);
 
     // Process any pending Term deletes for this newly
     // flushed segment:
@@ -108,8 +107,6 @@ final class FreqProxTermsWriter extends TermsHash {
       fields.setLiveDocs(state.liveDocs);
     }
 
-    System.out.println("now: " + state.liveDocs + " pf=" + state.segmentInfo.getCodec().postingsFormat());
-
     FieldsConsumer consumer = state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state);
     boolean success = false;
     try {
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
index 6cde6692013..9f99287af8c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
@@ -1696,7 +1696,6 @@ public class IndexWriter implements Closeable, TwoPhaseCommit{
     if (doWait) {
       synchronized(this) {
         while(true) {
-
           if (hitOOM) {
             throw new IllegalStateException("this writer hit an OutOfMemoryError; cannot complete forceMerge");
           }
diff --git a/lucene/misc/src/test/org/apache/lucene/uninverting/TestFieldCacheVsDocValues.java b/lucene/misc/src/test/org/apache/lucene/uninverting/TestFieldCacheVsDocValues.java
index c3195fcb738..e660ab2d331 100644
--- a/lucene/misc/src/test/org/apache/lucene/uninverting/TestFieldCacheVsDocValues.java
+++ b/lucene/misc/src/test/org/apache/lucene/uninverting/TestFieldCacheVsDocValues.java
@@ -17,8 +17,7 @@ package org.apache.lucene.uninverting;
  * limitations under the License.
  */
 
-import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
-
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -47,14 +46,17 @@ import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.index.StoredDocument;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.TermsEnum.SeekStatus;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
 
+import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
+
 public class TestFieldCacheVsDocValues extends LuceneTestCase {
 
   public void testByteMissingVsFieldCache() throws Exception {
@@ -315,14 +317,11 @@ public class TestFieldCacheVsDocValues extends LuceneTestCase {
     }
 
     // delete some docs
-    // nocommit hmmm what to do
-    /*
     int numDeletions = random().nextInt(numDocs/10);
     for (int i = 0; i < numDeletions; i++) {
       int id = random().nextInt(numDocs);
       writer.deleteDocuments(new Term("id", Integer.toString(id)));
     }
-    */
     writer.shutdown();
 
     // compare
@@ -331,7 +330,7 @@ public class TestFieldCacheVsDocValues extends LuceneTestCase {
       AtomicReader r = context.reader();
       SortedDocValues expected = FieldCache.DEFAULT.getTermsIndex(r, "indexed");
       SortedDocValues actual = r.getSortedDocValues("dv");
-      assertEquals(r.maxDoc(), expected, actual);
+      assertEquals(r.maxDoc(), r.getLiveDocs(), expected, actual);
     }
     ir.close();
     dir.close();
@@ -382,14 +381,11 @@ public class TestFieldCacheVsDocValues extends LuceneTestCase {
     }
 
     // delete some docs
-    // nocommit hmmm what to do
-    /*
     int numDeletions = random().nextInt(numDocs/10);
     for (int i = 0; i < numDeletions; i++) {
       int id = random().nextInt(numDocs);
       writer.deleteDocuments(new Term("id", Integer.toString(id)));
     }
-    */
 
     // compare per-segment
     DirectoryReader ir = writer.getReader();
@@ -397,7 +393,7 @@ public class TestFieldCacheVsDocValues extends LuceneTestCase {
       AtomicReader r = context.reader();
       SortedSetDocValues expected = FieldCache.DEFAULT.getDocTermOrds(r, "indexed", null);
       SortedSetDocValues actual = r.getSortedSetDocValues("dv");
-      assertEquals(r.maxDoc(), expected, actual);
+      assertEquals(r.maxDoc(), r.getLiveDocs(), expected, actual);
     }
     ir.close();
 
@@ -408,7 +404,7 @@ public class TestFieldCacheVsDocValues extends LuceneTestCase {
     AtomicReader ar = getOnlySegmentReader(ir);
     SortedSetDocValues expected = FieldCache.DEFAULT.getDocTermOrds(ar, "indexed", null);
     SortedSetDocValues actual = ar.getSortedSetDocValues("dv");
-    assertEquals(ir.maxDoc(), expected, actual);
+    assertEquals(ir.maxDoc(), ar.getLiveDocs(), expected, actual);
     ir.close();
 
     writer.shutdown();
@@ -449,14 +445,11 @@ public class TestFieldCacheVsDocValues extends LuceneTestCase {
     }
 
     // delete some docs
-    // nocommit hmmm what to do
-    /*
     int numDeletions = random().nextInt(numDocs/10);
     for (int i = 0; i < numDeletions; i++) {
       int id = random().nextInt(numDocs);
       writer.deleteDocuments(new Term("id", Integer.toString(id)));
     }
-    */
 
     // merge some segments and ensure that at least one of them has more than
     // 256 values
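In the rewritten comparison below, ords can no longer be compared directly once deletes are in play: a merge can drop values referenced only by deleted docs, renumbering the surviving ords on one side. So the test records which ords are reachable from live docs on each side and walks the two dictionaries set bit by set bit. The lockstep walk, reduced to a sketch with hypothetical bitsets live1/live2:

    // Sketch: pair up the set bits of two FixedBitSets in order.
    int e = live1.nextSetBit(0);
    int a = live2.nextSetBit(0);
    while (e != -1 && a != -1) {
      // ...compare the value for ord e on one side with ord a on the other...
      e = (e + 1 < live1.length()) ? live1.nextSetBit(e + 1) : -1;
      a = (a + 1 < live2.length()) ? live2.nextSetBit(a + 1) : -1;
    }
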
@@ -496,102 +489,149 @@ public class TestFieldCacheVsDocValues extends LuceneTestCase {
     }
   }
 
-  private void assertEquals(int maxDoc, SortedDocValues expected, SortedDocValues actual) throws Exception {
-    assertEquals(maxDoc, DocValues.singleton(expected), DocValues.singleton(actual));
+  private void assertEquals(int maxDoc, Bits liveDocs, SortedDocValues expected, SortedDocValues actual) throws Exception {
+    assertEquals(maxDoc, liveDocs, DocValues.singleton(expected), DocValues.singleton(actual));
   }
 
-  private void assertEquals(int maxDoc, SortedSetDocValues expected, SortedSetDocValues actual) throws Exception {
+  private void assertEquals(int maxDoc, Bits liveDocs, SortedSetDocValues expected, SortedSetDocValues actual) throws Exception {
     // can be null for the segment if no docs actually had any SortedDocValues
     // in this case FC.getDocTermsOrds returns EMPTY
     if (actual == null) {
       assertEquals(DocValues.EMPTY_SORTED_SET, expected);
       return;
     }
-    assertEquals(expected.getValueCount(), actual.getValueCount());
-    // compare ord lists
+
+    FixedBitSet liveOrdsExpected = new FixedBitSet((int) expected.getValueCount());
+    FixedBitSet liveOrdsActual = new FixedBitSet((int) actual.getValueCount());
+
+    BytesRef expectedBytes = new BytesRef();
+    BytesRef actualBytes = new BytesRef();
+
+    // compare values for all live docs:
     for (int i = 0; i < maxDoc; i++) {
+      if (liveDocs != null && liveDocs.get(i) == false) {
+        // Don't check deleted docs
+        continue;
+      }
      expected.setDocument(i);
       actual.setDocument(i);
       long expectedOrd;
       while ((expectedOrd = expected.nextOrd()) != NO_MORE_ORDS) {
-        assertEquals(expectedOrd, actual.nextOrd());
+        expected.lookupOrd(expectedOrd, expectedBytes);
+        long actualOrd = actual.nextOrd();
+        assertTrue(actualOrd != NO_MORE_ORDS);
+        actual.lookupOrd(actualOrd, actualBytes);
+        assertEquals(expectedBytes, actualBytes);
+        liveOrdsExpected.set((int) expectedOrd);
+        liveOrdsActual.set((int) actualOrd);
       }
+      assertEquals(NO_MORE_ORDS, actual.nextOrd());
     }
+
+    // Make sure both have same number of non-deleted values:
+    assertEquals(liveOrdsExpected.cardinality(), liveOrdsActual.cardinality());
 
     // compare ord dictionary
-    BytesRef expectedBytes = new BytesRef();
-    BytesRef actualBytes = new BytesRef();
-    for (long i = 0; i < expected.getValueCount(); i++) {
-      expected.lookupTerm(expectedBytes);
-      actual.lookupTerm(actualBytes);
+    int expectedOrd = 0;
+    int actualOrd = 0;
+    while (expectedOrd < expected.getValueCount()) {
+      expectedOrd = liveOrdsExpected.nextSetBit(expectedOrd);
+      if (expectedOrd == -1) {
+        break;
+      }
+      actualOrd = liveOrdsActual.nextSetBit(actualOrd);
+      expected.lookupOrd(expectedOrd, expectedBytes);
+      actual.lookupOrd(actualOrd, actualBytes);
       assertEquals(expectedBytes, actualBytes);
+      expectedOrd++;
+      actualOrd++;
     }
+    assertTrue(actualOrd == actual.getValueCount() || liveOrdsActual.nextSetBit(actualOrd) == -1);
 
     // compare termsenum
-    assertEquals(expected.getValueCount(), expected.termsEnum(), actual.termsEnum());
+    assertEquals(expected.getValueCount(), expected.termsEnum(), liveOrdsExpected, actual.termsEnum(), liveOrdsActual);
   }
-
-  private void assertEquals(long numOrds, TermsEnum expected, TermsEnum actual) throws Exception {
+
+  /** Does termsEnum.next() but then skips over deleted ords. */
+  private static BytesRef next(TermsEnum termsEnum, Bits liveOrds) throws IOException {
+    while (termsEnum.next() != null) {
+      if (liveOrds.get((int) termsEnum.ord())) {
+        return termsEnum.term();
+      }
+    }
+    return null;
+  }
+
+  /** Does termsEnum.seekCeil() but then skips over deleted ords. */
+  private static SeekStatus seekCeil(TermsEnum termsEnum, BytesRef term, Bits liveOrds) throws IOException {
+    SeekStatus status = termsEnum.seekCeil(term);
+    if (status == SeekStatus.END) {
+      return status;
+    } else {
+      if (liveOrds.get((int) termsEnum.ord()) == false) {
+        while (termsEnum.next() != null) {
+          if (liveOrds.get((int) termsEnum.ord())) {
+            return SeekStatus.NOT_FOUND;
+          }
+        }
+        return SeekStatus.END;
+      } else {
+        return status;
+      }
+    }
+  }
+
+  private void assertEquals(long numOrds, TermsEnum expected, Bits liveOrdsExpected, TermsEnum actual, Bits liveOrdsActual) throws Exception {
     BytesRef ref;
 
     // sequential next() through all terms
-    while ((ref = expected.next()) != null) {
-      assertEquals(ref, actual.next());
-      assertEquals(expected.ord(), actual.ord());
-      assertEquals(expected.term(), actual.term());
-    }
-    assertNull(actual.next());
-
-    // sequential seekExact(ord) through all terms
-    for (long i = 0; i < numOrds; i++) {
-      expected.seekExact(i);
-      actual.seekExact(i);
-      assertEquals(expected.ord(), actual.ord());
+    while ((ref = next(expected, liveOrdsExpected)) != null) {
+      assertEquals(ref, next(actual, liveOrdsActual));
       assertEquals(expected.term(), actual.term());
     }
+    assertNull(next(actual, liveOrdsActual));
 
     // sequential seekExact(BytesRef) through all terms
     for (long i = 0; i < numOrds; i++) {
+      if (liveOrdsExpected.get((int) i) == false) {
+        continue;
+      }
       expected.seekExact(i);
       assertTrue(actual.seekExact(expected.term()));
-      assertEquals(expected.ord(), actual.ord());
       assertEquals(expected.term(), actual.term());
     }
 
     // sequential seekCeil(BytesRef) through all terms
     for (long i = 0; i < numOrds; i++) {
+      if (liveOrdsExpected.get((int) i) == false) {
+        continue;
+      }
       expected.seekExact(i);
       assertEquals(SeekStatus.FOUND, actual.seekCeil(expected.term()));
-      assertEquals(expected.ord(), actual.ord());
-      assertEquals(expected.term(), actual.term());
-    }
-
-    // random seekExact(ord)
-    for (long i = 0; i < numOrds; i++) {
-      long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1);
-      expected.seekExact(randomOrd);
-      actual.seekExact(randomOrd);
-      assertEquals(expected.ord(), actual.ord());
       assertEquals(expected.term(), actual.term());
     }
 
     // random seekExact(BytesRef)
     for (long i = 0; i < numOrds; i++) {
       long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1);
+      if (liveOrdsExpected.get((int) randomOrd) == false) {
+        continue;
+      }
       expected.seekExact(randomOrd);
       actual.seekExact(expected.term());
-      assertEquals(expected.ord(), actual.ord());
       assertEquals(expected.term(), actual.term());
     }
 
     // random seekCeil(BytesRef)
     for (long i = 0; i < numOrds; i++) {
+      if (liveOrdsExpected.get((int) i) == false) {
+        continue;
+      }
       BytesRef target = new BytesRef(TestUtil.randomUnicodeString(random()));
-      SeekStatus expectedStatus = expected.seekCeil(target);
-      assertEquals(expectedStatus, actual.seekCeil(target));
+      SeekStatus expectedStatus = seekCeil(expected, target, liveOrdsExpected);
+      assertEquals(expectedStatus, seekCeil(actual, target, liveOrdsActual));
       if (expectedStatus != SeekStatus.END) {
-        assertEquals(expected.ord(), actual.ord());
         assertEquals(expected.term(), actual.term());
       }
     }
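A usage note on the seekCeil wrapper above: when the raw seekCeil lands on a deleted ord, the wrapped status can only weaken, to NOT_FOUND if a later live term exists or END if none does, so a FOUND result always sits on a live ord. A tiny check sketch (hypothetical termsEnum/liveOrds):

    SeekStatus status = seekCeil(termsEnum, new BytesRef("m"), liveOrds);
    assert status != SeekStatus.FOUND || liveOrds.get((int) termsEnum.ord());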