diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsFormat.java index 96fccc9ee84..0078a927b62 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsFormat.java @@ -59,7 +59,7 @@ public class IDVersionPostingsFormat extends PostingsFormat { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new IDVersionPostingsWriter(); + PostingsWriterBase postingsWriter = new IDVersionPostingsWriter(state); boolean success = false; try { FieldsConsumer ret = new VersionBlockTreeTermsWriter(state, diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java index 8b6fd4e46d0..50055725888 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java @@ -23,6 +23,7 @@ import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.PushPostingsWriterBase; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.TermState; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; @@ -40,10 +41,16 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase { final static IDVersionTermState emptyState = new IDVersionTermState(); IDVersionTermState lastState; - private int lastDocID; + int lastDocID; private int lastPosition; private long lastVersion; + private final SegmentWriteState state; + + public IDVersionPostingsWriter(SegmentWriteState state) { + this.state = state; + } + @Override public IDVersionTermState newTermState() { return new IDVersionTermState(); @@ -71,6 +78,9 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase { @Override public void startDoc(int docID, int termDocFreq) throws IOException { + if (state.liveDocs != null && state.liveDocs.get(docID) == false) { + return; + } if (lastDocID != -1) { throw new IllegalArgumentException("term appears in more than one document"); } @@ -85,6 +95,10 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase { @Override public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { + if (lastDocID == -1) { + // Doc is deleted; skip it + return; + } if (lastPosition != -1) { throw new IllegalArgumentException("term appears more than once in document"); } @@ -104,6 +118,10 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase { @Override public void finishDoc() throws IOException { + if (lastDocID == -1) { + // Doc is deleted; skip it + return; + } if (lastPosition == -1) { throw new IllegalArgumentException("missing addPosition"); } @@ -112,10 +130,12 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase { /** Called when we are done adding docs to this term */ @Override public void finishTerm(BlockTermState _state) throws IOException { + if (lastDocID == -1) { + return; + } IDVersionTermState state = (IDVersionTermState) _state; assert state.docFreq > 0; - assert lastDocID != -1; state.docID = lastDocID; state.idVersion = lastVersion; } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java index 86e4158c500..0f0c751f39a 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java @@ -985,17 +985,13 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum { @Override public int docFreq() throws IOException { assert !eof; - //if (DEBUG) System.out.println("BTR.docFreq"); - currentFrame.decodeMetaData(); - //if (DEBUG) System.out.println(" return " + currentFrame.state.docFreq); - return currentFrame.state.docFreq; + return 1; } @Override public long totalTermFreq() throws IOException { assert !eof; - currentFrame.decodeMetaData(); - return currentFrame.state.totalTermFreq; + return 1; } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java index cbf73841719..17ac5eec804 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java @@ -51,9 +51,6 @@ final class IDVersionSegmentTermsEnumFrame { byte[] suffixBytes = new byte[128]; final ByteArrayDataInput suffixesReader = new ByteArrayDataInput(); - byte[] statBytes = new byte[64]; - final ByteArrayDataInput statsReader = new ByteArrayDataInput(); - byte[] floorData = new byte[32]; final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); @@ -184,13 +181,6 @@ final class IDVersionSegmentTermsEnumFrame { } }*/ - // stats - numBytes = ste.in.readVInt(); - if (statBytes.length < numBytes) { - statBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; - } - ste.in.readBytes(statBytes, 0, numBytes); - statsReader.reset(statBytes, 0, numBytes); metaDataUpto = 0; state.termBlockOrd = 0; @@ -210,7 +200,6 @@ final class IDVersionSegmentTermsEnumFrame { ste.in.readBytes(bytes, 0, numBytes); bytesReader.reset(bytes, 0, numBytes); - // Sub-blocks of a single floor block are always // written one after another -- tail recurse: fpEnd = ste.in.getFilePointer(); @@ -410,12 +399,9 @@ final class IDVersionSegmentTermsEnumFrame { // just skipN here: // stats - state.docFreq = statsReader.readVInt(); + state.docFreq = 1; + state.totalTermFreq = 1; //if (DEBUG) System.out.println(" dF=" + state.docFreq); - if (ste.fr.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { - state.totalTermFreq = state.docFreq + statsReader.readVLong(); - //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); - } // metadata for (int i = 0; i < ste.fr.longsSize; i++) { longs[i] = bytesReader.readVLong(); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java index 6ce8e5c9cd4..63a47eccf30 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java @@ -162,9 +162,10 @@ final class VersionBlockTreeTermsReader extends FieldsProducer { final Pair rootCode = VersionBlockTreeTermsWriter.FST_OUTPUTS.newPair(code, version); final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); assert fieldInfo != null: "field=" + field; - final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong(); - final long sumDocFreq = in.readVLong(); - final int docCount = in.readVInt(); + final long sumTotalTermFreq = numTerms; + final long sumDocFreq = numTerms; + assert numTerms <= Integer.MAX_VALUE; + final int docCount = (int) numTerms; final int longsSize = in.readVInt(); BytesRef minTerm = readBytesRef(in); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java index 9c515319bb1..59f44ed1cc8 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java @@ -53,6 +53,10 @@ import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.packed.PackedInts; +// nocommit break out the "don't write del docs on flush" + +// nocommit don't write/read stats + /* TODO: @@ -146,14 +150,11 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer { public final Pair rootCode; public final long numTerms; public final long indexStartFP; - public final long sumTotalTermFreq; - public final long sumDocFreq; - public final int docCount; private final int longsSize; public final BytesRef minTerm; public final BytesRef maxTerm; - public FieldMetaData(FieldInfo fieldInfo, Pair rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, + public FieldMetaData(FieldInfo fieldInfo, Pair rootCode, long numTerms, long indexStartFP, int longsSize, BytesRef minTerm, BytesRef maxTerm) { assert numTerms > 0; this.fieldInfo = fieldInfo; @@ -161,9 +162,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer { this.rootCode = rootCode; this.indexStartFP = indexStartFP; this.numTerms = numTerms; - this.sumTotalTermFreq = sumTotalTermFreq; - this.sumDocFreq = sumDocFreq; - this.docCount = docCount; this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; @@ -207,13 +205,13 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer { fieldInfos = state.fieldInfos; this.minItemsInBlock = minItemsInBlock; this.maxItemsInBlock = maxItemsInBlock; - writeHeader(out); + CodecUtil.writeHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT); //DEBUG = state.segmentName.equals("_4a"); final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION); indexOut = state.directory.createOutput(termsIndexFileName, state.context); - writeIndexHeader(indexOut); + CodecUtil.writeHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT); this.postingsWriter = postingsWriter; segment = state.segmentInfo.name; @@ -230,16 +228,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer { this.indexOut = indexOut; } - /** Writes the terms file header. */ - private void writeHeader(IndexOutput out) throws IOException { - CodecUtil.writeHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT); - } - - /** Writes the index file header. */ - private void writeIndexHeader(IndexOutput out) throws IOException { - CodecUtil.writeHeader(out, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT); - } - /** Writes the terms file trailer. */ private void writeTrailer(IndexOutput out, long dirStart) throws IOException { out.writeLong(dirStart); @@ -434,8 +422,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer { private final int longsSize; private long numTerms; final FixedBitSet docsSeen; - long sumTotalTermFreq; - long sumDocFreq; long indexStartFP; // Used only to partition terms into the block tree; we @@ -815,13 +801,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer { suffixWriter.writeVInt(suffix); suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix); - // Write term stats, to separate byte[] blob: - statsWriter.writeVInt(state.docFreq); - if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { - assert state.totalTermFreq >= state.docFreq: state.totalTermFreq + " vs " + state.docFreq; - statsWriter.writeVLong(state.totalTermFreq - state.docFreq); - } - // Write term meta data postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); for (int pos = 0; pos < longsSize; pos++) { @@ -853,13 +832,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer { suffixWriter.writeVInt(suffix<<1); suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix); - // Write term stats, to separate byte[] blob: - statsWriter.writeVInt(state.docFreq); - if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { - assert state.totalTermFreq >= state.docFreq; - statsWriter.writeVLong(state.totalTermFreq - state.docFreq); - } - // TODO: now that terms dict "sees" these longs, // we can explore better column-stride encodings // to encode all long[0]s for this block at @@ -916,11 +888,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer { suffixWriter.writeTo(out); suffixWriter.reset(); - // Write term stats byte[] blob - out.writeVInt((int) statsWriter.getFilePointer()); - statsWriter.writeTo(out); - statsWriter.reset(); - // Write term meta data byte[] blob out.writeVInt((int) metaWriter.getFilePointer()); metaWriter.writeTo(out); @@ -970,11 +937,9 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer { public void write(BytesRef text, TermsEnum termsEnum) throws IOException { BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen); - if (state != null) { + if (state != null && ((IDVersionPostingsWriter) postingsWriter).lastDocID != -1) { assert state.docFreq != 0; assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter; - sumDocFreq += state.docFreq; - sumTotalTermFreq += state.totalTermFreq; blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput()); PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state); @@ -1011,14 +976,9 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer { ((PendingBlock) pending.get(0)).index.getEmptyOutput(), numTerms, indexStartFP, - sumTotalTermFreq, - sumDocFreq, - docsSeen.cardinality(), longsSize, minTerm, maxTerm)); } else { - assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1; - assert sumDocFreq == 0; assert docsSeen.cardinality() == 0; } } @@ -1048,11 +1008,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer { out.writeVInt(field.rootCode.output1.length); out.writeBytes(field.rootCode.output1.bytes, field.rootCode.output1.offset, field.rootCode.output1.length); out.writeVLong(field.rootCode.output2); - if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { - out.writeVLong(field.sumTotalTermFreq); - } - out.writeVLong(field.sumDocFreq); - out.writeVInt(field.docCount); out.writeVInt(field.longsSize); indexOut.writeVLong(field.indexStartFP); writeBytesRef(out, field.minTerm); diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java index f9ebaf3e481..d3f71be6313 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java @@ -37,8 +37,6 @@ import org.apache.lucene.util.BytesRef; class FreqProxFields extends Fields { final Map fields = new LinkedHashMap<>(); - private Bits liveDocs; - public FreqProxFields(List fieldList) { // NOTE: fields are already sorted by field name for(FreqProxTermsWriterPerField field : fieldList) { @@ -46,10 +44,6 @@ class FreqProxFields extends Fields { } } - public void setLiveDocs(Bits liveDocs) { - this.liveDocs = liveDocs; - } - public Iterator iterator() { return fields.keySet().iterator(); } @@ -57,7 +51,7 @@ class FreqProxFields extends Fields { @Override public Terms terms(String field) throws IOException { FreqProxTermsWriterPerField perField = fields.get(field); - return perField == null ? null : new FreqProxTerms(perField, liveDocs); + return perField == null ? null : new FreqProxTerms(perField); } @Override @@ -68,11 +62,9 @@ class FreqProxFields extends Fields { private static class FreqProxTerms extends Terms { final FreqProxTermsWriterPerField terms; - final Bits liveDocs; - public FreqProxTerms(FreqProxTermsWriterPerField terms, Bits liveDocs) { + public FreqProxTerms(FreqProxTermsWriterPerField terms) { this.terms = terms; - this.liveDocs = liveDocs; } @Override @@ -80,9 +72,8 @@ class FreqProxFields extends Fields { FreqProxTermsEnum termsEnum; if (reuse instanceof FreqProxTermsEnum && ((FreqProxTermsEnum) reuse).terms == this.terms) { termsEnum = (FreqProxTermsEnum) reuse; - assert termsEnum.liveDocs == this.liveDocs; } else { - termsEnum = new FreqProxTermsEnum(terms, liveDocs); + termsEnum = new FreqProxTermsEnum(terms); } termsEnum.reset(); return termsEnum; @@ -145,13 +136,11 @@ class FreqProxFields extends Fields { final FreqProxPostingsArray postingsArray; final BytesRef scratch = new BytesRef(); final int numTerms; - final Bits liveDocs; int ord; - public FreqProxTermsEnum(FreqProxTermsWriterPerField terms, Bits liveDocs) { + public FreqProxTermsEnum(FreqProxTermsWriterPerField terms) { this.terms = terms; this.numTerms = terms.bytesHash.size(); - this.liveDocs = liveDocs; sortedTermIDs = terms.sortedTermIDs; assert sortedTermIDs != null; postingsArray = (FreqProxPostingsArray) terms.postingsArray; @@ -239,8 +228,8 @@ class FreqProxFields extends Fields { } @Override - public DocsEnum docs(Bits liveDocsIn, DocsEnum reuse, int flags) { - if (liveDocsIn != null) { + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) { + if (liveDocs != null) { throw new IllegalArgumentException("liveDocs must be null"); } @@ -255,20 +244,18 @@ class FreqProxFields extends Fields { if (reuse instanceof FreqProxDocsEnum) { docsEnum = (FreqProxDocsEnum) reuse; if (docsEnum.postingsArray != postingsArray) { - docsEnum = new FreqProxDocsEnum(terms, postingsArray, liveDocs); - } else { - assert docsEnum.liveDocs == liveDocs; + docsEnum = new FreqProxDocsEnum(terms, postingsArray); } } else { - docsEnum = new FreqProxDocsEnum(terms, postingsArray, liveDocs); + docsEnum = new FreqProxDocsEnum(terms, postingsArray); } docsEnum.reset(sortedTermIDs[ord]); return docsEnum; } @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocsIn, DocsAndPositionsEnum reuse, int flags) { - if (liveDocsIn != null) { + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { + if (liveDocs != null) { throw new IllegalArgumentException("liveDocs must be null"); } FreqProxDocsAndPositionsEnum posEnum; @@ -288,12 +275,10 @@ class FreqProxFields extends Fields { if (reuse instanceof FreqProxDocsAndPositionsEnum) { posEnum = (FreqProxDocsAndPositionsEnum) reuse; if (posEnum.postingsArray != postingsArray) { - posEnum = new FreqProxDocsAndPositionsEnum(terms, postingsArray, liveDocs); - } else { - assert posEnum.liveDocs == liveDocs; + posEnum = new FreqProxDocsAndPositionsEnum(terms, postingsArray); } } else { - posEnum = new FreqProxDocsAndPositionsEnum(terms, postingsArray, liveDocs); + posEnum = new FreqProxDocsAndPositionsEnum(terms, postingsArray); } posEnum.reset(sortedTermIDs[ord]); return posEnum; @@ -326,17 +311,15 @@ class FreqProxFields extends Fields { final FreqProxPostingsArray postingsArray; final ByteSliceReader reader = new ByteSliceReader(); final boolean readTermFreq; - final Bits liveDocs; int docID; int freq; boolean ended; int termID; - public FreqProxDocsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray, Bits liveDocs) { + public FreqProxDocsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray) { this.terms = terms; this.postingsArray = postingsArray; this.readTermFreq = terms.hasFreq; - this.liveDocs = liveDocs; } public void reset(int termID) { @@ -391,10 +374,6 @@ class FreqProxFields extends Fields { assert docID != postingsArray.lastDocIDs[termID]; } - if (liveDocs != null && liveDocs.get(docID) == false) { - continue; - } - return docID; } } @@ -417,7 +396,6 @@ class FreqProxFields extends Fields { final ByteSliceReader reader = new ByteSliceReader(); final ByteSliceReader posReader = new ByteSliceReader(); final boolean readOffsets; - final Bits liveDocs; int docID; int freq; int pos; @@ -429,11 +407,10 @@ class FreqProxFields extends Fields { boolean hasPayload; BytesRef payload = new BytesRef(); - public FreqProxDocsAndPositionsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray, Bits liveDocs) { + public FreqProxDocsAndPositionsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray) { this.terms = terms; this.postingsArray = postingsArray; this.readOffsets = terms.hasOffsets; - this.liveDocs = liveDocs; assert terms.hasProx; assert terms.hasFreq; } @@ -487,9 +464,6 @@ class FreqProxFields extends Fields { posLeft = freq; pos = 0; startOffset = 0; - if (liveDocs != null && liveDocs.get(docID) == false) { - continue; - } return docID; } diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java index 5ba48d272bf..199388d0535 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java @@ -103,10 +103,6 @@ final class FreqProxTermsWriter extends TermsHash { applyDeletes(state, fields); - if (state.liveDocs != null) { - fields.setLiveDocs(state.liveDocs); - } - FieldsConsumer consumer = state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state); boolean success = false; try {