LUCENE-5675: go back to sending deleted docs to PostingsFormat on flush; move 'skip deleted docs' into IDVPF

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5675@1596599 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2014-05-21 15:29:25 +00:00
parent 18d2cfaf9c
commit 2759adc754
8 changed files with 53 additions and 125 deletions
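Reviewer note: before this change, flush filtered deleted documents out of the Fields view handed to every PostingsFormat; this commit reverts that and instead lets a format opt in to skipping deletions itself. A minimal sketch of the opt-in pattern, using a hypothetical class (the real one is IDVersionPostingsWriter, below):

    import org.apache.lucene.index.SegmentWriteState;

    // Hypothetical minimal writer illustrating the pattern this commit adopts:
    // flush now delivers every indexed docID, and the format consults the
    // segment's liveDocs itself to skip deletions.
    final class SkipDeletesSketch {
      private final SegmentWriteState state;
      int lastDocID = -1;  // -1 = no live doc seen yet for the current term

      SkipDeletesSketch(SegmentWriteState state) {
        this.state = state;
      }

      void startDoc(int docID) {
        // liveDocs is null when the segment has no deletions
        if (state.liveDocs != null && state.liveDocs.get(docID) == false) {
          return;  // deleted doc: leave lastDocID at -1 so later calls no-op
        }
        lastDocID = docID;
      }
    }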

lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsFormat.java

@@ -59,7 +59,7 @@ public class IDVersionPostingsFormat extends PostingsFormat {
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase postingsWriter = new IDVersionPostingsWriter();
+    PostingsWriterBase postingsWriter = new IDVersionPostingsWriter(state);
     boolean success = false;
     try {
       FieldsConsumer ret = new VersionBlockTreeTermsWriter(state,

lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java

@@ -23,6 +23,7 @@ import org.apache.lucene.codecs.BlockTermState;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.PushPostingsWriterBase;
 import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.TermState;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.IndexOutput;
@@ -40,10 +41,16 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase {
   final static IDVersionTermState emptyState = new IDVersionTermState();
   IDVersionTermState lastState;

-  private int lastDocID;
+  int lastDocID;
   private int lastPosition;
   private long lastVersion;

+  private final SegmentWriteState state;
+
+  public IDVersionPostingsWriter(SegmentWriteState state) {
+    this.state = state;
+  }
+
   @Override
   public IDVersionTermState newTermState() {
     return new IDVersionTermState();
@@ -71,6 +78,9 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase {
   @Override
   public void startDoc(int docID, int termDocFreq) throws IOException {
+    if (state.liveDocs != null && state.liveDocs.get(docID) == false) {
+      return;
+    }
     if (lastDocID != -1) {
       throw new IllegalArgumentException("term appears in more than one document");
     }
@@ -85,6 +95,10 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase {
   @Override
   public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
+    if (lastDocID == -1) {
+      // Doc is deleted; skip it
+      return;
+    }
     if (lastPosition != -1) {
       throw new IllegalArgumentException("term appears more than once in document");
     }
@@ -104,6 +118,10 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase {
   @Override
   public void finishDoc() throws IOException {
+    if (lastDocID == -1) {
+      // Doc is deleted; skip it
+      return;
+    }
     if (lastPosition == -1) {
       throw new IllegalArgumentException("missing addPosition");
     }
@@ -112,10 +130,12 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase {
   /** Called when we are done adding docs to this term */
   @Override
   public void finishTerm(BlockTermState _state) throws IOException {
+    if (lastDocID == -1) {
+      return;
+    }
     IDVersionTermState state = (IDVersionTermState) _state;
     assert state.docFreq > 0;

-    assert lastDocID != -1;
     state.docID = lastDocID;
     state.idVersion = lastVersion;
   }
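
Note: because this format rejects any term appearing in more than one document, lastDocID == -1 inside addPosition/finishDoc/finishTerm can only mean the term's single document was deleted, so each callback simply returns. A self-contained toy (hypothetical helper, not part of the commit) showing the decision:

    import org.apache.lucene.util.FixedBitSet;

    final class SingleDocTermDemo {
      // Returns the docID to record for a single-doc term, or -1 when that
      // doc is deleted (the caller then drops the term entirely).
      static int liveDocForTerm(int docID, FixedBitSet liveDocs) {
        if (liveDocs != null && liveDocs.get(docID) == false) {
          return -1;
        }
        return docID;
      }

      public static void main(String[] args) {
        FixedBitSet liveDocs = new FixedBitSet(32);
        liveDocs.set(0, 32);  // all docs live...
        liveDocs.clear(17);   // ...except doc 17
        System.out.println(liveDocForTerm(17, liveDocs));  // -1 -> skip term
        System.out.println(liveDocForTerm(3, liveDocs));   // 3  -> keep term
      }
    }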

lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java

@@ -985,17 +985,13 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum {
   @Override
   public int docFreq() throws IOException {
     assert !eof;
-    //if (DEBUG) System.out.println("BTR.docFreq");
-    currentFrame.decodeMetaData();
-    //if (DEBUG) System.out.println("  return " + currentFrame.state.docFreq);
-    return currentFrame.state.docFreq;
+    return 1;
   }

   @Override
   public long totalTermFreq() throws IOException {
     assert !eof;
-    currentFrame.decodeMetaData();
-    return currentFrame.state.totalTermFreq;
+    return 1;
   }

   @Override
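
Note: a surviving ID term matches exactly one document exactly once, so per-term stats are constants; returning 1 here is what allows the frame code below to stop reading the stats blob. A hypothetical invariant check, for illustration only:

    final class IdTermStats {
      // For a unique-ID field every term maps to one (doc, position) pair,
      // hence docFreq == totalTermFreq == 1 for any term the enum can see.
      static void check(int docFreq, long totalTermFreq) {
        assert docFreq == 1 : "id term must match exactly one document";
        assert totalTermFreq == 1 : "id term must occur exactly once";
      }
    }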

lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java

@@ -51,9 +51,6 @@ final class IDVersionSegmentTermsEnumFrame {
   byte[] suffixBytes = new byte[128];
   final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();

-  byte[] statBytes = new byte[64];
-  final ByteArrayDataInput statsReader = new ByteArrayDataInput();
-
   byte[] floorData = new byte[32];
   final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
@@ -184,13 +181,6 @@ final class IDVersionSegmentTermsEnumFrame {
       }
     }*/

-    // stats
-    numBytes = ste.in.readVInt();
-    if (statBytes.length < numBytes) {
-      statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
-    }
-    ste.in.readBytes(statBytes, 0, numBytes);
-    statsReader.reset(statBytes, 0, numBytes);

     metaDataUpto = 0;
     state.termBlockOrd = 0;
@@ -210,7 +200,6 @@ final class IDVersionSegmentTermsEnumFrame {
       ste.in.readBytes(bytes, 0, numBytes);
       bytesReader.reset(bytes, 0, numBytes);
-
     // Sub-blocks of a single floor block are always
     // written one after another -- tail recurse:
     fpEnd = ste.in.getFilePointer();
@@ -410,12 +399,9 @@ final class IDVersionSegmentTermsEnumFrame {
     // just skipN here:

     // stats
-    state.docFreq = statsReader.readVInt();
+    state.docFreq = 1;
+    state.totalTermFreq = 1;
     //if (DEBUG) System.out.println("    dF=" + state.docFreq);
-    if (ste.fr.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
-      state.totalTermFreq = state.docFreq + statsReader.readVLong();
-      //if (DEBUG) System.out.println("    totTF=" + state.totalTermFreq);
-    }
     // metadata
     for (int i = 0; i < ste.fr.longsSize; i++) {
       longs[i] = bytesReader.readVLong();

lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java

@@ -162,9 +162,10 @@ final class VersionBlockTreeTermsReader extends FieldsProducer {
       final Pair<BytesRef,Long> rootCode = VersionBlockTreeTermsWriter.FST_OUTPUTS.newPair(code, version);
       final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
       assert fieldInfo != null: "field=" + field;
-      final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
-      final long sumDocFreq = in.readVLong();
-      final int docCount = in.readVInt();
+      final long sumTotalTermFreq = numTerms;
+      final long sumDocFreq = numTerms;
+      assert numTerms <= Integer.MAX_VALUE;
+      final int docCount = (int) numTerms;
       final int longsSize = in.readVInt();
       BytesRef minTerm = readBytesRef(in);
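
Note: with per-term stats fixed at 1, the field-level aggregates collapse to the term count, so nothing needs to be read from the terms file here. A sketch of the identity (assuming a true primary-key field, exactly one ID term per live document):

    final class DerivedFieldStats {
      final long sumTotalTermFreq;  // 1 occurrence per term
      final long sumDocFreq;        // 1 posting per term
      final int docCount;           // 1 term per doc carrying the field

      DerivedFieldStats(long numTerms) {
        assert numTerms <= Integer.MAX_VALUE;
        this.sumTotalTermFreq = numTerms;
        this.sumDocFreq = numTerms;
        this.docCount = (int) numTerms;
      }
    }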

lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java

@@ -53,6 +53,10 @@ import org.apache.lucene.util.fst.PositiveIntOutputs;
 import org.apache.lucene.util.fst.Util;
 import org.apache.lucene.util.packed.PackedInts;

+// nocommit break out the "don't write del docs on flush"
+
+// nocommit don't write/read stats
+
 /*
   TODO:
@@ -146,14 +150,11 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
     public final Pair<BytesRef,Long> rootCode;
     public final long numTerms;
     public final long indexStartFP;
-    public final long sumTotalTermFreq;
-    public final long sumDocFreq;
-    public final int docCount;
     private final int longsSize;
     public final BytesRef minTerm;
     public final BytesRef maxTerm;

-    public FieldMetaData(FieldInfo fieldInfo, Pair<BytesRef,Long> rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize,
+    public FieldMetaData(FieldInfo fieldInfo, Pair<BytesRef,Long> rootCode, long numTerms, long indexStartFP, int longsSize,
                          BytesRef minTerm, BytesRef maxTerm) {
       assert numTerms > 0;
       this.fieldInfo = fieldInfo;
@@ -161,9 +162,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
       this.rootCode = rootCode;
       this.indexStartFP = indexStartFP;
       this.numTerms = numTerms;
-      this.sumTotalTermFreq = sumTotalTermFreq;
-      this.sumDocFreq = sumDocFreq;
-      this.docCount = docCount;
       this.longsSize = longsSize;
       this.minTerm = minTerm;
       this.maxTerm = maxTerm;
@@ -207,13 +205,13 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
     fieldInfos = state.fieldInfos;
     this.minItemsInBlock = minItemsInBlock;
     this.maxItemsInBlock = maxItemsInBlock;
-    writeHeader(out);
+    CodecUtil.writeHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT);

     //DEBUG = state.segmentName.equals("_4a");

     final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
     indexOut = state.directory.createOutput(termsIndexFileName, state.context);
-    writeIndexHeader(indexOut);
+    CodecUtil.writeHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT);

     this.postingsWriter = postingsWriter;
     segment = state.segmentInfo.name;
@@ -230,16 +228,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
     this.indexOut = indexOut;
   }

-  /** Writes the terms file header. */
-  private void writeHeader(IndexOutput out) throws IOException {
-    CodecUtil.writeHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT);
-  }
-
-  /** Writes the index file header. */
-  private void writeIndexHeader(IndexOutput out) throws IOException {
-    CodecUtil.writeHeader(out, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT);
-  }
-
   /** Writes the terms file trailer. */
   private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
     out.writeLong(dirStart);
@@ -434,8 +422,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
     private final int longsSize;
     private long numTerms;
     final FixedBitSet docsSeen;
-    long sumTotalTermFreq;
-    long sumDocFreq;
     long indexStartFP;

     // Used only to partition terms into the block tree; we
@@ -815,13 +801,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
         suffixWriter.writeVInt(suffix);
         suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix);

-        // Write term stats, to separate byte[] blob:
-        statsWriter.writeVInt(state.docFreq);
-        if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
-          assert state.totalTermFreq >= state.docFreq: state.totalTermFreq + " vs " + state.docFreq;
-          statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
-        }
-
         // Write term meta data
         postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute);
         for (int pos = 0; pos < longsSize; pos++) {
@@ -853,13 +832,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
           suffixWriter.writeVInt(suffix<<1);
           suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix);

-          // Write term stats, to separate byte[] blob:
-          statsWriter.writeVInt(state.docFreq);
-          if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
-            assert state.totalTermFreq >= state.docFreq;
-            statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
-          }
-
           // TODO: now that terms dict "sees" these longs,
           // we can explore better column-stride encodings
           // to encode all long[0]s for this block at
@@ -916,11 +888,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
       suffixWriter.writeTo(out);
       suffixWriter.reset();

-      // Write term stats byte[] blob
-      out.writeVInt((int) statsWriter.getFilePointer());
-      statsWriter.writeTo(out);
-      statsWriter.reset();
-
       // Write term meta data byte[] blob
       out.writeVInt((int) metaWriter.getFilePointer());
       metaWriter.writeTo(out);
@@ -970,11 +937,9 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
     public void write(BytesRef text, TermsEnum termsEnum) throws IOException {

       BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen);
-      if (state != null) {
+      if (state != null && ((IDVersionPostingsWriter) postingsWriter).lastDocID != -1) {
         assert state.docFreq != 0;
-        assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter;
-        sumDocFreq += state.docFreq;
-        sumTotalTermFreq += state.totalTermFreq;

         blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput());
         PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state);
@@ -1011,14 +976,9 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
                                          ((PendingBlock) pending.get(0)).index.getEmptyOutput(),
                                          numTerms,
                                          indexStartFP,
-                                         sumTotalTermFreq,
-                                         sumDocFreq,
-                                         docsSeen.cardinality(),
                                          longsSize,
                                          minTerm, maxTerm));
       } else {
-        assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1;
-        assert sumDocFreq == 0;
         assert docsSeen.cardinality() == 0;
       }
     }
@@ -1048,11 +1008,6 @@ final class VersionBlockTreeTermsWriter extends FieldsConsumer {
         out.writeVInt(field.rootCode.output1.length);
         out.writeBytes(field.rootCode.output1.bytes, field.rootCode.output1.offset, field.rootCode.output1.length);
         out.writeVLong(field.rootCode.output2);
-        if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
-          out.writeVLong(field.sumTotalTermFreq);
-        }
-        out.writeVLong(field.sumDocFreq);
-        out.writeVInt(field.docCount);
         out.writeVInt(field.longsSize);
         indexOut.writeVLong(field.indexStartFP);
         writeBytesRef(out, field.minTerm);
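
Note: the write() guard above is how a fully-deleted term is dropped: writeTerm can return a non-null state even when every doc it saw was deleted, so the postings writer's lastDocID sentinel is checked before the term enters the block tree. A hedged sketch of that decision (hypothetical helper, not part of the commit):

    import org.apache.lucene.codecs.BlockTermState;

    final class IndexTermGuard {
      // A term is added to the block tree only if the postings writer
      // saw at least one live doc for it (lastDocID != -1).
      static boolean shouldIndexTerm(BlockTermState state, int lastDocID) {
        return state != null && lastDocID != -1;
      }
    }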

lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java

@@ -37,8 +37,6 @@ import org.apache.lucene.util.BytesRef;
 class FreqProxFields extends Fields {
   final Map<String,FreqProxTermsWriterPerField> fields = new LinkedHashMap<>();

-  private Bits liveDocs;
-
   public FreqProxFields(List<FreqProxTermsWriterPerField> fieldList) {
     // NOTE: fields are already sorted by field name
     for(FreqProxTermsWriterPerField field : fieldList) {
@@ -46,10 +44,6 @@ class FreqProxFields extends Fields {
     }
   }

-  public void setLiveDocs(Bits liveDocs) {
-    this.liveDocs = liveDocs;
-  }
-
   public Iterator<String> iterator() {
     return fields.keySet().iterator();
   }
@@ -57,7 +51,7 @@ class FreqProxFields extends Fields {
   @Override
   public Terms terms(String field) throws IOException {
     FreqProxTermsWriterPerField perField = fields.get(field);
-    return perField == null ? null : new FreqProxTerms(perField, liveDocs);
+    return perField == null ? null : new FreqProxTerms(perField);
   }

   @Override
@@ -68,11 +62,9 @@ class FreqProxFields extends Fields {
   private static class FreqProxTerms extends Terms {
     final FreqProxTermsWriterPerField terms;
-    final Bits liveDocs;

-    public FreqProxTerms(FreqProxTermsWriterPerField terms, Bits liveDocs) {
+    public FreqProxTerms(FreqProxTermsWriterPerField terms) {
       this.terms = terms;
-      this.liveDocs = liveDocs;
     }

     @Override
@@ -80,9 +72,8 @@ class FreqProxFields extends Fields {
       FreqProxTermsEnum termsEnum;
       if (reuse instanceof FreqProxTermsEnum && ((FreqProxTermsEnum) reuse).terms == this.terms) {
         termsEnum = (FreqProxTermsEnum) reuse;
-        assert termsEnum.liveDocs == this.liveDocs;
       } else {
-        termsEnum = new FreqProxTermsEnum(terms, liveDocs);
+        termsEnum = new FreqProxTermsEnum(terms);
       }
       termsEnum.reset();
       return termsEnum;
@@ -145,13 +136,11 @@ class FreqProxFields extends Fields {
     final FreqProxPostingsArray postingsArray;
     final BytesRef scratch = new BytesRef();
     final int numTerms;
-    final Bits liveDocs;
     int ord;

-    public FreqProxTermsEnum(FreqProxTermsWriterPerField terms, Bits liveDocs) {
+    public FreqProxTermsEnum(FreqProxTermsWriterPerField terms) {
       this.terms = terms;
       this.numTerms = terms.bytesHash.size();
-      this.liveDocs = liveDocs;
       sortedTermIDs = terms.sortedTermIDs;
       assert sortedTermIDs != null;
       postingsArray = (FreqProxPostingsArray) terms.postingsArray;
@@ -239,8 +228,8 @@ class FreqProxFields extends Fields {
     }

     @Override
-    public DocsEnum docs(Bits liveDocsIn, DocsEnum reuse, int flags) {
-      if (liveDocsIn != null) {
+    public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) {
+      if (liveDocs != null) {
         throw new IllegalArgumentException("liveDocs must be null");
       }
@@ -255,20 +244,18 @@ class FreqProxFields extends Fields {
       if (reuse instanceof FreqProxDocsEnum) {
         docsEnum = (FreqProxDocsEnum) reuse;
         if (docsEnum.postingsArray != postingsArray) {
-          docsEnum = new FreqProxDocsEnum(terms, postingsArray, liveDocs);
-        } else {
-          assert docsEnum.liveDocs == liveDocs;
+          docsEnum = new FreqProxDocsEnum(terms, postingsArray);
         }
       } else {
-        docsEnum = new FreqProxDocsEnum(terms, postingsArray, liveDocs);
+        docsEnum = new FreqProxDocsEnum(terms, postingsArray);
       }
       docsEnum.reset(sortedTermIDs[ord]);
       return docsEnum;
     }

     @Override
-    public DocsAndPositionsEnum docsAndPositions(Bits liveDocsIn, DocsAndPositionsEnum reuse, int flags) {
-      if (liveDocsIn != null) {
+    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) {
+      if (liveDocs != null) {
         throw new IllegalArgumentException("liveDocs must be null");
       }
       FreqProxDocsAndPositionsEnum posEnum;
@@ -288,12 +275,10 @@ class FreqProxFields extends Fields {
       if (reuse instanceof FreqProxDocsAndPositionsEnum) {
         posEnum = (FreqProxDocsAndPositionsEnum) reuse;
         if (posEnum.postingsArray != postingsArray) {
-          posEnum = new FreqProxDocsAndPositionsEnum(terms, postingsArray, liveDocs);
-        } else {
-          assert posEnum.liveDocs == liveDocs;
+          posEnum = new FreqProxDocsAndPositionsEnum(terms, postingsArray);
         }
       } else {
-        posEnum = new FreqProxDocsAndPositionsEnum(terms, postingsArray, liveDocs);
+        posEnum = new FreqProxDocsAndPositionsEnum(terms, postingsArray);
       }
       posEnum.reset(sortedTermIDs[ord]);
       return posEnum;
@@ -326,17 +311,15 @@ class FreqProxFields extends Fields {
     final FreqProxPostingsArray postingsArray;
     final ByteSliceReader reader = new ByteSliceReader();
     final boolean readTermFreq;
-    final Bits liveDocs;
     int docID;
     int freq;
     boolean ended;
     int termID;

-    public FreqProxDocsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray, Bits liveDocs) {
+    public FreqProxDocsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray) {
       this.terms = terms;
      this.postingsArray = postingsArray;
       this.readTermFreq = terms.hasFreq;
-      this.liveDocs = liveDocs;
     }

     public void reset(int termID) {
@@ -391,10 +374,6 @@ class FreqProxFields extends Fields {
           assert docID != postingsArray.lastDocIDs[termID];
         }
       }

-      if (liveDocs != null && liveDocs.get(docID) == false) {
-        continue;
-      }
-
       return docID;
     }
@@ -417,7 +396,6 @@ class FreqProxFields extends Fields {
     final ByteSliceReader reader = new ByteSliceReader();
     final ByteSliceReader posReader = new ByteSliceReader();
     final boolean readOffsets;
-    final Bits liveDocs;
     int docID;
     int freq;
     int pos;
@@ -429,11 +407,10 @@ class FreqProxFields extends Fields {
     boolean hasPayload;
     BytesRef payload = new BytesRef();

-    public FreqProxDocsAndPositionsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray, Bits liveDocs) {
+    public FreqProxDocsAndPositionsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray) {
       this.terms = terms;
       this.postingsArray = postingsArray;
       this.readOffsets = terms.hasOffsets;
-      this.liveDocs = liveDocs;
       assert terms.hasProx;
       assert terms.hasFreq;
     }
@@ -487,9 +464,6 @@ class FreqProxFields extends Fields {
       posLeft = freq;
       pos = 0;
       startOffset = 0;

-      if (liveDocs != null && liveDocs.get(docID) == false) {
-        continue;
-      }

       return docID;
     }
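
Note: with filtering moved into the format, the in-memory flush view no longer carries liveDocs at all; its enums reject a non-null liveDocs argument outright. A minimal sketch of the caller-side contract (hypothetical helper):

    import java.io.IOException;
    import org.apache.lucene.index.DocsEnum;
    import org.apache.lucene.index.TermsEnum;

    final class FlushEnumContract {
      // At flush every indexed doc is surfaced; consumers must pass null
      // liveDocs and filter deletions themselves if they care.
      static DocsEnum allDocs(TermsEnum termsEnum) throws IOException {
        return termsEnum.docs(null, null, DocsEnum.FLAG_NONE);
      }
    }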

lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java

@@ -103,10 +103,6 @@ final class FreqProxTermsWriter extends TermsHash {
     applyDeletes(state, fields);

-    if (state.liveDocs != null) {
-      fields.setLiveDocs(state.liveDocs);
-    }
-
     FieldsConsumer consumer = state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state);
     boolean success = false;
     try {