From 1621816d81904e98ab807893e92cff648774aaa3 Mon Sep 17 00:00:00 2001 From: Han Jiang Date: Fri, 23 Aug 2013 14:34:47 +0000 Subject: [PATCH] LUCENE-3069: merge 'temp' codes back git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1516860 13f79535-47bb-0310-9956-ffa450edef68 --- .../codecs/blockterms/BlockTermsReader.java | 44 +++-- .../codecs/blockterms/BlockTermsWriter.java | 51 +++-- .../codecs/pulsing/PulsingPostingsFormat.java | 4 +- .../codecs/pulsing/PulsingPostingsReader.java | 94 +++++---- .../codecs/pulsing/PulsingPostingsWriter.java | 163 +++++++++------ .../lucene/codecs/sep/SepPostingsReader.java | 55 ++---- .../lucene/codecs/sep/SepPostingsWriter.java | 187 ++++++++---------- .../lucene/codecs/BlockTreeTermsReader.java | 89 ++++++--- .../lucene/codecs/BlockTreeTermsWriter.java | 126 ++++++++---- .../lucene/codecs/PostingsReaderBase.java | 8 +- .../lucene/codecs/PostingsWriterBase.java | 31 +-- .../lucene40/Lucene40PostingsReader.java | 51 +---- .../lucene41/Lucene41PostingsReader.java | 93 ++------- .../lucene41/Lucene41PostingsWriter.java | 166 +++++++--------- .../lucene40/Lucene40PostingsWriter.java | 109 ++++------ .../mockrandom/MockRandomPostingsFormat.java | 59 +++++- .../NestedPulsingPostingsFormat.java | 8 +- .../util/TestRuleSetupAndRestoreClassEnv.java | 2 +- 18 files changed, 698 insertions(+), 642 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java index 7fa0e14d55c..2697cfebf97 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java @@ -142,6 +142,7 @@ public class BlockTermsReader extends FieldsProducer { final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? 
-1 : in.readVLong(); final long sumDocFreq = in.readVLong(); final int docCount = in.readVInt(); + final int longsSize = in.readVInt(); if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")"); } @@ -151,7 +152,7 @@ public class BlockTermsReader extends FieldsProducer { if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + in + ")"); } - FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount)); + FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount, longsSize)); if (previous != null) { throw new CorruptIndexException("duplicate fields: " + fieldInfo.name + " (resource=" + in + ")"); } @@ -230,8 +231,9 @@ public class BlockTermsReader extends FieldsProducer { final long sumTotalTermFreq; final long sumDocFreq; final int docCount; + final int longsSize; - FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) { + FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) { assert numTerms > 0; this.fieldInfo = fieldInfo; this.numTerms = numTerms; @@ -239,6 +241,7 @@ public class BlockTermsReader extends FieldsProducer { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; + this.longsSize = longsSize; } @Override @@ -326,6 +329,10 @@ public class BlockTermsReader extends FieldsProducer { private final ByteArrayDataInput freqReader = new ByteArrayDataInput(); private int metaDataUpto; + private long[] longs; + private byte[] bytes; + private ByteArrayDataInput bytesReader; + public SegmentTermsEnum() throws IOException { in = BlockTermsReader.this.in.clone(); in.seek(termsStartPointer); @@ -339,6 +346,7 @@ public class BlockTermsReader extends FieldsProducer { termSuffixes = new byte[128]; docFreqBytes = new byte[64]; //System.out.println("BTR.enum init this=" + this + " postingsReader=" + postingsReader); + longs = new long[longsSize]; } @Override @@ -415,7 +423,7 @@ public class BlockTermsReader extends FieldsProducer { assert result; indexIsCurrent = true; - didIndexNext = false; + didIndexNext = false; if (doOrd) { state.ord = indexEnum.ord()-1; @@ -789,12 +797,21 @@ public class BlockTermsReader extends FieldsProducer { //System.out.println(" freq bytes len=" + len); in.readBytes(docFreqBytes, 0, len); freqReader.reset(docFreqBytes, 0, len); + + // metadata + len = in.readVInt(); + if (bytes == null) { + bytes = new byte[ArrayUtil.oversize(len, 1)]; + bytesReader = new ByteArrayDataInput(); + } else if (bytes.length < len) { + bytes = new byte[ArrayUtil.oversize(len, 1)]; + } + in.readBytes(bytes, 0, len); + bytesReader.reset(bytes, 0, len); + metaDataUpto = 0; - state.termBlockOrd = 0; - postingsReader.readTermsBlock(in, fieldInfo, state); - indexIsCurrent = false; //System.out.println(" indexIsCurrent=" + indexIsCurrent); @@ -811,9 +828,7 @@ public class BlockTermsReader extends FieldsProducer { // lazily catch up on metadata decode: final int limit = state.termBlockOrd; - // We must 
set/incr state.termCount because - // postings impl can look at this - state.termBlockOrd = metaDataUpto; + boolean absolute = metaDataUpto == 0; // TODO: better API would be "jump straight to term=N"??? while (metaDataUpto < limit) { //System.out.println(" decode mdUpto=" + metaDataUpto); @@ -825,16 +840,21 @@ public class BlockTermsReader extends FieldsProducer { // TODO: if docFreq were bulk decoded we could // just skipN here: + + // docFreq, totalTermFreq state.docFreq = freqReader.readVInt(); //System.out.println(" dF=" + state.docFreq); if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { state.totalTermFreq = state.docFreq + freqReader.readVLong(); //System.out.println(" totTF=" + state.totalTermFreq); } - - postingsReader.nextTerm(fieldInfo, state); + // metadata + for (int i = 0; i < longs.length; i++) { + longs[i] = bytesReader.readVLong(); + } + postingsReader.decodeTerm(longs, bytesReader, fieldInfo, state, absolute); metaDataUpto++; - state.termBlockOrd++; + absolute = false; } } else { //System.out.println(" skip! seekPending"); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java index 367e45e2b1e..7871332b92b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java @@ -27,6 +27,7 @@ import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.PostingsConsumer; import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.codecs.TermStats; +import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.TermsConsumer; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo.IndexOptions; @@ -77,8 +78,9 @@ public class BlockTermsWriter extends FieldsConsumer { public final long sumTotalTermFreq; public final long sumDocFreq; public final int docCount; + public final int longsSize; - public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) { + public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) { assert numTerms > 0; this.fieldInfo = fieldInfo; this.termsStartPointer = termsStartPointer; @@ -86,6 +88,7 @@ public class BlockTermsWriter extends FieldsConsumer { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; + this.longsSize = longsSize; } } @@ -109,7 +112,7 @@ public class BlockTermsWriter extends FieldsConsumer { //System.out.println("BTW.init seg=" + state.segmentName); - postingsWriter.start(out); // have consumer write its format/header + postingsWriter.init(out); // have consumer write its format/header success = true; } finally { if (!success) { @@ -148,6 +151,7 @@ public class BlockTermsWriter extends FieldsConsumer { } out.writeVLong(field.sumDocFreq); out.writeVInt(field.docCount); + out.writeVInt(field.longsSize); } writeTrailer(dirStart); } finally { @@ -161,7 +165,7 @@ public class BlockTermsWriter extends FieldsConsumer { private static class TermEntry { public final BytesRef term = new BytesRef(); - public TermStats stats; + public BlockTermState state; } class TermsWriter extends TermsConsumer { @@ -173,6 +177,7 @@ public class BlockTermsWriter extends FieldsConsumer { long sumTotalTermFreq; long sumDocFreq; int 
docCount; + int longsSize; private TermEntry[] pendingTerms; @@ -190,8 +195,8 @@ public class BlockTermsWriter extends FieldsConsumer { pendingTerms[i] = new TermEntry(); } termsStartPointer = out.getFilePointer(); - postingsWriter.setField(fieldInfo); this.postingsWriter = postingsWriter; + this.longsSize = postingsWriter.setField(fieldInfo); } @Override @@ -237,11 +242,12 @@ public class BlockTermsWriter extends FieldsConsumer { } final TermEntry te = pendingTerms[pendingCount]; te.term.copyBytes(text); - te.stats = stats; + te.state = postingsWriter.newTermState(); + te.state.docFreq = stats.docFreq; + te.state.totalTermFreq = stats.totalTermFreq; + postingsWriter.finishTerm(te.state); pendingCount++; - - postingsWriter.finishTerm(stats); numTerms++; } @@ -264,7 +270,8 @@ public class BlockTermsWriter extends FieldsConsumer { termsStartPointer, sumTotalTermFreq, sumDocFreq, - docCount)); + docCount, + longsSize)); } } @@ -285,6 +292,7 @@ public class BlockTermsWriter extends FieldsConsumer { } private final RAMOutputStream bytesWriter = new RAMOutputStream(); + private final RAMOutputStream bufferWriter = new RAMOutputStream(); private void flushBlock() throws IOException { //System.out.println("BTW.flushBlock seg=" + segment + " pendingCount=" + pendingCount + " fp=" + out.getFilePointer()); @@ -318,19 +326,34 @@ public class BlockTermsWriter extends FieldsConsumer { // TODO: cutover to better intblock codec. simple64? // write prefix, suffix first: for(int termCount=0;termCount fields; - public PulsingPostingsReader(PostingsReaderBase wrappedPostingsReader) { + public PulsingPostingsReader(SegmentReadState state, PostingsReaderBase wrappedPostingsReader) { this.wrappedPostingsReader = wrappedPostingsReader; + this.segmentState = state; } @Override public void init(IndexInput termsIn) throws IOException { - CodecUtil.checkHeader(termsIn, PulsingPostingsWriter.CODEC, - PulsingPostingsWriter.VERSION_START, PulsingPostingsWriter.VERSION_START); + version = CodecUtil.checkHeader(termsIn, PulsingPostingsWriter.CODEC, + PulsingPostingsWriter.VERSION_START, + PulsingPostingsWriter.VERSION_CURRENT); maxPositions = termsIn.readVInt(); wrappedPostingsReader.init(termsIn); + if (wrappedPostingsReader instanceof PulsingPostingsReader || + version < PulsingPostingsWriter.VERSION_META_ARRAY) { + fields = null; + } else { + fields = new TreeMap(); + String summaryFileName = IndexFileNames.segmentFileName(segmentState.segmentInfo.name, segmentState.segmentSuffix, PulsingPostingsWriter.SUMMARY_EXTENSION); + IndexInput in = null; + try { + in = segmentState.directory.openInput(summaryFileName, segmentState.context); + CodecUtil.checkHeader(in, PulsingPostingsWriter.CODEC, version, + PulsingPostingsWriter.VERSION_CURRENT); + int numField = in.readVInt(); + for (int i = 0; i < numField; i++) { + int fieldNum = in.readVInt(); + int longsSize = in.readVInt(); + fields.put(fieldNum, longsSize); + } + } finally { + IOUtils.closeWhileHandlingException(in); + } + } } private static class PulsingTermState extends BlockTermState { + private boolean absolute = false; + private long[] longs; private byte[] postings; private int postingsSize; // -1 if this term was not inlined private BlockTermState wrappedTermState; - ByteArrayDataInput inlinedBytesReader; - private byte[] inlinedBytes; - @Override public PulsingTermState clone() { PulsingTermState clone; @@ -82,6 +112,11 @@ public class PulsingPostingsReader extends PostingsReaderBase { } else { assert wrappedTermState != null; clone.wrappedTermState = 
(BlockTermState) wrappedTermState.clone(); + clone.absolute = absolute; + if (longs != null) { + clone.longs = new long[longs.length]; + System.arraycopy(longs, 0, clone.longs, 0, longs.length); + } } return clone; } @@ -99,11 +134,6 @@ public class PulsingPostingsReader extends PostingsReaderBase { } else { wrappedTermState.copyFrom(other.wrappedTermState); } - - // NOTE: we do not copy the - // inlinedBytes/inlinedBytesReader; these are only - // stored on the "primary" TermState. They are - // "transient" to cloned term states. } @Override @@ -116,25 +146,6 @@ public class PulsingPostingsReader extends PostingsReaderBase { } } - @Override - public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException { - //System.out.println("PR.readTermsBlock state=" + _termState); - final PulsingTermState termState = (PulsingTermState) _termState; - if (termState.inlinedBytes == null) { - termState.inlinedBytes = new byte[128]; - termState.inlinedBytesReader = new ByteArrayDataInput(); - } - int len = termsIn.readVInt(); - //System.out.println(" len=" + len + " fp=" + termsIn.getFilePointer()); - if (termState.inlinedBytes.length < len) { - termState.inlinedBytes = new byte[ArrayUtil.oversize(len, 1)]; - } - termsIn.readBytes(termState.inlinedBytes, 0, len); - termState.inlinedBytesReader.reset(termState.inlinedBytes); - termState.wrappedTermState.termBlockOrd = 0; - wrappedPostingsReader.readTermsBlock(termsIn, fieldInfo, termState.wrappedTermState); - } - @Override public BlockTermState newTermState() throws IOException { PulsingTermState state = new PulsingTermState(); @@ -143,20 +154,20 @@ public class PulsingPostingsReader extends PostingsReaderBase { } @Override - public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) throws IOException { + public void decodeTerm(long[] empty, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException { //System.out.println("PR nextTerm"); PulsingTermState termState = (PulsingTermState) _termState; - + assert empty.length == 0; + termState.absolute = termState.absolute || absolute; // if we have positions, its total TF, otherwise its computed based on docFreq. long count = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 ? termState.totalTermFreq : termState.docFreq; //System.out.println(" count=" + count + " threshold=" + maxPositions); if (count <= maxPositions) { - // Inlined into terms dict -- just read the byte[] blob in, // but don't decode it now (we only decode when a DocsEnum // or D&PEnum is pulled): - termState.postingsSize = termState.inlinedBytesReader.readVInt(); + termState.postingsSize = in.readVInt(); if (termState.postings == null || termState.postings.length < termState.postingsSize) { termState.postings = new byte[ArrayUtil.oversize(termState.postingsSize, 1)]; } @@ -164,16 +175,23 @@ public class PulsingPostingsReader extends PostingsReaderBase { // (the blob holding all inlined terms' blobs for // current term block) into another byte[] (just the // blob for this term)... - termState.inlinedBytesReader.readBytes(termState.postings, 0, termState.postingsSize); + in.readBytes(termState.postings, 0, termState.postingsSize); //System.out.println(" inlined bytes=" + termState.postingsSize); + termState.absolute = termState.absolute || absolute; } else { //System.out.println(" not inlined"); + final int longsSize = fields == null ? 
0 : fields.get(fieldInfo.number); + if (termState.longs == null) { + termState.longs = new long[longsSize]; + } + for (int i = 0; i < longsSize; i++) { + termState.longs[i] = in.readVLong(); + } termState.postingsSize = -1; - // TODO: should we do full copyFrom? much heavier...? termState.wrappedTermState.docFreq = termState.docFreq; termState.wrappedTermState.totalTermFreq = termState.totalTermFreq; - wrappedPostingsReader.nextTerm(fieldInfo, termState.wrappedTermState); - termState.wrappedTermState.termBlockOrd++; + wrappedPostingsReader.decodeTerm(termState.longs, in, fieldInfo, termState.wrappedTermState, termState.absolute); + termState.absolute = false; } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java index 6ba0ef698cc..1228aeaa511 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java @@ -21,14 +21,19 @@ import java.io.IOException; import java.util.List; import java.util.ArrayList; +import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.codecs.TermStats; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; // TODO: we now inline based on total TF of the term, // but it might be better to inline by "net bytes used" @@ -49,26 +54,43 @@ public final class PulsingPostingsWriter extends PostingsWriterBase { final static String CODEC = "PulsedPostingsWriter"; + // recording field summary + final static String SUMMARY_EXTENSION = "smy"; + // To add a new version, increment from the last one, and // change VERSION_CURRENT to point to your new version: final static int VERSION_START = 0; - final static int VERSION_CURRENT = VERSION_START; + final static int VERSION_META_ARRAY = 0; + final static int VERSION_CURRENT = VERSION_META_ARRAY; + + private SegmentWriteState segmentState; private IndexOutput termsOut; + private List fields; + private IndexOptions indexOptions; private boolean storePayloads; - private static class PendingTerm { - private final byte[] bytes; - public PendingTerm(byte[] bytes) { - this.bytes = bytes; + // information for wrapped PF, in current field + private int longsSize; + private long[] longs; + boolean absolute; + + private static class PulsingTermState extends BlockTermState { + private byte[] bytes; + private BlockTermState wrappedState; + @Override + public String toString() { + if (bytes != null) { + return "inlined"; + } else { + return "not inlined wrapped=" + wrappedState; + } } } - private final List pendingTerms = new ArrayList(); - // one entry per position private final Position[] pending; private int pendingCount = 0; // -1 once we've hit too many positions @@ -83,6 +105,15 @@ public final class PulsingPostingsWriter extends PostingsWriterBase { int endOffset; } + private static final class FieldMetaData { + int fieldNumber; + int longsSize; + FieldMetaData(int number, int size) { + fieldNumber = number; + longsSize = size; + } + 
} + // TODO: -- lazy init this? ie, if every single term // was inlined (eg for a "primary key" field) then we // never need to use this fallback? Fallback writer for @@ -92,23 +123,33 @@ public final class PulsingPostingsWriter extends PostingsWriterBase { /** If the total number of positions (summed across all docs * for this term) is <= maxPositions, then the postings are * inlined into terms dict */ - public PulsingPostingsWriter(int maxPositions, PostingsWriterBase wrappedPostingsWriter) { + public PulsingPostingsWriter(SegmentWriteState state, int maxPositions, PostingsWriterBase wrappedPostingsWriter) { + pending = new Position[maxPositions]; for(int i=0;i(); // We simply wrap another postings writer, but only call // on it when tot positions is >= the cutoff: this.wrappedPostingsWriter = wrappedPostingsWriter; + this.segmentState = state; } @Override - public void start(IndexOutput termsOut) throws IOException { + public void init(IndexOutput termsOut) throws IOException { this.termsOut = termsOut; CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT); termsOut.writeVInt(pending.length); // encode maxPositions in header - wrappedPostingsWriter.start(termsOut); + wrappedPostingsWriter.init(termsOut); + } + + @Override + public BlockTermState newTermState() throws IOException { + PulsingTermState state = new PulsingTermState(); + state.wrappedState = wrappedPostingsWriter.newTermState(); + return state; } @Override @@ -123,11 +164,15 @@ public final class PulsingPostingsWriter extends PostingsWriterBase { // Currently, this instance is re-used across fields, so // our parent calls setField whenever the field changes @Override - public void setField(FieldInfo fieldInfo) { + public int setField(FieldInfo fieldInfo) { this.indexOptions = fieldInfo.getIndexOptions(); //if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions); storePayloads = fieldInfo.hasPayloads(); - wrappedPostingsWriter.setField(fieldInfo); + absolute = false; + longsSize = wrappedPostingsWriter.setField(fieldInfo); + longs = new long[longsSize]; + fields.add(new FieldMetaData(fieldInfo.number, longsSize)); + return 0; //DEBUG = BlockTreeTermsWriter.DEBUG; } @@ -219,18 +264,19 @@ public final class PulsingPostingsWriter extends PostingsWriterBase { /** Called when we are done adding docs to this term */ @Override - public void finishTerm(TermStats stats) throws IOException { + public void finishTerm(BlockTermState _state) throws IOException { + PulsingTermState state = (PulsingTermState) _state; + // if (DEBUG) System.out.println("PW finishTerm docCount=" + stats.docFreq + " pendingCount=" + pendingCount + " pendingTerms.size()=" + pendingTerms.size()); assert pendingCount > 0 || pendingCount == -1; if (pendingCount == -1) { - wrappedPostingsWriter.finishTerm(stats); - // Must add null entry to record terms that our - // wrapped postings impl added - pendingTerms.add(null); + state.wrappedState.docFreq = state.docFreq; + state.wrappedState.totalTermFreq = state.totalTermFreq; + state.bytes = null; + wrappedPostingsWriter.finishTerm(state.wrappedState); } else { - // There were few enough total occurrences for this // term, so we fully inline our postings data into // terms dict, now: @@ -325,61 +371,54 @@ public final class PulsingPostingsWriter extends PostingsWriterBase { } } - final byte[] bytes = new byte[(int) buffer.getFilePointer()]; - buffer.writeTo(bytes, 0); - pendingTerms.add(new PendingTerm(bytes)); + state.bytes = new byte[(int) buffer.getFilePointer()]; + 
buffer.writeTo(state.bytes, 0); buffer.reset(); } - pendingCount = 0; } + @Override + public void encodeTerm(long[] empty, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { + PulsingTermState state = (PulsingTermState)_state; + assert empty.length == 0; + this.absolute = this.absolute || absolute; + if (state.bytes == null) { + wrappedPostingsWriter.encodeTerm(longs, buffer, fieldInfo, state.wrappedState, this.absolute); + for (int i = 0; i < longsSize; i++) { + out.writeVLong(longs[i]); + } + buffer.writeTo(out); + buffer.reset(); + this.absolute = false; + } else { + out.writeVInt(state.bytes.length); + out.writeBytes(state.bytes, 0, state.bytes.length); + this.absolute = this.absolute || absolute; + } + } + @Override public void close() throws IOException { wrappedPostingsWriter.close(); - } - - @Override - public void flushTermsBlock(int start, int count) throws IOException { - // if (DEBUG) System.out.println("PW: flushTermsBlock start=" + start + " count=" + count + " pendingTerms.size()=" + pendingTerms.size()); - int wrappedCount = 0; - assert buffer.getFilePointer() == 0; - assert start >= count; - - final int limit = pendingTerms.size() - start + count; - - for(int idx=pendingTerms.size()-start; idx= 0) { @@ -195,6 +197,24 @@ public final class SepPostingsWriter extends PostingsWriterBase { } skipListWriter.setIndexOptions(indexOptions); storePayloads = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && fieldInfo.hasPayloads(); + lastPayloadFP = 0; + lastSkipFP = 0; + lastState = setEmptyState(); + return 0; + } + + private SepTermState setEmptyState() { + SepTermState emptyState = new SepTermState(); + emptyState.docIndex = docOut.index(); + if (indexOptions != IndexOptions.DOCS_ONLY) { + emptyState.freqIndex = freqOut.index(); + if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + emptyState.posIndex = posOut.index(); + } + } + emptyState.payloadFP = 0; + emptyState.skipFP = 0; + return emptyState; } /** Adds a new doc in this term. If this returns null @@ -262,135 +282,86 @@ public final class SepPostingsWriter extends PostingsWriterBase { lastPosition = 0; } - private static class PendingTerm { - public final IntIndexOutput.Index docIndex; - public final IntIndexOutput.Index freqIndex; - public final IntIndexOutput.Index posIndex; + private static class SepTermState extends BlockTermState { + public IntIndexOutput.Index docIndex; + public IntIndexOutput.Index freqIndex; + public IntIndexOutput.Index posIndex; public long payloadFP; public long skipFP; - - public PendingTerm(IntIndexOutput.Index docIndex, IntIndexOutput.Index freqIndex, IntIndexOutput.Index posIndex, long payloadFP, long skipFP) { - this.docIndex = docIndex; - this.freqIndex = freqIndex; - this.posIndex = posIndex; - this.payloadFP = payloadFP; - this.skipFP = skipFP; - } } - private final List pendingTerms = new ArrayList(); - /** Called when we are done adding docs to this term */ @Override - public void finishTerm(TermStats stats) throws IOException { + public void finishTerm(BlockTermState _state) throws IOException { + SepTermState state = (SepTermState)_state; // TODO: -- wasteful we are counting this in two places? 
- assert stats.docFreq > 0; - assert stats.docFreq == df; + assert state.docFreq > 0; + assert state.docFreq == df; - final IntIndexOutput.Index docIndexCopy = docOut.index(); - docIndexCopy.copyFrom(docIndex, false); - - final IntIndexOutput.Index freqIndexCopy; - final IntIndexOutput.Index posIndexCopy; + state.docIndex = docOut.index(); + state.docIndex.copyFrom(docIndex, false); if (indexOptions != IndexOptions.DOCS_ONLY) { - freqIndexCopy = freqOut.index(); - freqIndexCopy.copyFrom(freqIndex, false); + state.freqIndex = freqOut.index(); + state.freqIndex.copyFrom(freqIndex, false); if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { - posIndexCopy = posOut.index(); - posIndexCopy.copyFrom(posIndex, false); + state.posIndex = posOut.index(); + state.posIndex.copyFrom(posIndex, false); } else { - posIndexCopy = null; + state.posIndex = null; } } else { - freqIndexCopy = null; - posIndexCopy = null; + state.freqIndex = null; + state.posIndex = null; } - final long skipFP; if (df >= skipMinimum) { - skipFP = skipOut.getFilePointer(); + state.skipFP = skipOut.getFilePointer(); //System.out.println(" skipFP=" + skipFP); skipListWriter.writeSkip(skipOut); //System.out.println(" numBytes=" + (skipOut.getFilePointer()-skipFP)); } else { - skipFP = -1; + state.skipFP = -1; } + state.payloadFP = payloadStart; lastDocID = 0; df = 0; - - pendingTerms.add(new PendingTerm(docIndexCopy, - freqIndexCopy, - posIndexCopy, - payloadStart, - skipFP)); } @Override - public void flushTermsBlock(int start, int count) throws IOException { - //System.out.println("SEPW: flushTermsBlock: start=" + start + " count=" + count + " pendingTerms.size()=" + pendingTerms.size() + " termsOut.fp=" + termsOut.getFilePointer()); - assert indexBytesWriter.getFilePointer() == 0; - final int absStart = pendingTerms.size() - start; - final List slice = pendingTerms.subList(absStart, absStart+count); - - if (count == 0) { - termsOut.writeByte((byte) 0); - return; + public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { + SepTermState state = (SepTermState)_state; + if (absolute) { + lastSkipFP = 0; + lastPayloadFP = 0; + lastState = state; } - - long lastSkipFP = 0; - long lastPayloadFP = 0; - - boolean isFirstTerm = true; - - for(int idx=0;idx index; + final int longsSize; + private final FST index; //private boolean DEBUG; - FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, IndexInput indexIn) throws IOException { + FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, int longsSize, IndexInput indexIn) throws IOException { assert numTerms > 0; this.fieldInfo = fieldInfo; //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); @@ -462,6 +464,7 @@ public class BlockTreeTermsReader extends FieldsProducer { this.docCount = docCount; this.indexStartFP = indexStartFP; this.rootCode = rootCode; + this.longsSize = longsSize; // if (DEBUG) { // System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor); // } @@ -612,6 +615,12 @@ public class BlockTreeTermsReader extends FieldsProducer { FST.Arc arc; final BlockTermState termState; + + // metadata buffer, holding monotonical values + public long[] longs; + // metadata buffer, holding general values + public byte[] bytes; + 
ByteArrayDataInput bytesReader; // Cumulative output so far BytesRef outputPrefix; @@ -621,8 +630,9 @@ public class BlockTreeTermsReader extends FieldsProducer { public Frame(int ord) throws IOException { this.ord = ord; - termState = postingsReader.newTermState(); - termState.totalTermFreq = -1; + this.termState = postingsReader.newTermState(); + this.termState.totalTermFreq = -1; + this.longs = new long[longsSize]; } void loadNextFloorBlock() throws IOException { @@ -720,8 +730,17 @@ public class BlockTreeTermsReader extends FieldsProducer { termState.termBlockOrd = 0; nextEnt = 0; - - postingsReader.readTermsBlock(in, fieldInfo, termState); + + // metadata + numBytes = in.readVInt(); + if (bytes == null) { + bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + bytesReader = new ByteArrayDataInput(); + } else if (bytes.length < numBytes) { + bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + in.readBytes(bytes, 0, numBytes); + bytesReader.reset(bytes, 0, numBytes); if (!isLastInFloor) { // Sub-blocks of a single floor block are always @@ -774,12 +793,9 @@ public class BlockTreeTermsReader extends FieldsProducer { // lazily catch up on metadata decode: final int limit = getTermBlockOrd(); + boolean absolute = metaDataUpto == 0; assert limit > 0; - // We must set/incr state.termCount because - // postings impl can look at this - termState.termBlockOrd = metaDataUpto; - // TODO: better API would be "jump straight to term=N"??? while (metaDataUpto < limit) { @@ -791,17 +807,24 @@ public class BlockTreeTermsReader extends FieldsProducer { // TODO: if docFreq were bulk decoded we could // just skipN here: + + // stats termState.docFreq = statsReader.readVInt(); //if (DEBUG) System.out.println(" dF=" + state.docFreq); if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { termState.totalTermFreq = termState.docFreq + statsReader.readVLong(); //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); } + // metadata + for (int i = 0; i < longsSize; i++) { + longs[i] = bytesReader.readVLong(); + } + postingsReader.decodeTerm(longs, bytesReader, fieldInfo, termState, absolute); - postingsReader.nextTerm(fieldInfo, termState); metaDataUpto++; - termState.termBlockOrd++; + absolute = false; } + termState.termBlockOrd = metaDataUpto; } } @@ -1707,6 +1730,7 @@ public class BlockTreeTermsReader extends FieldsProducer { if (arc.output != NO_OUTPUT) { output = fstOutputs.add(output, arc.output); } + // if (DEBUG) { // System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); // } @@ -2290,10 +2314,17 @@ public class BlockTreeTermsReader extends FieldsProducer { final BlockTermState state; + // metadata buffer, holding monotonical values + public long[] longs; + // metadata buffer, holding general values + public byte[] bytes; + ByteArrayDataInput bytesReader; + public Frame(int ord) throws IOException { this.ord = ord; - state = postingsReader.newTermState(); - state.totalTermFreq = -1; + this.state = postingsReader.newTermState(); + this.state.totalTermFreq = -1; + this.longs = new long[longsSize]; } public void setFloorData(ByteArrayDataInput in, BytesRef source) { @@ -2391,7 +2422,17 @@ public class BlockTreeTermsReader extends FieldsProducer { // TODO: we could skip this if !hasTerms; but // that's rare so won't help much - postingsReader.readTermsBlock(in, fieldInfo, state); + // metadata + numBytes = in.readVInt(); + if (bytes == null) { + bytes = new 
byte[ArrayUtil.oversize(numBytes, 1)]; + bytesReader = new ByteArrayDataInput(); + } else if (bytes.length < numBytes) { + bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + in.readBytes(bytes, 0, numBytes); + bytesReader.reset(bytes, 0, numBytes); + // Sub-blocks of a single floor block are always // written one after another -- tail recurse: @@ -2575,12 +2616,9 @@ public class BlockTreeTermsReader extends FieldsProducer { // lazily catch up on metadata decode: final int limit = getTermBlockOrd(); + boolean absolute = metaDataUpto == 0; assert limit > 0; - // We must set/incr state.termCount because - // postings impl can look at this - state.termBlockOrd = metaDataUpto; - // TODO: better API would be "jump straight to term=N"??? while (metaDataUpto < limit) { @@ -2592,17 +2630,24 @@ public class BlockTreeTermsReader extends FieldsProducer { // TODO: if docFreq were bulk decoded we could // just skipN here: + + // stats state.docFreq = statsReader.readVInt(); //if (DEBUG) System.out.println(" dF=" + state.docFreq); if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { state.totalTermFreq = state.docFreq + statsReader.readVLong(); //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); } + // metadata + for (int i = 0; i < longsSize; i++) { + longs[i] = bytesReader.readVLong(); + } + postingsReader.decodeTerm(longs, bytesReader, fieldInfo, state, absolute); - postingsReader.nextTerm(fieldInfo, state); metaDataUpto++; - state.termBlockOrd++; + absolute = false; } + state.termBlockOrd = metaDataUpto; } // Used only by assert diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java index 0074894625c..bf1e1619427 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java @@ -104,13 +104,12 @@ import org.apache.lucene.util.packed.PackedInts; * and decoding the Postings Metadata and Term Metadata sections.
 *
 * <ul>
- *
- *    <li>TermsDict (.tim) --> Header, <i>Postings Metadata</i>, Block<sup>NumBlocks</sup>,
+ *    <li>TermsDict (.tim) --> Header, <i>Postings Header</i>, NodeBlock<sup>NumBlocks</sup>,
 *        FieldSummary, DirOffset</li>
- *    <li>Block --> SuffixBlock, StatsBlock, MetadataBlock</li>
- *    <li>SuffixBlock --> EntryCount, SuffixLength, Byte<sup>SuffixLength</sup></li>
- *    <li>StatsBlock --> StatsLength, <DocFreq, TotalTermFreq><sup>EntryCount</sup></li>
- *    <li>MetadataBlock --> MetaLength, <<i>Term Metadata</i>><sup>EntryCount</sup></li>
+ *    <li>NodeBlock --> (OuterNode | InnerNode)</li>
+ *    <li>OuterNode --> EntryCount, SuffixLength, Byte<sup>SuffixLength</sup>, StatsLength, < TermStats ><sup>EntryCount</sup>, MetaLength, <<i>Term Metadata</i>><sup>EntryCount</sup></li>
+ *    <li>InnerNode --> EntryCount, SuffixLength[,Sub?], Byte<sup>SuffixLength</sup>, StatsLength, < TermStats ? ><sup>EntryCount</sup>, MetaLength, <<i>Term Metadata ? </i>><sup>EntryCount</sup></li>
+ *    <li>TermStats --> DocFreq, TotalTermFreq</li>
 *    <li>FieldSummary --> NumFields, <FieldNumber, NumTerms, RootCodeLength, Byte<sup>RootCodeLength</sup>,
 *        SumDocFreq, DocCount><sup>NumFields</sup></li>
 *    <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
@@ -136,7 +135,9 @@ import org.apache.lucene.util.packed.PackedInts;
 *    <li>DocCount is the number of documents that have at least one posting for this field.</li>
 *    <li>PostingsMetadata and TermMetadata are plugged into by the specific postings implementation:
 *        these contain arbitrary per-file data (such as parameters or versioning information)
- *        and per-term data (such as pointers to inverted files).
+ *        and per-term data (such as pointers to inverted files).</li>
+ *    <li>For inner nodes of the tree, every entry will steal one bit to mark whether it points
+ *        to child nodes(sub-block). If so, the corresponding TermStats and TermMetaData are omitted</li>
 * </ul>
 *
 * <h3>Term Index</h3>
@@ -237,8 +238,9 @@ public class BlockTreeTermsWriter extends FieldsConsumer { public final long sumTotalTermFreq; public final long sumDocFreq; public final int docCount; + private final int longsSize; - public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount) { + public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) { assert numTerms > 0; this.fieldInfo = fieldInfo; assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms; @@ -248,6 +250,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; + this.longsSize = longsSize; } } @@ -300,7 +303,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { // System.out.println("BTW.init seg=" + state.segmentName); - postingsWriter.start(out); // have consumer write its format/header + postingsWriter.init(out); // have consumer write its format/header success = true; } finally { if (!success) { @@ -354,12 +357,13 @@ public class BlockTreeTermsWriter extends FieldsConsumer { private static final class PendingTerm extends PendingEntry { public final BytesRef term; - public final TermStats stats; + // stats + metadata + public final BlockTermState state; - public PendingTerm(BytesRef term, TermStats stats) { + public PendingTerm(BytesRef term, BlockTermState state) { super(true); this.term = term; - this.stats = stats; + this.state = state; } @Override @@ -480,6 +484,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { class TermsWriter extends TermsConsumer { private final FieldInfo fieldInfo; + private final int longsSize; private long numTerms; long sumTotalTermFreq; long sumDocFreq; @@ -839,11 +844,16 @@ public class BlockTreeTermsWriter extends FieldsConsumer { final List> subIndices; int termCount; + + long[] longs = new long[longsSize]; + boolean absolute = true; + if (isLeafBlock) { subIndices = null; for (PendingEntry ent : slice) { assert ent.isTerm; PendingTerm term = (PendingTerm) ent; + BlockTermState state = term.state; final int suffix = term.term.length - prefixLength; // if (DEBUG) { // BytesRef suffixBytes = new BytesRef(suffix); @@ -852,15 +862,25 @@ public class BlockTreeTermsWriter extends FieldsConsumer { // System.out.println(" write term suffix=" + suffixBytes); // } // For leaf block we write suffix straight - bytesWriter.writeVInt(suffix); - bytesWriter.writeBytes(term.term.bytes, prefixLength, suffix); + suffixWriter.writeVInt(suffix); + suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix); // Write term stats, to separate byte[] blob: - bytesWriter2.writeVInt(term.stats.docFreq); + statsWriter.writeVInt(state.docFreq); if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { - assert term.stats.totalTermFreq >= term.stats.docFreq: term.stats.totalTermFreq + " vs " + term.stats.docFreq; - bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq); + assert state.totalTermFreq >= state.docFreq: state.totalTermFreq + " vs " + state.docFreq; + statsWriter.writeVLong(state.totalTermFreq - state.docFreq); } + + // Write term meta data + postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); + for (int pos = 0; pos < longsSize; pos++) { + assert longs[pos] >= 0; + metaWriter.writeVLong(longs[pos]); + } + bytesWriter.writeTo(metaWriter); + 
bytesWriter.reset(); + absolute = false; } termCount = length; } else { @@ -869,6 +889,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { for (PendingEntry ent : slice) { if (ent.isTerm) { PendingTerm term = (PendingTerm) ent; + BlockTermState state = term.state; final int suffix = term.term.length - prefixLength; // if (DEBUG) { // BytesRef suffixBytes = new BytesRef(suffix); @@ -878,16 +899,34 @@ public class BlockTreeTermsWriter extends FieldsConsumer { // } // For non-leaf block we borrow 1 bit to record // if entry is term or sub-block - bytesWriter.writeVInt(suffix<<1); - bytesWriter.writeBytes(term.term.bytes, prefixLength, suffix); + suffixWriter.writeVInt(suffix<<1); + suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix); // Write term stats, to separate byte[] blob: - bytesWriter2.writeVInt(term.stats.docFreq); + statsWriter.writeVInt(state.docFreq); if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { - assert term.stats.totalTermFreq >= term.stats.docFreq; - bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq); + assert state.totalTermFreq >= state.docFreq; + statsWriter.writeVLong(state.totalTermFreq - state.docFreq); } + // TODO: now that terms dict "sees" these longs, + // we can explore better column-stride encodings + // to encode all long[0]s for this block at + // once, all long[1]s, etc., e.g. using + // Simple64. Alternatively, we could interleave + // stats + meta ... no reason to have them + // separate anymore: + + // Write term meta data + postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); + for (int pos = 0; pos < longsSize; pos++) { + assert longs[pos] >= 0; + metaWriter.writeVLong(longs[pos]); + } + bytesWriter.writeTo(metaWriter); + bytesWriter.reset(); + absolute = false; + termCount++; } else { PendingBlock block = (PendingBlock) ent; @@ -897,8 +936,8 @@ public class BlockTreeTermsWriter extends FieldsConsumer { // For non-leaf block we borrow 1 bit to record // if entry is term or sub-block - bytesWriter.writeVInt((suffix<<1)|1); - bytesWriter.writeBytes(block.prefix.bytes, prefixLength, suffix); + suffixWriter.writeVInt((suffix<<1)|1); + suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix); assert block.fp < startFP; // if (DEBUG) { @@ -908,7 +947,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { // System.out.println(" write sub-block suffix=" + toString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor); // } - bytesWriter.writeVLong(startFP - block.fp); + suffixWriter.writeVLong(startFP - block.fp); subIndices.add(block.index); } } @@ -921,17 +960,19 @@ public class BlockTreeTermsWriter extends FieldsConsumer { // search on lookup // Write suffixes byte[] blob to terms dict output: - out.writeVInt((int) (bytesWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0)); - bytesWriter.writeTo(out); - bytesWriter.reset(); + out.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 
1:0)); + suffixWriter.writeTo(out); + suffixWriter.reset(); // Write term stats byte[] blob - out.writeVInt((int) bytesWriter2.getFilePointer()); - bytesWriter2.writeTo(out); - bytesWriter2.reset(); + out.writeVInt((int) statsWriter.getFilePointer()); + statsWriter.writeTo(out); + statsWriter.reset(); - // Have postings writer write block - postingsWriter.flushTermsBlock(futureTermCount+termCount, termCount); + // Write term meta data byte[] blob + out.writeVInt((int) metaWriter.getFilePointer()); + metaWriter.writeTo(out); + metaWriter.reset(); // Remove slice replaced by block: slice.clear(); @@ -967,7 +1008,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { PackedInts.COMPACT, true, 15); - postingsWriter.setField(fieldInfo); + this.longsSize = postingsWriter.setField(fieldInfo); } @Override @@ -998,8 +1039,13 @@ public class BlockTreeTermsWriter extends FieldsConsumer { //if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment + " df=" + stats.docFreq); blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput()); - pending.add(new PendingTerm(BytesRef.deepCopyOf(text), stats)); - postingsWriter.finishTerm(stats); + BlockTermState state = postingsWriter.newTermState(); + state.docFreq = stats.docFreq; + state.totalTermFreq = stats.totalTermFreq; + postingsWriter.finishTerm(state); + + PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state); + pending.add(term); numTerms++; } @@ -1038,7 +1084,8 @@ public class BlockTreeTermsWriter extends FieldsConsumer { indexStartFP, sumTotalTermFreq, sumDocFreq, - docCount)); + docCount, + longsSize)); } else { assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1; assert sumDocFreq == 0; @@ -1046,8 +1093,10 @@ public class BlockTreeTermsWriter extends FieldsConsumer { } } + private final RAMOutputStream suffixWriter = new RAMOutputStream(); + private final RAMOutputStream statsWriter = new RAMOutputStream(); + private final RAMOutputStream metaWriter = new RAMOutputStream(); private final RAMOutputStream bytesWriter = new RAMOutputStream(); - private final RAMOutputStream bytesWriter2 = new RAMOutputStream(); } @Override @@ -1072,6 +1121,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { } out.writeVLong(field.sumDocFreq); out.writeVInt(field.docCount); + out.writeVInt(field.longsSize); indexOut.writeVLong(field.indexStartFP); } writeTrailer(out, dirStart); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java index b8ea7f261ae..58c7a87253c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.DataInput; import org.apache.lucene.util.Bits; /** The core terms dictionaries (BlockTermsReader, @@ -55,7 +56,7 @@ public abstract class PostingsReaderBase implements Closeable { public abstract BlockTermState newTermState() throws IOException; /** Actually decode metadata for next term */ - public abstract void nextTerm(FieldInfo fieldInfo, BlockTermState state) throws IOException; + public abstract void decodeTerm(long[] longs, DataInput in, FieldInfo 
fieldInfo, BlockTermState state, boolean absolute) throws IOException; /** Must fully consume state, since after this call that * TermState may be reused. */ @@ -68,9 +69,4 @@ public abstract class PostingsReaderBase implements Closeable { @Override public abstract void close() throws IOException; - - /** Reads data for all terms in the next block; this - * method should merely load the byte[] blob but not - * decode, which is done in {@link #nextTerm}. */ - public abstract void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState termState) throws IOException; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java index 0ed53e754de..3aeb3d3dccc 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java @@ -20,6 +20,7 @@ package org.apache.lucene.codecs; import java.io.IOException; import java.io.Closeable; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.index.FieldInfo; @@ -48,25 +49,31 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Clo /** Called once after startup, before any terms have been * added. Implementations typically write a header to * the provided {@code termsOut}. */ - public abstract void start(IndexOutput termsOut) throws IOException; + public abstract void init(IndexOutput termsOut) throws IOException; + + /** Return a newly created empty TermState */ + public abstract BlockTermState newTermState() throws IOException; /** Start a new term. Note that a matching call to {@link - * #finishTerm(TermStats)} is done, only if the term has at least one + * #finishTerm(long[], DataOutput, TermStats)} is done, only if the term has at least one * document. */ public abstract void startTerm() throws IOException; - /** Flush count terms starting at start "backwards", as a - * block. start is a negative offset from the end of the - * terms stack, ie bigger start means further back in - * the stack. */ - public abstract void flushTermsBlock(int start, int count) throws IOException; - /** Finishes the current term. The provided {@link - * TermStats} contains the term's summary statistics. */ - public abstract void finishTerm(TermStats stats) throws IOException; + * BlockTermState} contains the term's summary statistics, + * and will holds metadata from PBF when returned */ + public abstract void finishTerm(BlockTermState state) throws IOException; - /** Called when the writing switches to another field. */ - public abstract void setField(FieldInfo fieldInfo); + /** + * Encode metadata as long[] and byte[]. {@code absolute} controls + * whether current term is delta encoded according to latest term. + */ + public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException; + + /** + * Return the fixed length of longs, + * called when the writing switches to another field. 
*/ + public abstract int setField(FieldInfo fieldInfo); @Override public abstract void close() throws IOException; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java index 9c2c86fc3fc..016cff7c332 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java @@ -32,6 +32,7 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.TermState; import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.DataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -121,11 +122,6 @@ public class Lucene40PostingsReader extends PostingsReaderBase { long proxOffset; long skipOffset; - // Only used by the "primary" TermState -- clones don't - // copy this (basically they are "transient"): - ByteArrayDataInput bytesReader; // TODO: should this NOT be in the TermState...? - byte[] bytes; - @Override public StandardTermState clone() { StandardTermState other = new StandardTermState(); @@ -140,11 +136,6 @@ public class Lucene40PostingsReader extends PostingsReaderBase { freqOffset = other.freqOffset; proxOffset = other.proxOffset; skipOffset = other.skipOffset; - - // Do not copy bytes, bytesReader (else TermState is - // very heavy, ie drags around the entire block's - // byte[]). On seek back, if next() is in fact used - // (rare!), they will be re-read from disk. } @Override @@ -171,38 +162,18 @@ public class Lucene40PostingsReader extends PostingsReaderBase { } } - /* Reads but does not decode the byte[] blob holding - metadata for the current terms block */ @Override - public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException { - final StandardTermState termState = (StandardTermState) _termState; - - final int len = termsIn.readVInt(); - - // if (DEBUG) System.out.println(" SPR.readTermsBlock bytes=" + len + " ts=" + _termState); - if (termState.bytes == null) { - termState.bytes = new byte[ArrayUtil.oversize(len, 1)]; - termState.bytesReader = new ByteArrayDataInput(); - } else if (termState.bytes.length < len) { - termState.bytes = new byte[ArrayUtil.oversize(len, 1)]; - } - - termsIn.readBytes(termState.bytes, 0, len); - termState.bytesReader.reset(termState.bytes, 0, len); - } - - @Override - public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) + public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException { final StandardTermState termState = (StandardTermState) _termState; // if (DEBUG) System.out.println("SPR: nextTerm seg=" + segment + " tbOrd=" + termState.termBlockOrd + " bytesReader.fp=" + termState.bytesReader.getPosition()); final boolean isFirstTerm = termState.termBlockOrd == 0; - - if (isFirstTerm) { - termState.freqOffset = termState.bytesReader.readVLong(); - } else { - termState.freqOffset += termState.bytesReader.readVLong(); + if (absolute) { + termState.freqOffset = 0; + termState.proxOffset = 0; } + + termState.freqOffset += in.readVLong(); /* if (DEBUG) { System.out.println(" dF=" + termState.docFreq); @@ -212,7 +183,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase { assert termState.freqOffset < 
freqIn.length(); if (termState.docFreq >= skipMinimum) { - termState.skipOffset = termState.bytesReader.readVLong(); + termState.skipOffset = in.readVLong(); // if (DEBUG) System.out.println(" skipOffset=" + termState.skipOffset + " vs freqIn.length=" + freqIn.length()); assert termState.freqOffset + termState.skipOffset < freqIn.length(); } else { @@ -220,11 +191,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase { } if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) { - if (isFirstTerm) { - termState.proxOffset = termState.bytesReader.readVLong(); - } else { - termState.proxOffset += termState.bytesReader.readVLong(); - } + termState.proxOffset += in.readVLong(); // if (DEBUG) System.out.println(" proxFP=" + termState.proxOffset); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java index 500ab204d55..92b4880ac4b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java @@ -152,11 +152,6 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { // freq is always implicitly totalTermFreq in this case. int singletonDocID; - // Only used by the "primary" TermState -- clones don't - // copy this (basically they are "transient"): - ByteArrayDataInput bytesReader; // TODO: should this NOT be in the TermState...? - byte[] bytes; - @Override public IntBlockTermState clone() { IntBlockTermState other = new IntBlockTermState(); @@ -174,11 +169,6 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { lastPosBlockOffset = other.lastPosBlockOffset; skipOffset = other.skipOffset; singletonDocID = other.singletonDocID; - - // Do not copy bytes, bytesReader (else TermState is - // very heavy, ie drags around the entire block's - // byte[]). On seek back, if next() is in fact used - // (rare!), they will be re-read from disk. 
} @Override @@ -197,78 +187,37 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { IOUtils.close(docIn, posIn, payIn); } - /* Reads but does not decode the byte[] blob holding - metadata for the current terms block */ @Override - public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException { - final IntBlockTermState termState = (IntBlockTermState) _termState; - - final int numBytes = termsIn.readVInt(); - - if (termState.bytes == null) { - termState.bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; - termState.bytesReader = new ByteArrayDataInput(); - } else if (termState.bytes.length < numBytes) { - termState.bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; - } - - termsIn.readBytes(termState.bytes, 0, numBytes); - termState.bytesReader.reset(termState.bytes, 0, numBytes); - } - - @Override - public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) + public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException { final IntBlockTermState termState = (IntBlockTermState) _termState; - final boolean isFirstTerm = termState.termBlockOrd == 0; final boolean fieldHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; final boolean fieldHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; final boolean fieldHasPayloads = fieldInfo.hasPayloads(); - final DataInput in = termState.bytesReader; - if (isFirstTerm) { - if (termState.docFreq == 1) { - termState.singletonDocID = in.readVInt(); - termState.docStartFP = 0; - } else { - termState.singletonDocID = -1; - termState.docStartFP = in.readVLong(); - } - if (fieldHasPositions) { - termState.posStartFP = in.readVLong(); - if (termState.totalTermFreq > BLOCK_SIZE) { - termState.lastPosBlockOffset = in.readVLong(); - } else { - termState.lastPosBlockOffset = -1; - } - if ((fieldHasPayloads || fieldHasOffsets) && termState.totalTermFreq >= BLOCK_SIZE) { - termState.payStartFP = in.readVLong(); - } else { - termState.payStartFP = -1; - } + // nocommit: use old version + if (absolute) { + termState.docStartFP = 0; + termState.posStartFP = 0; + termState.payStartFP = 0; + } + termState.docStartFP += longs[0]; + if (fieldHasPositions) { + termState.posStartFP += longs[1]; + if (fieldHasOffsets || fieldHasPayloads) { + termState.payStartFP += longs[2]; } + } + if (termState.docFreq == 1) { + termState.singletonDocID = in.readVInt(); } else { - if (termState.docFreq == 1) { - termState.singletonDocID = in.readVInt(); + termState.singletonDocID = -1; + } + if (fieldHasPositions) { + if (termState.totalTermFreq > BLOCK_SIZE) { + termState.lastPosBlockOffset = in.readVLong(); } else { - termState.singletonDocID = -1; - termState.docStartFP += in.readVLong(); - } - if (fieldHasPositions) { - termState.posStartFP += in.readVLong(); - if (termState.totalTermFreq > BLOCK_SIZE) { - termState.lastPosBlockOffset = in.readVLong(); - } else { - termState.lastPosBlockOffset = -1; - } - if ((fieldHasPayloads || fieldHasOffsets) && termState.totalTermFreq >= BLOCK_SIZE) { - long delta = in.readVLong(); - if (termState.payStartFP == -1) { - termState.payStartFP = delta; - } else { - termState.payStartFP += delta; - } - } + termState.lastPosBlockOffset = -1; } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java index 9e8728ff979..e020fc4ff92 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java @@ -25,14 +25,15 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.TermStats; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.ArrayUtil; @@ -71,7 +72,8 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase { final IndexOutput posOut; final IndexOutput payOut; - private IndexOutput termsOut; + final static IntBlockTermState emptyState = new IntBlockTermState(); + IntBlockTermState lastState; // How current field indexes postings: private boolean fieldHasFreqs; @@ -79,7 +81,7 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase { private boolean fieldHasOffsets; private boolean fieldHasPayloads; - // Holds starting file pointers for each term: + // Holds starting file pointers for current term: private long docTermStartFP; private long posTermStartFP; private long payTermStartFP; @@ -188,21 +190,50 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase { this(state, PackedInts.COMPACT); } + private final static class IntBlockTermState extends BlockTermState { + long docTermStartFP = 0; + long posTermStartFP = 0; + long payTermStartFP = 0; + long skipOffset = -1; + long lastPosBlockOffset = -1; + int singletonDocID = -1; + @Override + public String toString() { + return super.toString() + " docStartFP=" + docTermStartFP + " posStartFP=" + posTermStartFP + " payStartFP=" + payTermStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID; + } + } + @Override - public void start(IndexOutput termsOut) throws IOException { - this.termsOut = termsOut; + public IntBlockTermState newTermState() { + return new IntBlockTermState(); + } + + @Override + public void init(IndexOutput termsOut) throws IOException { CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT); termsOut.writeVInt(BLOCK_SIZE); } + // nocommit better name? 
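The init/newTermState/setField methods introduced here only make sense from the term dictionary's side, so a hypothetical driver is sketched below; the class and method names are assumptions, the field is assumed to index docs and freqs only, and the real callers are BlockTreeTermsWriter and BlockTermsWriter:

    import java.io.IOException;
    import org.apache.lucene.codecs.BlockTermState;
    import org.apache.lucene.codecs.PostingsWriterBase;
    import org.apache.lucene.index.FieldInfo;
    import org.apache.lucene.store.IndexOutput;

    final class WriterLifecycleSketch {
      /** Hypothetical driver: writes one term's postings for a docs+freqs field. */
      static BlockTermState writeOneTerm(PostingsWriterBase pw, IndexOutput termsOut,
                                         FieldInfo field, int[] docIDs, int[] freqs) throws IOException {
        pw.init(termsOut);                  // once per segment: codec header + block size
        pw.setField(field);                 // once per field: returns the per-term longs count
        BlockTermState state = pw.newTermState();
        pw.startTerm();
        for (int i = 0; i < docIDs.length; i++) {
          pw.startDoc(docIDs[i], freqs[i]); // no positions in this sketch
          pw.finishDoc();
        }
        state.docFreq = docIDs.length;      // the dictionary fills the stats before finishTerm
        pw.finishTerm(state);               // the codec records its file pointers into the state
        return state;
      }
    }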
+ @Override - public void setField(FieldInfo fieldInfo) { + public int setField(FieldInfo fieldInfo) { IndexOptions indexOptions = fieldInfo.getIndexOptions(); fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; fieldHasPayloads = fieldInfo.hasPayloads(); skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads); + lastState = emptyState; + if (fieldHasPositions) { + if (fieldHasPayloads || fieldHasOffsets) { + return 3; // doc + pos + pay FP + } else { + return 2; // doc + pos FP + } + } else { + return 1; // doc FP + } } @Override @@ -348,37 +379,18 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase { } } - private static class PendingTerm { - public final long docStartFP; - public final long posStartFP; - public final long payStartFP; - public final long skipOffset; - public final long lastPosBlockOffset; - public final int singletonDocID; - - public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset, int singletonDocID) { - this.docStartFP = docStartFP; - this.posStartFP = posStartFP; - this.payStartFP = payStartFP; - this.skipOffset = skipOffset; - this.lastPosBlockOffset = lastPosBlockOffset; - this.singletonDocID = singletonDocID; - } - } - - private final List pendingTerms = new ArrayList(); - /** Called when we are done adding docs to this term */ @Override - public void finishTerm(TermStats stats) throws IOException { - assert stats.docFreq > 0; + public void finishTerm(BlockTermState _state) throws IOException { + IntBlockTermState state = (IntBlockTermState) _state; + assert state.docFreq > 0; // TODO: wasteful we are counting this (counting # docs // for this term) in two places? - assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount; + assert state.docFreq == docCount: state.docFreq + " vs " + docCount; // if (DEBUG) { - // System.out.println("FPW.finishTerm docFreq=" + stats.docFreq); + // System.out.println("FPW.finishTerm docFreq=" + state.docFreq); // } // if (DEBUG) { @@ -389,7 +401,7 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase { // docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to it. final int singletonDocID; - if (stats.docFreq == 1) { + if (state.docFreq == 1) { // pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq singletonDocID = docDeltaBuffer[0]; } else { @@ -420,8 +432,8 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase { // totalTermFreq is just total number of positions(or payloads, or offsets) // associated with current term. 
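The value returned from setField above is consumed on the dictionary side: the term dictionary sizes one reusable long[] per field and serializes whatever the codec puts into it, term by term. A sketch of that consumer follows, with hypothetical names (the real logic lives in the BlockTree/BlockTerms writers) and with the codec's opaque bytes and the longs written to the same output purely for brevity:

    import java.io.IOException;
    import java.util.List;
    import org.apache.lucene.codecs.BlockTermState;
    import org.apache.lucene.codecs.PostingsWriterBase;
    import org.apache.lucene.index.FieldInfo;
    import org.apache.lucene.store.DataOutput;

    final class MetadataBlockSketch {
      static void writeTermMetadata(PostingsWriterBase pw, FieldInfo field,
                                    List<BlockTermState> terms, DataOutput metaOut) throws IOException {
        final int longsSize = pw.setField(field);  // Lucene41: 1 (doc FP), 2 (+pos FP) or 3 (+pay FP)
        final long[] longs = new long[longsSize];
        boolean absolute = true;                   // first term of the block gets absolute values
        for (BlockTermState term : terms) {
          pw.encodeTerm(longs, metaOut, field, term, absolute);
          for (long fp : longs) {
            metaOut.writeVLong(fp);                // the dictionary, not the codec, owns the bytes
          }
          absolute = false;
        }
      }
    }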
- assert stats.totalTermFreq != -1; - if (stats.totalTermFreq > BLOCK_SIZE) { + assert state.totalTermFreq != -1; + if (state.totalTermFreq > BLOCK_SIZE) { // record file offset for last pos in last block lastPosBlockOffset = posOut.getFilePointer() - posTermStartFP; } else { @@ -486,7 +498,7 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase { } } // if (DEBUG) { - // System.out.println(" totalTermFreq=" + stats.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset); + // System.out.println(" totalTermFreq=" + state.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset); // } } else { lastPosBlockOffset = -1; @@ -505,76 +517,48 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase { // System.out.println(" no skip: docCount=" + docCount); // } } - - long payStartFP; - if (stats.totalTermFreq >= BLOCK_SIZE) { - payStartFP = payTermStartFP; - } else { - payStartFP = -1; - } - // if (DEBUG) { // System.out.println(" payStartFP=" + payStartFP); // } - - pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset, singletonDocID)); + state.docTermStartFP = docTermStartFP; + state.posTermStartFP = posTermStartFP; + state.payTermStartFP = payTermStartFP; + state.singletonDocID = singletonDocID; + state.skipOffset = skipOffset; + state.lastPosBlockOffset = lastPosBlockOffset; docBufferUpto = 0; posBufferUpto = 0; lastDocID = 0; docCount = 0; } - - private final RAMOutputStream bytesWriter = new RAMOutputStream(); + + // nocommit explain about the "don't care" values @Override - public void flushTermsBlock(int start, int count) throws IOException { - - if (count == 0) { - termsOut.writeByte((byte) 0); - return; + public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { + IntBlockTermState state = (IntBlockTermState)_state; + if (absolute) { + lastState = emptyState; } - - assert start <= pendingTerms.size(); - assert count <= start; - - final int limit = pendingTerms.size() - start + count; - - long lastDocStartFP = 0; - long lastPosStartFP = 0; - long lastPayStartFP = 0; - for(int idx=limit-count; idx= 0; storePayloads = fieldInfo.hasPayloads(); + lastState = emptyState; //System.out.println(" set init blockFreqStart=" + freqStart); //System.out.println(" set init blockProxStart=" + proxStart); + return 0; } int lastDocID; @@ -265,94 +276,48 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase { public void finishDoc() { } - private static class PendingTerm { - public final long freqStart; - public final long proxStart; - public final long skipOffset; - - public PendingTerm(long freqStart, long proxStart, long skipOffset) { - this.freqStart = freqStart; - this.proxStart = proxStart; - this.skipOffset = skipOffset; - } + private static class StandardTermState extends BlockTermState { + public long freqStart; + public long proxStart; + public long skipOffset; } - private final List pendingTerms = new ArrayList(); - /** Called when we are done adding docs to this term */ @Override - public void finishTerm(TermStats stats) throws IOException { - + public void finishTerm(BlockTermState _state) throws IOException { + StandardTermState state = (StandardTermState)_state; // if (DEBUG) System.out.println("SPW: finishTerm seg=" + segment + " freqStart=" + freqStart); - assert stats.docFreq > 0; + assert state.docFreq > 0; // TODO: wasteful we are counting this (counting # docs // for this term) in two places? 
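Both encodeTerm implementations in this patch lean on the same lastState/emptyState idiom: file pointers are emitted as deltas against the previous term, and absolute=true resets the baseline to zero at block boundaries. A standalone plain-Java illustration of the idiom, with hypothetical names and no Lucene types:

    final class DeltaBaselineSketch {
      private long lastFreqStart;   // both start at 0, i.e. the "emptyState"
      private long lastProxStart;

      /** Returns {freqStart delta, proxStart delta} for one term. */
      long[] encode(long freqStart, long proxStart, boolean absolute) {
        if (absolute) {             // first term written against a fresh baseline
          lastFreqStart = 0;
          lastProxStart = 0;
        }
        long[] deltas = { freqStart - lastFreqStart, proxStart - lastProxStart };
        lastFreqStart = freqStart;
        lastProxStart = proxStart;
        return deltas;
      }
    }

Since file pointers only grow, the deltas stay non-negative and stay small within a block, which keeps the VLong-coded metadata compact without the codec having to buffer a whole block of pending terms.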
- assert stats.docFreq == df; - - final long skipOffset; + assert state.docFreq == df; + state.freqStart = freqStart; + state.proxStart = proxStart; if (df >= skipMinimum) { - skipOffset = skipListWriter.writeSkip(freqOut)-freqStart; + state.skipOffset = skipListWriter.writeSkip(freqOut)-freqStart; } else { - skipOffset = -1; + state.skipOffset = -1; } - - pendingTerms.add(new PendingTerm(freqStart, proxStart, skipOffset)); - lastDocID = 0; df = 0; } - private final RAMOutputStream bytesWriter = new RAMOutputStream(); - @Override - public void flushTermsBlock(int start, int count) throws IOException { - //if (DEBUG) System.out.println("SPW: flushTermsBlock start=" + start + " count=" + count + " left=" + (pendingTerms.size()-count) + " pendingTerms.size()=" + pendingTerms.size()); - - if (count == 0) { - termsOut.writeByte((byte) 0); - return; + public void encodeTerm(long[] empty, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { + StandardTermState state = (StandardTermState)_state; + if (absolute) { + lastState = emptyState; } - - assert start <= pendingTerms.size(); - assert count <= start; - - final int limit = pendingTerms.size() - start + count; - final PendingTerm firstTerm = pendingTerms.get(limit - count); - // First term in block is abs coded: - bytesWriter.writeVLong(firstTerm.freqStart); - - if (firstTerm.skipOffset != -1) { - assert firstTerm.skipOffset > 0; - bytesWriter.writeVLong(firstTerm.skipOffset); + out.writeVLong(state.freqStart - lastState.freqStart); + if (state.skipOffset != -1) { + assert state.skipOffset > 0; + out.writeVLong(state.skipOffset); } if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) { - bytesWriter.writeVLong(firstTerm.proxStart); + out.writeVLong(state.proxStart - lastState.proxStart); } - long lastFreqStart = firstTerm.freqStart; - long lastProxStart = firstTerm.proxStart; - for(int idx=limit-count+1; idx