From 1621816d81904e98ab807893e92cff648774aaa3 Mon Sep 17 00:00:00 2001
From: Han Jiang
Date: Fri, 23 Aug 2013 14:34:47 +0000
Subject: [PATCH] LUCENE-3069: merge 'temp' codes back
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1516860 13f79535-47bb-0310-9956-ffa450edef68
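
This merge replaces the block-oriented term metadata API with a per-term one: on the write side
PostingsWriterBase.start() becomes init(), setField() now returns the number of long values each
term's metadata uses, finishTerm() fills a BlockTermState instead of consuming TermStats, and
flushTermsBlock() is replaced by encodeTerm(long[], DataOutput, FieldInfo, BlockTermState, boolean).
On the read side readTermsBlock()/nextTerm() collapse into decodeTerm(), and the terms dictionaries
now own the per-block metadata blobs themselves.

The fragment below is not part of the patch; it is a minimal sketch, assuming a single file pointer
carried in longs[0], of the absolute/delta contract that encodeTerm/decodeTerm implementations are
expected to follow (Lucene41PostingsReader.decodeTerm in this patch uses the same pattern):

    // Illustrative only: hypothetical names, not patch API.
    class TermMetaDeltaSketch {
      private long lastDocStartFP;        // writer side: last file pointer encoded

      void encodeTerm(long[] longs, long docStartFP, boolean absolute) {
        if (absolute) {
          lastDocStartFP = 0;             // first term of a block is encoded against zero
        }
        longs[0] = docStartFP - lastDocStartFP;   // delta (equals the absolute value when lastDocStartFP == 0)
        lastDocStartFP = docStartFP;
      }

      private long docStartFP;            // reader side: reconstructed file pointer

      void decodeTerm(long[] longs, boolean absolute) {
        if (absolute) {
          docStartFP = 0;                 // block boundary resets the accumulator
        }
        docStartFP += longs[0];
      }
    }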
---
.../codecs/blockterms/BlockTermsReader.java | 44 +++--
.../codecs/blockterms/BlockTermsWriter.java | 51 +++--
.../codecs/pulsing/PulsingPostingsFormat.java | 4 +-
.../codecs/pulsing/PulsingPostingsReader.java | 94 +++++----
.../codecs/pulsing/PulsingPostingsWriter.java | 163 +++++++++------
.../lucene/codecs/sep/SepPostingsReader.java | 55 ++----
.../lucene/codecs/sep/SepPostingsWriter.java | 187 ++++++++----------
.../lucene/codecs/BlockTreeTermsReader.java | 89 ++++++---
.../lucene/codecs/BlockTreeTermsWriter.java | 126 ++++++++----
.../lucene/codecs/PostingsReaderBase.java | 8 +-
.../lucene/codecs/PostingsWriterBase.java | 31 +--
.../lucene40/Lucene40PostingsReader.java | 51 +----
.../lucene41/Lucene41PostingsReader.java | 93 ++-------
.../lucene41/Lucene41PostingsWriter.java | 166 +++++++---------
.../lucene40/Lucene40PostingsWriter.java | 109 ++++------
.../mockrandom/MockRandomPostingsFormat.java | 59 +++++-
.../NestedPulsingPostingsFormat.java | 8 +-
.../util/TestRuleSetupAndRestoreClassEnv.java | 2 +-
18 files changed, 698 insertions(+), 642 deletions(-)
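
(Not part of the patch.) For orientation, the write-side loop that the terms dictionaries below now
share looks roughly like this hypothetical helper, condensed from the BlockTreeTermsWriter changes;
writeBlockMeta, termsInBlock and the passed-in writers are illustrative names, not patch API. The
ordering is the point: the long[] metadata is written as VLongs first, then the free-form byte[]
blob, and only the first entry of each block is encoded absolutely.

    import java.io.IOException;
    import org.apache.lucene.codecs.BlockTermState;
    import org.apache.lucene.codecs.PostingsWriterBase;
    import org.apache.lucene.index.FieldInfo;
    import org.apache.lucene.store.RAMOutputStream;

    // Hypothetical helper, written against the post-patch PostingsWriterBase API.
    final class BlockMetaSketch {
      static void writeBlockMeta(PostingsWriterBase postingsWriter, FieldInfo fieldInfo,
                                 BlockTermState[] termsInBlock, int longsSize,
                                 RAMOutputStream metaWriter) throws IOException {
        final long[] longs = new long[longsSize];                  // longsSize is what setField() returned
        final RAMOutputStream bytesWriter = new RAMOutputStream();
        boolean absolute = true;                                   // first term of the block is absolute
        for (BlockTermState state : termsInBlock) {
          postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute);
          for (int i = 0; i < longsSize; i++) {
            metaWriter.writeVLong(longs[i]);                       // monotonic part, as VLongs
          }
          bytesWriter.writeTo(metaWriter);                         // free-form byte[] part
          bytesWriter.reset();
          absolute = false;                                        // later terms delta-encode against the previous one
        }
      }
    }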
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java
index 7fa0e14d55c..2697cfebf97 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java
@@ -142,6 +142,7 @@ public class BlockTermsReader extends FieldsProducer {
final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
final long sumDocFreq = in.readVLong();
final int docCount = in.readVInt();
+ final int longsSize = in.readVInt();
if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
}
@@ -151,7 +152,7 @@ public class BlockTermsReader extends FieldsProducer {
if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + in + ")");
}
- FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount));
+ FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount, longsSize));
if (previous != null) {
throw new CorruptIndexException("duplicate fields: " + fieldInfo.name + " (resource=" + in + ")");
}
@@ -230,8 +231,9 @@ public class BlockTermsReader extends FieldsProducer {
final long sumTotalTermFreq;
final long sumDocFreq;
final int docCount;
+ final int longsSize;
- FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) {
+ FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;
@@ -239,6 +241,7 @@ public class BlockTermsReader extends FieldsProducer {
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
+ this.longsSize = longsSize;
}
@Override
@@ -326,6 +329,10 @@ public class BlockTermsReader extends FieldsProducer {
private final ByteArrayDataInput freqReader = new ByteArrayDataInput();
private int metaDataUpto;
+ private long[] longs;
+ private byte[] bytes;
+ private ByteArrayDataInput bytesReader;
+
public SegmentTermsEnum() throws IOException {
in = BlockTermsReader.this.in.clone();
in.seek(termsStartPointer);
@@ -339,6 +346,7 @@ public class BlockTermsReader extends FieldsProducer {
termSuffixes = new byte[128];
docFreqBytes = new byte[64];
//System.out.println("BTR.enum init this=" + this + " postingsReader=" + postingsReader);
+ longs = new long[longsSize];
}
@Override
@@ -415,7 +423,7 @@ public class BlockTermsReader extends FieldsProducer {
assert result;
indexIsCurrent = true;
- didIndexNext = false;
+ didIndexNext = false;
if (doOrd) {
state.ord = indexEnum.ord()-1;
@@ -789,12 +797,21 @@ public class BlockTermsReader extends FieldsProducer {
//System.out.println(" freq bytes len=" + len);
in.readBytes(docFreqBytes, 0, len);
freqReader.reset(docFreqBytes, 0, len);
+
+ // metadata
+ len = in.readVInt();
+ if (bytes == null) {
+ bytes = new byte[ArrayUtil.oversize(len, 1)];
+ bytesReader = new ByteArrayDataInput();
+ } else if (bytes.length < len) {
+ bytes = new byte[ArrayUtil.oversize(len, 1)];
+ }
+ in.readBytes(bytes, 0, len);
+ bytesReader.reset(bytes, 0, len);
+
metaDataUpto = 0;
-
state.termBlockOrd = 0;
- postingsReader.readTermsBlock(in, fieldInfo, state);
-
indexIsCurrent = false;
//System.out.println(" indexIsCurrent=" + indexIsCurrent);
@@ -811,9 +828,7 @@ public class BlockTermsReader extends FieldsProducer {
// lazily catch up on metadata decode:
final int limit = state.termBlockOrd;
- // We must set/incr state.termCount because
- // postings impl can look at this
- state.termBlockOrd = metaDataUpto;
+ boolean absolute = metaDataUpto == 0;
// TODO: better API would be "jump straight to term=N"???
while (metaDataUpto < limit) {
//System.out.println(" decode mdUpto=" + metaDataUpto);
@@ -825,16 +840,21 @@ public class BlockTermsReader extends FieldsProducer {
// TODO: if docFreq were bulk decoded we could
// just skipN here:
+
+ // docFreq, totalTermFreq
state.docFreq = freqReader.readVInt();
//System.out.println(" dF=" + state.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
state.totalTermFreq = state.docFreq + freqReader.readVLong();
//System.out.println(" totTF=" + state.totalTermFreq);
}
-
- postingsReader.nextTerm(fieldInfo, state);
+ // metadata
+ for (int i = 0; i < longs.length; i++) {
+ longs[i] = bytesReader.readVLong();
+ }
+ postingsReader.decodeTerm(longs, bytesReader, fieldInfo, state, absolute);
metaDataUpto++;
- state.termBlockOrd++;
+ absolute = false;
}
} else {
//System.out.println(" skip! seekPending");
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java
index 367e45e2b1e..7871332b92b 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java
@@ -27,6 +27,7 @@ import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
+import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
@@ -77,8 +78,9 @@ public class BlockTermsWriter extends FieldsConsumer {
public final long sumTotalTermFreq;
public final long sumDocFreq;
public final int docCount;
+ public final int longsSize;
- public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) {
+ public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
this.termsStartPointer = termsStartPointer;
@@ -86,6 +88,7 @@ public class BlockTermsWriter extends FieldsConsumer {
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
+ this.longsSize = longsSize;
}
}
@@ -109,7 +112,7 @@ public class BlockTermsWriter extends FieldsConsumer {
//System.out.println("BTW.init seg=" + state.segmentName);
- postingsWriter.start(out); // have consumer write its format/header
+ postingsWriter.init(out); // have consumer write its format/header
success = true;
} finally {
if (!success) {
@@ -148,6 +151,7 @@ public class BlockTermsWriter extends FieldsConsumer {
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
+ out.writeVInt(field.longsSize);
}
writeTrailer(dirStart);
} finally {
@@ -161,7 +165,7 @@ public class BlockTermsWriter extends FieldsConsumer {
private static class TermEntry {
public final BytesRef term = new BytesRef();
- public TermStats stats;
+ public BlockTermState state;
}
class TermsWriter extends TermsConsumer {
@@ -173,6 +177,7 @@ public class BlockTermsWriter extends FieldsConsumer {
long sumTotalTermFreq;
long sumDocFreq;
int docCount;
+ int longsSize;
private TermEntry[] pendingTerms;
@@ -190,8 +195,8 @@ public class BlockTermsWriter extends FieldsConsumer {
pendingTerms[i] = new TermEntry();
}
termsStartPointer = out.getFilePointer();
- postingsWriter.setField(fieldInfo);
this.postingsWriter = postingsWriter;
+ this.longsSize = postingsWriter.setField(fieldInfo);
}
@Override
@@ -237,11 +242,12 @@ public class BlockTermsWriter extends FieldsConsumer {
}
final TermEntry te = pendingTerms[pendingCount];
te.term.copyBytes(text);
- te.stats = stats;
+ te.state = postingsWriter.newTermState();
+ te.state.docFreq = stats.docFreq;
+ te.state.totalTermFreq = stats.totalTermFreq;
+ postingsWriter.finishTerm(te.state);
pendingCount++;
-
- postingsWriter.finishTerm(stats);
numTerms++;
}
@@ -264,7 +270,8 @@ public class BlockTermsWriter extends FieldsConsumer {
termsStartPointer,
sumTotalTermFreq,
sumDocFreq,
- docCount));
+ docCount,
+ longsSize));
}
}
@@ -285,6 +292,7 @@ public class BlockTermsWriter extends FieldsConsumer {
}
private final RAMOutputStream bytesWriter = new RAMOutputStream();
+ private final RAMOutputStream bufferWriter = new RAMOutputStream();
private void flushBlock() throws IOException {
//System.out.println("BTW.flushBlock seg=" + segment + " pendingCount=" + pendingCount + " fp=" + out.getFilePointer());
@@ -318,19 +326,34 @@ public class BlockTermsWriter extends FieldsConsumer {
// TODO: cutover to better intblock codec. simple64?
// write prefix, suffix first:
for(int termCount=0;termCount<pendingCount;termCount++) {
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java
+ private TreeMap<Integer, Integer> fields;
- public PulsingPostingsReader(PostingsReaderBase wrappedPostingsReader) {
+ public PulsingPostingsReader(SegmentReadState state, PostingsReaderBase wrappedPostingsReader) {
this.wrappedPostingsReader = wrappedPostingsReader;
+ this.segmentState = state;
}
@Override
public void init(IndexInput termsIn) throws IOException {
- CodecUtil.checkHeader(termsIn, PulsingPostingsWriter.CODEC,
- PulsingPostingsWriter.VERSION_START, PulsingPostingsWriter.VERSION_START);
+ version = CodecUtil.checkHeader(termsIn, PulsingPostingsWriter.CODEC,
+ PulsingPostingsWriter.VERSION_START,
+ PulsingPostingsWriter.VERSION_CURRENT);
maxPositions = termsIn.readVInt();
wrappedPostingsReader.init(termsIn);
+ if (wrappedPostingsReader instanceof PulsingPostingsReader ||
+ version < PulsingPostingsWriter.VERSION_META_ARRAY) {
+ fields = null;
+ } else {
+ fields = new TreeMap<Integer, Integer>();
+ String summaryFileName = IndexFileNames.segmentFileName(segmentState.segmentInfo.name, segmentState.segmentSuffix, PulsingPostingsWriter.SUMMARY_EXTENSION);
+ IndexInput in = null;
+ try {
+ in = segmentState.directory.openInput(summaryFileName, segmentState.context);
+ CodecUtil.checkHeader(in, PulsingPostingsWriter.CODEC, version,
+ PulsingPostingsWriter.VERSION_CURRENT);
+ int numField = in.readVInt();
+ for (int i = 0; i < numField; i++) {
+ int fieldNum = in.readVInt();
+ int longsSize = in.readVInt();
+ fields.put(fieldNum, longsSize);
+ }
+ } finally {
+ IOUtils.closeWhileHandlingException(in);
+ }
+ }
}
private static class PulsingTermState extends BlockTermState {
+ private boolean absolute = false;
+ private long[] longs;
private byte[] postings;
private int postingsSize; // -1 if this term was not inlined
private BlockTermState wrappedTermState;
- ByteArrayDataInput inlinedBytesReader;
- private byte[] inlinedBytes;
-
@Override
public PulsingTermState clone() {
PulsingTermState clone;
@@ -82,6 +112,11 @@ public class PulsingPostingsReader extends PostingsReaderBase {
} else {
assert wrappedTermState != null;
clone.wrappedTermState = (BlockTermState) wrappedTermState.clone();
+ clone.absolute = absolute;
+ if (longs != null) {
+ clone.longs = new long[longs.length];
+ System.arraycopy(longs, 0, clone.longs, 0, longs.length);
+ }
}
return clone;
}
@@ -99,11 +134,6 @@ public class PulsingPostingsReader extends PostingsReaderBase {
} else {
wrappedTermState.copyFrom(other.wrappedTermState);
}
-
- // NOTE: we do not copy the
- // inlinedBytes/inlinedBytesReader; these are only
- // stored on the "primary" TermState. They are
- // "transient" to cloned term states.
}
@Override
@@ -116,25 +146,6 @@ public class PulsingPostingsReader extends PostingsReaderBase {
}
}
- @Override
- public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
- //System.out.println("PR.readTermsBlock state=" + _termState);
- final PulsingTermState termState = (PulsingTermState) _termState;
- if (termState.inlinedBytes == null) {
- termState.inlinedBytes = new byte[128];
- termState.inlinedBytesReader = new ByteArrayDataInput();
- }
- int len = termsIn.readVInt();
- //System.out.println(" len=" + len + " fp=" + termsIn.getFilePointer());
- if (termState.inlinedBytes.length < len) {
- termState.inlinedBytes = new byte[ArrayUtil.oversize(len, 1)];
- }
- termsIn.readBytes(termState.inlinedBytes, 0, len);
- termState.inlinedBytesReader.reset(termState.inlinedBytes);
- termState.wrappedTermState.termBlockOrd = 0;
- wrappedPostingsReader.readTermsBlock(termsIn, fieldInfo, termState.wrappedTermState);
- }
-
@Override
public BlockTermState newTermState() throws IOException {
PulsingTermState state = new PulsingTermState();
@@ -143,20 +154,20 @@ public class PulsingPostingsReader extends PostingsReaderBase {
}
@Override
- public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
+ public void decodeTerm(long[] empty, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException {
//System.out.println("PR nextTerm");
PulsingTermState termState = (PulsingTermState) _termState;
-
+ assert empty.length == 0;
+ termState.absolute = termState.absolute || absolute;
// if we have positions, its total TF, otherwise its computed based on docFreq.
long count = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 ? termState.totalTermFreq : termState.docFreq;
//System.out.println(" count=" + count + " threshold=" + maxPositions);
if (count <= maxPositions) {
-
// Inlined into terms dict -- just read the byte[] blob in,
// but don't decode it now (we only decode when a DocsEnum
// or D&PEnum is pulled):
- termState.postingsSize = termState.inlinedBytesReader.readVInt();
+ termState.postingsSize = in.readVInt();
if (termState.postings == null || termState.postings.length < termState.postingsSize) {
termState.postings = new byte[ArrayUtil.oversize(termState.postingsSize, 1)];
}
@@ -164,16 +175,23 @@ public class PulsingPostingsReader extends PostingsReaderBase {
// (the blob holding all inlined terms' blobs for
// current term block) into another byte[] (just the
// blob for this term)...
- termState.inlinedBytesReader.readBytes(termState.postings, 0, termState.postingsSize);
+ in.readBytes(termState.postings, 0, termState.postingsSize);
//System.out.println(" inlined bytes=" + termState.postingsSize);
+ termState.absolute = termState.absolute || absolute;
} else {
//System.out.println(" not inlined");
+ final int longsSize = fields == null ? 0 : fields.get(fieldInfo.number);
+ if (termState.longs == null) {
+ termState.longs = new long[longsSize];
+ }
+ for (int i = 0; i < longsSize; i++) {
+ termState.longs[i] = in.readVLong();
+ }
termState.postingsSize = -1;
- // TODO: should we do full copyFrom? much heavier...?
termState.wrappedTermState.docFreq = termState.docFreq;
termState.wrappedTermState.totalTermFreq = termState.totalTermFreq;
- wrappedPostingsReader.nextTerm(fieldInfo, termState.wrappedTermState);
- termState.wrappedTermState.termBlockOrd++;
+ wrappedPostingsReader.decodeTerm(termState.longs, in, fieldInfo, termState.wrappedTermState, termState.absolute);
+ termState.absolute = false;
}
}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java
index 6ba0ef698cc..1228aeaa511 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java
@@ -21,14 +21,19 @@ import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
+import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
// TODO: we now inline based on total TF of the term,
// but it might be better to inline by "net bytes used"
@@ -49,26 +54,43 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
final static String CODEC = "PulsedPostingsWriter";
+ // recording field summary
+ final static String SUMMARY_EXTENSION = "smy";
+
// To add a new version, increment from the last one, and
// change VERSION_CURRENT to point to your new version:
final static int VERSION_START = 0;
- final static int VERSION_CURRENT = VERSION_START;
+ final static int VERSION_META_ARRAY = 1;
+ final static int VERSION_CURRENT = VERSION_META_ARRAY;
+
+ private SegmentWriteState segmentState;
private IndexOutput termsOut;
+ private List<FieldMetaData> fields;
+
private IndexOptions indexOptions;
private boolean storePayloads;
- private static class PendingTerm {
- private final byte[] bytes;
- public PendingTerm(byte[] bytes) {
- this.bytes = bytes;
+ // information for wrapped PF, in current field
+ private int longsSize;
+ private long[] longs;
+ boolean absolute;
+
+ private static class PulsingTermState extends BlockTermState {
+ private byte[] bytes;
+ private BlockTermState wrappedState;
+ @Override
+ public String toString() {
+ if (bytes != null) {
+ return "inlined";
+ } else {
+ return "not inlined wrapped=" + wrappedState;
+ }
}
}
- private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
-
// one entry per position
private final Position[] pending;
private int pendingCount = 0; // -1 once we've hit too many positions
@@ -83,6 +105,15 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
int endOffset;
}
+ private static final class FieldMetaData {
+ int fieldNumber;
+ int longsSize;
+ FieldMetaData(int number, int size) {
+ fieldNumber = number;
+ longsSize = size;
+ }
+ }
+
// TODO: -- lazy init this? ie, if every single term
// was inlined (eg for a "primary key" field) then we
// never need to use this fallback? Fallback writer for
@@ -92,23 +123,33 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
/** If the total number of positions (summed across all docs
* for this term) is <= maxPositions, then the postings are
* inlined into terms dict */
- public PulsingPostingsWriter(int maxPositions, PostingsWriterBase wrappedPostingsWriter) {
+ public PulsingPostingsWriter(SegmentWriteState state, int maxPositions, PostingsWriterBase wrappedPostingsWriter) {
+
pending = new Position[maxPositions];
for(int i=0;i<pending.length;i++) {
pending[i] = new Position();
}
+ fields = new ArrayList<FieldMetaData>();
// We simply wrap another postings writer, but only call
// on it when tot positions is >= the cutoff:
this.wrappedPostingsWriter = wrappedPostingsWriter;
+ this.segmentState = state;
}
@Override
- public void start(IndexOutput termsOut) throws IOException {
+ public void init(IndexOutput termsOut) throws IOException {
this.termsOut = termsOut;
CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
termsOut.writeVInt(pending.length); // encode maxPositions in header
- wrappedPostingsWriter.start(termsOut);
+ wrappedPostingsWriter.init(termsOut);
+ }
+
+ @Override
+ public BlockTermState newTermState() throws IOException {
+ PulsingTermState state = new PulsingTermState();
+ state.wrappedState = wrappedPostingsWriter.newTermState();
+ return state;
}
@Override
@@ -123,11 +164,15 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
// Currently, this instance is re-used across fields, so
// our parent calls setField whenever the field changes
@Override
- public void setField(FieldInfo fieldInfo) {
+ public int setField(FieldInfo fieldInfo) {
this.indexOptions = fieldInfo.getIndexOptions();
//if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
storePayloads = fieldInfo.hasPayloads();
- wrappedPostingsWriter.setField(fieldInfo);
+ absolute = false;
+ longsSize = wrappedPostingsWriter.setField(fieldInfo);
+ longs = new long[longsSize];
+ fields.add(new FieldMetaData(fieldInfo.number, longsSize));
+ return 0;
//DEBUG = BlockTreeTermsWriter.DEBUG;
}
@@ -219,18 +264,19 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
/** Called when we are done adding docs to this term */
@Override
- public void finishTerm(TermStats stats) throws IOException {
+ public void finishTerm(BlockTermState _state) throws IOException {
+ PulsingTermState state = (PulsingTermState) _state;
+
// if (DEBUG) System.out.println("PW finishTerm docCount=" + stats.docFreq + " pendingCount=" + pendingCount + " pendingTerms.size()=" + pendingTerms.size());
assert pendingCount > 0 || pendingCount == -1;
if (pendingCount == -1) {
- wrappedPostingsWriter.finishTerm(stats);
- // Must add null entry to record terms that our
- // wrapped postings impl added
- pendingTerms.add(null);
+ state.wrappedState.docFreq = state.docFreq;
+ state.wrappedState.totalTermFreq = state.totalTermFreq;
+ state.bytes = null;
+ wrappedPostingsWriter.finishTerm(state.wrappedState);
} else {
-
// There were few enough total occurrences for this
// term, so we fully inline our postings data into
// terms dict, now:
@@ -325,61 +371,54 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
}
}
- final byte[] bytes = new byte[(int) buffer.getFilePointer()];
- buffer.writeTo(bytes, 0);
- pendingTerms.add(new PendingTerm(bytes));
+ state.bytes = new byte[(int) buffer.getFilePointer()];
+ buffer.writeTo(state.bytes, 0);
buffer.reset();
}
-
pendingCount = 0;
}
+ @Override
+ public void encodeTerm(long[] empty, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
+ PulsingTermState state = (PulsingTermState)_state;
+ assert empty.length == 0;
+ this.absolute = this.absolute || absolute;
+ if (state.bytes == null) {
+ wrappedPostingsWriter.encodeTerm(longs, buffer, fieldInfo, state.wrappedState, this.absolute);
+ for (int i = 0; i < longsSize; i++) {
+ out.writeVLong(longs[i]);
+ }
+ buffer.writeTo(out);
+ buffer.reset();
+ this.absolute = false;
+ } else {
+ out.writeVInt(state.bytes.length);
+ out.writeBytes(state.bytes, 0, state.bytes.length);
+ this.absolute = this.absolute || absolute;
+ }
+ }
+
@Override
public void close() throws IOException {
wrappedPostingsWriter.close();
- }
-
- @Override
- public void flushTermsBlock(int start, int count) throws IOException {
- // if (DEBUG) System.out.println("PW: flushTermsBlock start=" + start + " count=" + count + " pendingTerms.size()=" + pendingTerms.size());
- int wrappedCount = 0;
- assert buffer.getFilePointer() == 0;
- assert start >= count;
-
- final int limit = pendingTerms.size() - start + count;
-
- for(int idx=pendingTerms.size()-start; idx<limit; idx++) {
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java
@@ -195,6 +197,24 @@ public final class SepPostingsWriter extends PostingsWriterBase {
}
skipListWriter.setIndexOptions(indexOptions);
storePayloads = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && fieldInfo.hasPayloads();
+ lastPayloadFP = 0;
+ lastSkipFP = 0;
+ lastState = setEmptyState();
+ return 0;
+ }
+
+ private SepTermState setEmptyState() {
+ SepTermState emptyState = new SepTermState();
+ emptyState.docIndex = docOut.index();
+ if (indexOptions != IndexOptions.DOCS_ONLY) {
+ emptyState.freqIndex = freqOut.index();
+ if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+ emptyState.posIndex = posOut.index();
+ }
+ }
+ emptyState.payloadFP = 0;
+ emptyState.skipFP = 0;
+ return emptyState;
}
/** Adds a new doc in this term. If this returns null
@@ -262,135 +282,86 @@ public final class SepPostingsWriter extends PostingsWriterBase {
lastPosition = 0;
}
- private static class PendingTerm {
- public final IntIndexOutput.Index docIndex;
- public final IntIndexOutput.Index freqIndex;
- public final IntIndexOutput.Index posIndex;
+ private static class SepTermState extends BlockTermState {
+ public IntIndexOutput.Index docIndex;
+ public IntIndexOutput.Index freqIndex;
+ public IntIndexOutput.Index posIndex;
public long payloadFP;
public long skipFP;
-
- public PendingTerm(IntIndexOutput.Index docIndex, IntIndexOutput.Index freqIndex, IntIndexOutput.Index posIndex, long payloadFP, long skipFP) {
- this.docIndex = docIndex;
- this.freqIndex = freqIndex;
- this.posIndex = posIndex;
- this.payloadFP = payloadFP;
- this.skipFP = skipFP;
- }
}
- private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
-
/** Called when we are done adding docs to this term */
@Override
- public void finishTerm(TermStats stats) throws IOException {
+ public void finishTerm(BlockTermState _state) throws IOException {
+ SepTermState state = (SepTermState)_state;
// TODO: -- wasteful we are counting this in two places?
- assert stats.docFreq > 0;
- assert stats.docFreq == df;
+ assert state.docFreq > 0;
+ assert state.docFreq == df;
- final IntIndexOutput.Index docIndexCopy = docOut.index();
- docIndexCopy.copyFrom(docIndex, false);
-
- final IntIndexOutput.Index freqIndexCopy;
- final IntIndexOutput.Index posIndexCopy;
+ state.docIndex = docOut.index();
+ state.docIndex.copyFrom(docIndex, false);
if (indexOptions != IndexOptions.DOCS_ONLY) {
- freqIndexCopy = freqOut.index();
- freqIndexCopy.copyFrom(freqIndex, false);
+ state.freqIndex = freqOut.index();
+ state.freqIndex.copyFrom(freqIndex, false);
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
- posIndexCopy = posOut.index();
- posIndexCopy.copyFrom(posIndex, false);
+ state.posIndex = posOut.index();
+ state.posIndex.copyFrom(posIndex, false);
} else {
- posIndexCopy = null;
+ state.posIndex = null;
}
} else {
- freqIndexCopy = null;
- posIndexCopy = null;
+ state.freqIndex = null;
+ state.posIndex = null;
}
- final long skipFP;
if (df >= skipMinimum) {
- skipFP = skipOut.getFilePointer();
+ state.skipFP = skipOut.getFilePointer();
//System.out.println(" skipFP=" + skipFP);
skipListWriter.writeSkip(skipOut);
//System.out.println(" numBytes=" + (skipOut.getFilePointer()-skipFP));
} else {
- skipFP = -1;
+ state.skipFP = -1;
}
+ state.payloadFP = payloadStart;
lastDocID = 0;
df = 0;
-
- pendingTerms.add(new PendingTerm(docIndexCopy,
- freqIndexCopy,
- posIndexCopy,
- payloadStart,
- skipFP));
}
@Override
- public void flushTermsBlock(int start, int count) throws IOException {
- //System.out.println("SEPW: flushTermsBlock: start=" + start + " count=" + count + " pendingTerms.size()=" + pendingTerms.size() + " termsOut.fp=" + termsOut.getFilePointer());
- assert indexBytesWriter.getFilePointer() == 0;
- final int absStart = pendingTerms.size() - start;
- final List<PendingTerm> slice = pendingTerms.subList(absStart, absStart+count);
-
- if (count == 0) {
- termsOut.writeByte((byte) 0);
- return;
+ public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
+ SepTermState state = (SepTermState)_state;
+ if (absolute) {
+ lastSkipFP = 0;
+ lastPayloadFP = 0;
+ lastState = state;
}
-
- long lastSkipFP = 0;
- long lastPayloadFP = 0;
-
- boolean isFirstTerm = true;
-
- for(int idx=0;idx<count;idx++) {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java
--- a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java
+ final int longsSize;
+ private final FST<BytesRef> index;
//private boolean DEBUG;
- FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, IndexInput indexIn) throws IOException {
+ FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, int longsSize, IndexInput indexIn) throws IOException {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
//DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id");
@@ -462,6 +464,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
this.docCount = docCount;
this.indexStartFP = indexStartFP;
this.rootCode = rootCode;
+ this.longsSize = longsSize;
// if (DEBUG) {
// System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor);
// }
@@ -612,6 +615,12 @@ public class BlockTreeTermsReader extends FieldsProducer {
FST.Arc<BytesRef> arc;
final BlockTermState termState;
+
+ // metadata buffer, holding monotonic values
+ public long[] longs;
+ // metadata buffer, holding general values
+ public byte[] bytes;
+ ByteArrayDataInput bytesReader;
// Cumulative output so far
BytesRef outputPrefix;
@@ -621,8 +630,9 @@ public class BlockTreeTermsReader extends FieldsProducer {
public Frame(int ord) throws IOException {
this.ord = ord;
- termState = postingsReader.newTermState();
- termState.totalTermFreq = -1;
+ this.termState = postingsReader.newTermState();
+ this.termState.totalTermFreq = -1;
+ this.longs = new long[longsSize];
}
void loadNextFloorBlock() throws IOException {
@@ -720,8 +730,17 @@ public class BlockTreeTermsReader extends FieldsProducer {
termState.termBlockOrd = 0;
nextEnt = 0;
-
- postingsReader.readTermsBlock(in, fieldInfo, termState);
+
+ // metadata
+ numBytes = in.readVInt();
+ if (bytes == null) {
+ bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ bytesReader = new ByteArrayDataInput();
+ } else if (bytes.length < numBytes) {
+ bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ }
+ in.readBytes(bytes, 0, numBytes);
+ bytesReader.reset(bytes, 0, numBytes);
if (!isLastInFloor) {
// Sub-blocks of a single floor block are always
@@ -774,12 +793,9 @@ public class BlockTreeTermsReader extends FieldsProducer {
// lazily catch up on metadata decode:
final int limit = getTermBlockOrd();
+ boolean absolute = metaDataUpto == 0;
assert limit > 0;
- // We must set/incr state.termCount because
- // postings impl can look at this
- termState.termBlockOrd = metaDataUpto;
-
// TODO: better API would be "jump straight to term=N"???
while (metaDataUpto < limit) {
@@ -791,17 +807,24 @@ public class BlockTreeTermsReader extends FieldsProducer {
// TODO: if docFreq were bulk decoded we could
// just skipN here:
+
+ // stats
termState.docFreq = statsReader.readVInt();
//if (DEBUG) System.out.println(" dF=" + state.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
//if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
}
+ // metadata
+ for (int i = 0; i < longsSize; i++) {
+ longs[i] = bytesReader.readVLong();
+ }
+ postingsReader.decodeTerm(longs, bytesReader, fieldInfo, termState, absolute);
- postingsReader.nextTerm(fieldInfo, termState);
metaDataUpto++;
- termState.termBlockOrd++;
+ absolute = false;
}
+ termState.termBlockOrd = metaDataUpto;
}
}
@@ -1707,6 +1730,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
if (arc.output != NO_OUTPUT) {
output = fstOutputs.add(output, arc.output);
}
+
// if (DEBUG) {
// System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
// }
@@ -2290,10 +2314,17 @@ public class BlockTreeTermsReader extends FieldsProducer {
final BlockTermState state;
+ // metadata buffer, holding monotonic values
+ public long[] longs;
+ // metadata buffer, holding general values
+ public byte[] bytes;
+ ByteArrayDataInput bytesReader;
+
public Frame(int ord) throws IOException {
this.ord = ord;
- state = postingsReader.newTermState();
- state.totalTermFreq = -1;
+ this.state = postingsReader.newTermState();
+ this.state.totalTermFreq = -1;
+ this.longs = new long[longsSize];
}
public void setFloorData(ByteArrayDataInput in, BytesRef source) {
@@ -2391,7 +2422,17 @@ public class BlockTreeTermsReader extends FieldsProducer {
// TODO: we could skip this if !hasTerms; but
// that's rare so won't help much
- postingsReader.readTermsBlock(in, fieldInfo, state);
+ // metadata
+ numBytes = in.readVInt();
+ if (bytes == null) {
+ bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ bytesReader = new ByteArrayDataInput();
+ } else if (bytes.length < numBytes) {
+ bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ }
+ in.readBytes(bytes, 0, numBytes);
+ bytesReader.reset(bytes, 0, numBytes);
+
// Sub-blocks of a single floor block are always
// written one after another -- tail recurse:
@@ -2575,12 +2616,9 @@ public class BlockTreeTermsReader extends FieldsProducer {
// lazily catch up on metadata decode:
final int limit = getTermBlockOrd();
+ boolean absolute = metaDataUpto == 0;
assert limit > 0;
- // We must set/incr state.termCount because
- // postings impl can look at this
- state.termBlockOrd = metaDataUpto;
-
// TODO: better API would be "jump straight to term=N"???
while (metaDataUpto < limit) {
@@ -2592,17 +2630,24 @@ public class BlockTreeTermsReader extends FieldsProducer {
// TODO: if docFreq were bulk decoded we could
// just skipN here:
+
+ // stats
state.docFreq = statsReader.readVInt();
//if (DEBUG) System.out.println(" dF=" + state.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
state.totalTermFreq = state.docFreq + statsReader.readVLong();
//if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
}
+ // metadata
+ for (int i = 0; i < longsSize; i++) {
+ longs[i] = bytesReader.readVLong();
+ }
+ postingsReader.decodeTerm(longs, bytesReader, fieldInfo, state, absolute);
- postingsReader.nextTerm(fieldInfo, state);
metaDataUpto++;
- state.termBlockOrd++;
+ absolute = false;
}
+ state.termBlockOrd = metaDataUpto;
}
// Used only by assert
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java
index 0074894625c..bf1e1619427 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java
@@ -104,13 +104,12 @@ import org.apache.lucene.util.packed.PackedInts;
* and decoding the Postings Metadata and Term Metadata sections.
*
*
- *
- * - TermsDict (.tim) --> Header, Postings Metadata, BlockNumBlocks,
+ *
+ * - TermsDict (.tim) --> Header, Postings Header, NodeBlockNumBlocks,
* FieldSummary, DirOffset
- * - Block --> SuffixBlock, StatsBlock, MetadataBlock
- * - SuffixBlock --> EntryCount, SuffixLength, ByteSuffixLength
- * - StatsBlock --> StatsLength, <DocFreq, TotalTermFreq>EntryCount
- * - MetadataBlock --> MetaLength, <Term Metadata>EntryCount
+ * - NodeBlock --> (OuterNode | InnerNode)
+ * - OuterNode --> EntryCount, SuffixLength, ByteSuffixLength, StatsLength, < TermStats >EntryCount, MetaLength, <Term Metadata>EntryCount
+ * - InnerNode --> EntryCount, SuffixLength[,Sub?], ByteSuffixLength, StatsLength, < TermStats ? >EntryCount, MetaLength, <Term Metadata ? >EntryCount
+ * - TermStats --> DocFreq, TotalTermFreq
* - FieldSummary --> NumFields, <FieldNumber, NumTerms, RootCodeLength, ByteRootCodeLength,
* SumDocFreq, DocCount>NumFields
* - Header --> {@link CodecUtil#writeHeader CodecHeader}
@@ -136,7 +135,9 @@ import org.apache.lucene.util.packed.PackedInts;
* - DocCount is the number of documents that have at least one posting for this field.
* - PostingsMetadata and TermMetadata are plugged into by the specific postings implementation:
* these contain arbitrary per-file data (such as parameters or versioning information)
- * and per-term data (such as pointers to inverted files).
+ * and per-term data (such as pointers to inverted files).
+ * - For inner nodes of the tree, every entry steals one bit to mark whether it points
+ * to child nodes (sub-blocks). If so, the corresponding TermStats and TermMetadata are omitted.
*
*
* Term Index
@@ -237,8 +238,9 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
public final long sumTotalTermFreq;
public final long sumDocFreq;
public final int docCount;
+ private final int longsSize;
- public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount) {
+ public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms;
@@ -248,6 +250,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
+ this.longsSize = longsSize;
}
}
@@ -300,7 +303,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
// System.out.println("BTW.init seg=" + state.segmentName);
- postingsWriter.start(out); // have consumer write its format/header
+ postingsWriter.init(out); // have consumer write its format/header
success = true;
} finally {
if (!success) {
@@ -354,12 +357,13 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
private static final class PendingTerm extends PendingEntry {
public final BytesRef term;
- public final TermStats stats;
+ // stats + metadata
+ public final BlockTermState state;
- public PendingTerm(BytesRef term, TermStats stats) {
+ public PendingTerm(BytesRef term, BlockTermState state) {
super(true);
this.term = term;
- this.stats = stats;
+ this.state = state;
}
@Override
@@ -480,6 +484,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
class TermsWriter extends TermsConsumer {
private final FieldInfo fieldInfo;
+ private final int longsSize;
private long numTerms;
long sumTotalTermFreq;
long sumDocFreq;
@@ -839,11 +844,16 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
final List<FST<BytesRef>> subIndices;
int termCount;
+
+ long[] longs = new long[longsSize];
+ boolean absolute = true;
+
if (isLeafBlock) {
subIndices = null;
for (PendingEntry ent : slice) {
assert ent.isTerm;
PendingTerm term = (PendingTerm) ent;
+ BlockTermState state = term.state;
final int suffix = term.term.length - prefixLength;
// if (DEBUG) {
// BytesRef suffixBytes = new BytesRef(suffix);
@@ -852,15 +862,25 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
// System.out.println(" write term suffix=" + suffixBytes);
// }
// For leaf block we write suffix straight
- bytesWriter.writeVInt(suffix);
- bytesWriter.writeBytes(term.term.bytes, prefixLength, suffix);
+ suffixWriter.writeVInt(suffix);
+ suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix);
// Write term stats, to separate byte[] blob:
- bytesWriter2.writeVInt(term.stats.docFreq);
+ statsWriter.writeVInt(state.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
- assert term.stats.totalTermFreq >= term.stats.docFreq: term.stats.totalTermFreq + " vs " + term.stats.docFreq;
- bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq);
+ assert state.totalTermFreq >= state.docFreq: state.totalTermFreq + " vs " + state.docFreq;
+ statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
}
+
+ // Write term meta data
+ postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute);
+ for (int pos = 0; pos < longsSize; pos++) {
+ assert longs[pos] >= 0;
+ metaWriter.writeVLong(longs[pos]);
+ }
+ bytesWriter.writeTo(metaWriter);
+ bytesWriter.reset();
+ absolute = false;
}
termCount = length;
} else {
@@ -869,6 +889,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
for (PendingEntry ent : slice) {
if (ent.isTerm) {
PendingTerm term = (PendingTerm) ent;
+ BlockTermState state = term.state;
final int suffix = term.term.length - prefixLength;
// if (DEBUG) {
// BytesRef suffixBytes = new BytesRef(suffix);
@@ -878,16 +899,34 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
// }
// For non-leaf block we borrow 1 bit to record
// if entry is term or sub-block
- bytesWriter.writeVInt(suffix<<1);
- bytesWriter.writeBytes(term.term.bytes, prefixLength, suffix);
+ suffixWriter.writeVInt(suffix<<1);
+ suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix);
// Write term stats, to separate byte[] blob:
- bytesWriter2.writeVInt(term.stats.docFreq);
+ statsWriter.writeVInt(state.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
- assert term.stats.totalTermFreq >= term.stats.docFreq;
- bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq);
+ assert state.totalTermFreq >= state.docFreq;
+ statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
}
+ // TODO: now that terms dict "sees" these longs,
+ // we can explore better column-stride encodings
+ // to encode all long[0]s for this block at
+ // once, all long[1]s, etc., e.g. using
+ // Simple64. Alternatively, we could interleave
+ // stats + meta ... no reason to have them
+ // separate anymore:
+
+ // Write term meta data
+ postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute);
+ for (int pos = 0; pos < longsSize; pos++) {
+ assert longs[pos] >= 0;
+ metaWriter.writeVLong(longs[pos]);
+ }
+ bytesWriter.writeTo(metaWriter);
+ bytesWriter.reset();
+ absolute = false;
+
termCount++;
} else {
PendingBlock block = (PendingBlock) ent;
@@ -897,8 +936,8 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
// For non-leaf block we borrow 1 bit to record
// if entry is term or sub-block
- bytesWriter.writeVInt((suffix<<1)|1);
- bytesWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
+ suffixWriter.writeVInt((suffix<<1)|1);
+ suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
assert block.fp < startFP;
// if (DEBUG) {
@@ -908,7 +947,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
// System.out.println(" write sub-block suffix=" + toString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
// }
- bytesWriter.writeVLong(startFP - block.fp);
+ suffixWriter.writeVLong(startFP - block.fp);
subIndices.add(block.index);
}
}
@@ -921,17 +960,19 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
// search on lookup
// Write suffixes byte[] blob to terms dict output:
- out.writeVInt((int) (bytesWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0));
- bytesWriter.writeTo(out);
- bytesWriter.reset();
+ out.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0));
+ suffixWriter.writeTo(out);
+ suffixWriter.reset();
// Write term stats byte[] blob
- out.writeVInt((int) bytesWriter2.getFilePointer());
- bytesWriter2.writeTo(out);
- bytesWriter2.reset();
+ out.writeVInt((int) statsWriter.getFilePointer());
+ statsWriter.writeTo(out);
+ statsWriter.reset();
- // Have postings writer write block
- postingsWriter.flushTermsBlock(futureTermCount+termCount, termCount);
+ // Write term meta data byte[] blob
+ out.writeVInt((int) metaWriter.getFilePointer());
+ metaWriter.writeTo(out);
+ metaWriter.reset();
// Remove slice replaced by block:
slice.clear();
@@ -967,7 +1008,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
PackedInts.COMPACT,
true, 15);
- postingsWriter.setField(fieldInfo);
+ this.longsSize = postingsWriter.setField(fieldInfo);
}
@Override
@@ -998,8 +1039,13 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
//if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment + " df=" + stats.docFreq);
blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput());
- pending.add(new PendingTerm(BytesRef.deepCopyOf(text), stats));
- postingsWriter.finishTerm(stats);
+ BlockTermState state = postingsWriter.newTermState();
+ state.docFreq = stats.docFreq;
+ state.totalTermFreq = stats.totalTermFreq;
+ postingsWriter.finishTerm(state);
+
+ PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state);
+ pending.add(term);
numTerms++;
}
@@ -1038,7 +1084,8 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
indexStartFP,
sumTotalTermFreq,
sumDocFreq,
- docCount));
+ docCount,
+ longsSize));
} else {
assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1;
assert sumDocFreq == 0;
@@ -1046,8 +1093,10 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
}
}
+ private final RAMOutputStream suffixWriter = new RAMOutputStream();
+ private final RAMOutputStream statsWriter = new RAMOutputStream();
+ private final RAMOutputStream metaWriter = new RAMOutputStream();
private final RAMOutputStream bytesWriter = new RAMOutputStream();
- private final RAMOutputStream bytesWriter2 = new RAMOutputStream();
}
@Override
@@ -1072,6 +1121,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
+ out.writeVInt(field.longsSize);
indexOut.writeVLong(field.indexStartFP);
}
writeTrailer(out, dirStart);
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java
index b8ea7f261ae..58c7a87253c 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java
@@ -24,6 +24,7 @@ import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.Bits;
/** The core terms dictionaries (BlockTermsReader,
@@ -55,7 +56,7 @@ public abstract class PostingsReaderBase implements Closeable {
public abstract BlockTermState newTermState() throws IOException;
/** Actually decode metadata for next term */
- public abstract void nextTerm(FieldInfo fieldInfo, BlockTermState state) throws IOException;
+ public abstract void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
/** Must fully consume state, since after this call that
* TermState may be reused. */
@@ -68,9 +69,4 @@ public abstract class PostingsReaderBase implements Closeable {
@Override
public abstract void close() throws IOException;
-
- /** Reads data for all terms in the next block; this
- * method should merely load the byte[] blob but not
- * decode, which is done in {@link #nextTerm}. */
- public abstract void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState termState) throws IOException;
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java
index 0ed53e754de..3aeb3d3dccc 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java
@@ -20,6 +20,7 @@ package org.apache.lucene.codecs;
import java.io.IOException;
import java.io.Closeable;
+import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.index.FieldInfo;
@@ -48,25 +49,31 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Clo
/** Called once after startup, before any terms have been
* added. Implementations typically write a header to
* the provided {@code termsOut}. */
- public abstract void start(IndexOutput termsOut) throws IOException;
+ public abstract void init(IndexOutput termsOut) throws IOException;
+
+ /** Return a newly created empty TermState */
+ public abstract BlockTermState newTermState() throws IOException;
/** Start a new term. Note that a matching call to {@link
- * #finishTerm(TermStats)} is done, only if the term has at least one
+ * #finishTerm(BlockTermState)} is done, only if the term has at least one
* document. */
public abstract void startTerm() throws IOException;
- /** Flush count terms starting at start "backwards", as a
- * block. start is a negative offset from the end of the
- * terms stack, ie bigger start means further back in
- * the stack. */
- public abstract void flushTermsBlock(int start, int count) throws IOException;
-
/** Finishes the current term. The provided {@link
- * TermStats} contains the term's summary statistics. */
- public abstract void finishTerm(TermStats stats) throws IOException;
+ * BlockTermState} contains the term's summary statistics,
+ * and on return will hold the term metadata produced by the postings writer */
+ public abstract void finishTerm(BlockTermState state) throws IOException;
- /** Called when the writing switches to another field. */
- public abstract void setField(FieldInfo fieldInfo);
+ /**
+ * Encode metadata as long[] and byte[]. {@code absolute} controls
+ * whether the current term is delta-encoded relative to the previous term.
+ */
+ public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
+
+ /**
+ * Return the fixed number of longs used per term metadata entry;
+ * called when writing switches to another field. */
+ public abstract int setField(FieldInfo fieldInfo);
@Override
public abstract void close() throws IOException;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java
index 9c2c86fc3fc..016cff7c332 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java
@@ -32,6 +32,7 @@ import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
@@ -121,11 +122,6 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
long proxOffset;
long skipOffset;
- // Only used by the "primary" TermState -- clones don't
- // copy this (basically they are "transient"):
- ByteArrayDataInput bytesReader; // TODO: should this NOT be in the TermState...?
- byte[] bytes;
-
@Override
public StandardTermState clone() {
StandardTermState other = new StandardTermState();
@@ -140,11 +136,6 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
freqOffset = other.freqOffset;
proxOffset = other.proxOffset;
skipOffset = other.skipOffset;
-
- // Do not copy bytes, bytesReader (else TermState is
- // very heavy, ie drags around the entire block's
- // byte[]). On seek back, if next() is in fact used
- // (rare!), they will be re-read from disk.
}
@Override
@@ -171,38 +162,18 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
}
}
- /* Reads but does not decode the byte[] blob holding
- metadata for the current terms block */
@Override
- public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
- final StandardTermState termState = (StandardTermState) _termState;
-
- final int len = termsIn.readVInt();
-
- // if (DEBUG) System.out.println(" SPR.readTermsBlock bytes=" + len + " ts=" + _termState);
- if (termState.bytes == null) {
- termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
- termState.bytesReader = new ByteArrayDataInput();
- } else if (termState.bytes.length < len) {
- termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
- }
-
- termsIn.readBytes(termState.bytes, 0, len);
- termState.bytesReader.reset(termState.bytes, 0, len);
- }
-
- @Override
- public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState)
+ public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute)
throws IOException {
final StandardTermState termState = (StandardTermState) _termState;
// if (DEBUG) System.out.println("SPR: nextTerm seg=" + segment + " tbOrd=" + termState.termBlockOrd + " bytesReader.fp=" + termState.bytesReader.getPosition());
final boolean isFirstTerm = termState.termBlockOrd == 0;
-
- if (isFirstTerm) {
- termState.freqOffset = termState.bytesReader.readVLong();
- } else {
- termState.freqOffset += termState.bytesReader.readVLong();
+ if (absolute) {
+ termState.freqOffset = 0;
+ termState.proxOffset = 0;
}
+
+ termState.freqOffset += in.readVLong();
/*
if (DEBUG) {
System.out.println(" dF=" + termState.docFreq);
@@ -212,7 +183,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
assert termState.freqOffset < freqIn.length();
if (termState.docFreq >= skipMinimum) {
- termState.skipOffset = termState.bytesReader.readVLong();
+ termState.skipOffset = in.readVLong();
// if (DEBUG) System.out.println(" skipOffset=" + termState.skipOffset + " vs freqIn.length=" + freqIn.length());
assert termState.freqOffset + termState.skipOffset < freqIn.length();
} else {
@@ -220,11 +191,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
}
if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
- if (isFirstTerm) {
- termState.proxOffset = termState.bytesReader.readVLong();
- } else {
- termState.proxOffset += termState.bytesReader.readVLong();
- }
+ termState.proxOffset += in.readVLong();
// if (DEBUG) System.out.println(" proxFP=" + termState.proxOffset);
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
index 500ab204d55..92b4880ac4b 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
@@ -152,11 +152,6 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
// freq is always implicitly totalTermFreq in this case.
int singletonDocID;
- // Only used by the "primary" TermState -- clones don't
- // copy this (basically they are "transient"):
- ByteArrayDataInput bytesReader; // TODO: should this NOT be in the TermState...?
- byte[] bytes;
-
@Override
public IntBlockTermState clone() {
IntBlockTermState other = new IntBlockTermState();
@@ -174,11 +169,6 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
lastPosBlockOffset = other.lastPosBlockOffset;
skipOffset = other.skipOffset;
singletonDocID = other.singletonDocID;
-
- // Do not copy bytes, bytesReader (else TermState is
- // very heavy, ie drags around the entire block's
- // byte[]). On seek back, if next() is in fact used
- // (rare!), they will be re-read from disk.
}
@Override
@@ -197,78 +187,37 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
IOUtils.close(docIn, posIn, payIn);
}
- /* Reads but does not decode the byte[] blob holding
- metadata for the current terms block */
@Override
- public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
- final IntBlockTermState termState = (IntBlockTermState) _termState;
-
- final int numBytes = termsIn.readVInt();
-
- if (termState.bytes == null) {
- termState.bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
- termState.bytesReader = new ByteArrayDataInput();
- } else if (termState.bytes.length < numBytes) {
- termState.bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
- }
-
- termsIn.readBytes(termState.bytes, 0, numBytes);
- termState.bytesReader.reset(termState.bytes, 0, numBytes);
- }
-
- @Override
- public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState)
+ public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute)
throws IOException {
final IntBlockTermState termState = (IntBlockTermState) _termState;
- final boolean isFirstTerm = termState.termBlockOrd == 0;
final boolean fieldHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
final boolean fieldHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
final boolean fieldHasPayloads = fieldInfo.hasPayloads();
- final DataInput in = termState.bytesReader;
- if (isFirstTerm) {
- if (termState.docFreq == 1) {
- termState.singletonDocID = in.readVInt();
- termState.docStartFP = 0;
- } else {
- termState.singletonDocID = -1;
- termState.docStartFP = in.readVLong();
- }
- if (fieldHasPositions) {
- termState.posStartFP = in.readVLong();
- if (termState.totalTermFreq > BLOCK_SIZE) {
- termState.lastPosBlockOffset = in.readVLong();
- } else {
- termState.lastPosBlockOffset = -1;
- }
- if ((fieldHasPayloads || fieldHasOffsets) && termState.totalTermFreq >= BLOCK_SIZE) {
- termState.payStartFP = in.readVLong();
- } else {
- termState.payStartFP = -1;
- }
+ // nocommit: use old version
+ if (absolute) {
+ termState.docStartFP = 0;
+ termState.posStartFP = 0;
+ termState.payStartFP = 0;
+ }
+ termState.docStartFP += longs[0];
+ if (fieldHasPositions) {
+ termState.posStartFP += longs[1];
+ if (fieldHasOffsets || fieldHasPayloads) {
+ termState.payStartFP += longs[2];
}
+ }
+ if (termState.docFreq == 1) {
+ termState.singletonDocID = in.readVInt();
} else {
- if (termState.docFreq == 1) {
- termState.singletonDocID = in.readVInt();
+ termState.singletonDocID = -1;
+ }
+ if (fieldHasPositions) {
+ if (termState.totalTermFreq > BLOCK_SIZE) {
+ termState.lastPosBlockOffset = in.readVLong();
} else {
- termState.singletonDocID = -1;
- termState.docStartFP += in.readVLong();
- }
- if (fieldHasPositions) {
- termState.posStartFP += in.readVLong();
- if (termState.totalTermFreq > BLOCK_SIZE) {
- termState.lastPosBlockOffset = in.readVLong();
- } else {
- termState.lastPosBlockOffset = -1;
- }
- if ((fieldHasPayloads || fieldHasOffsets) && termState.totalTermFreq >= BLOCK_SIZE) {
- long delta = in.readVLong();
- if (termState.payStartFP == -1) {
- termState.payStartFP = delta;
- } else {
- termState.payStartFP += delta;
- }
- }
+ termState.lastPosBlockOffset = -1;
}
}
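In the new decodeTerm() above, the monotonic file pointers (doc/pos/pay start FPs) arrive pre-split in the longs[] array, while the remaining per-term values (singleton doc id, lastPosBlockOffset) are still read from the byte stream. A hedged sketch of just the file-pointer accumulation; the class and field names below are illustrative, not Lucene's API:

// Sketch only: models how delta-coded file pointers are accumulated per block.
public class DecodeTermSketch {
  long docStartFP, posStartFP, payStartFP; // accumulated across a terms block

  void decode(long[] longs, boolean absolute, boolean hasPositions, boolean hasPayOrOffsets) {
    if (absolute) {        // first term of a block: start the accumulators from zero
      docStartFP = 0;
      posStartFP = 0;
      payStartFP = 0;
    }
    docStartFP += longs[0];
    if (hasPositions) {
      posStartFP += longs[1];
      if (hasPayOrOffsets) {
        payStartFP += longs[2];
      }
    }
  }

  public static void main(String[] args) {
    DecodeTermSketch s = new DecodeTermSketch();
    s.decode(new long[] {100, 40, 7}, true, true, true);   // first term: absolute
    s.decode(new long[] {25, 3, 0}, false, true, true);    // later terms: deltas
    System.out.println(s.docStartFP + " " + s.posStartFP + " " + s.payStartFP); // 125 43 7
  }
}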
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
index 9e8728ff979..e020fc4ff92 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
@@ -25,14 +25,15 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
@@ -71,7 +72,8 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
final IndexOutput posOut;
final IndexOutput payOut;
- private IndexOutput termsOut;
+ final static IntBlockTermState emptyState = new IntBlockTermState();
+ IntBlockTermState lastState;
// How current field indexes postings:
private boolean fieldHasFreqs;
@@ -79,7 +81,7 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
- // Holds starting file pointers for each term:
+ // Holds starting file pointers for current term:
private long docTermStartFP;
private long posTermStartFP;
private long payTermStartFP;
@@ -188,21 +190,50 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
this(state, PackedInts.COMPACT);
}
+ private final static class IntBlockTermState extends BlockTermState {
+ long docTermStartFP = 0;
+ long posTermStartFP = 0;
+ long payTermStartFP = 0;
+ long skipOffset = -1;
+ long lastPosBlockOffset = -1;
+ int singletonDocID = -1;
+ @Override
+ public String toString() {
+ return super.toString() + " docStartFP=" + docTermStartFP + " posStartFP=" + posTermStartFP + " payStartFP=" + payTermStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID;
+ }
+ }
+
@Override
- public void start(IndexOutput termsOut) throws IOException {
- this.termsOut = termsOut;
+ public IntBlockTermState newTermState() {
+ return new IntBlockTermState();
+ }
+
+ @Override
+ public void init(IndexOutput termsOut) throws IOException {
CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
termsOut.writeVInt(BLOCK_SIZE);
}
+ // nocommit better name?
+
@Override
- public void setField(FieldInfo fieldInfo) {
+ public int setField(FieldInfo fieldInfo) {
IndexOptions indexOptions = fieldInfo.getIndexOptions();
fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
fieldHasPayloads = fieldInfo.hasPayloads();
skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
+ lastState = emptyState;
+ if (fieldHasPositions) {
+ if (fieldHasPayloads || fieldHasOffsets) {
+ return 3; // doc + pos + pay FP
+ } else {
+ return 2; // doc + pos FP
+ }
+ } else {
+ return 1; // doc FP
+ }
}
@Override
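setField() now returns how many long metadata slots each term of this field carries (1, 2, or 3 file pointers); presumably the term dictionary uses that count to size the longs[] buffer it hands to encodeTerm()/decodeTerm(). A small sketch under that assumption, with made-up names:

// Sketch only: derives the metadata slot count the same way the hunk above does.
public class LongsSizeSketch {
  static int longsSize(boolean hasPositions, boolean hasPayloadsOrOffsets) {
    if (!hasPositions) {
      return 1;                              // doc FP only
    }
    return hasPayloadsOrOffsets ? 3 : 2;     // doc + pos (+ pay) FPs
  }

  public static void main(String[] args) {
    long[] longs = new long[longsSize(true, false)]; // 2 slots: docStartFP, posStartFP
    System.out.println(longs.length);
  }
}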
@@ -348,37 +379,18 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
}
}
- private static class PendingTerm {
- public final long docStartFP;
- public final long posStartFP;
- public final long payStartFP;
- public final long skipOffset;
- public final long lastPosBlockOffset;
- public final int singletonDocID;
-
- public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset, int singletonDocID) {
- this.docStartFP = docStartFP;
- this.posStartFP = posStartFP;
- this.payStartFP = payStartFP;
- this.skipOffset = skipOffset;
- this.lastPosBlockOffset = lastPosBlockOffset;
- this.singletonDocID = singletonDocID;
- }
- }
-
-  private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
-
/** Called when we are done adding docs to this term */
@Override
- public void finishTerm(TermStats stats) throws IOException {
- assert stats.docFreq > 0;
+ public void finishTerm(BlockTermState _state) throws IOException {
+ IntBlockTermState state = (IntBlockTermState) _state;
+ assert state.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
- assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount;
+ assert state.docFreq == docCount: state.docFreq + " vs " + docCount;
// if (DEBUG) {
- // System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
+ // System.out.println("FPW.finishTerm docFreq=" + state.docFreq);
// }
// if (DEBUG) {
@@ -389,7 +401,7 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
// docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to it.
final int singletonDocID;
- if (stats.docFreq == 1) {
+ if (state.docFreq == 1) {
// pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq
singletonDocID = docDeltaBuffer[0];
} else {
@@ -420,8 +432,8 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
// totalTermFreq is just total number of positions(or payloads, or offsets)
// associated with current term.
- assert stats.totalTermFreq != -1;
- if (stats.totalTermFreq > BLOCK_SIZE) {
+ assert state.totalTermFreq != -1;
+ if (state.totalTermFreq > BLOCK_SIZE) {
// record file offset for last pos in last block
lastPosBlockOffset = posOut.getFilePointer() - posTermStartFP;
} else {
@@ -486,7 +498,7 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
}
}
// if (DEBUG) {
- // System.out.println(" totalTermFreq=" + stats.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
+ // System.out.println(" totalTermFreq=" + state.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
// }
} else {
lastPosBlockOffset = -1;
@@ -505,76 +517,48 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
// System.out.println(" no skip: docCount=" + docCount);
// }
}
-
- long payStartFP;
- if (stats.totalTermFreq >= BLOCK_SIZE) {
- payStartFP = payTermStartFP;
- } else {
- payStartFP = -1;
- }
-
// if (DEBUG) {
// System.out.println(" payStartFP=" + payStartFP);
// }
-
- pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset, singletonDocID));
+ state.docTermStartFP = docTermStartFP;
+ state.posTermStartFP = posTermStartFP;
+ state.payTermStartFP = payTermStartFP;
+ state.singletonDocID = singletonDocID;
+ state.skipOffset = skipOffset;
+ state.lastPosBlockOffset = lastPosBlockOffset;
docBufferUpto = 0;
posBufferUpto = 0;
lastDocID = 0;
docCount = 0;
}
-
- private final RAMOutputStream bytesWriter = new RAMOutputStream();
+
+ // nocommit explain about the "don't care" values
@Override
- public void flushTermsBlock(int start, int count) throws IOException {
-
- if (count == 0) {
- termsOut.writeByte((byte) 0);
- return;
+ public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
+ IntBlockTermState state = (IntBlockTermState)_state;
+ if (absolute) {
+ lastState = emptyState;
}
-
- assert start <= pendingTerms.size();
- assert count <= start;
-
- final int limit = pendingTerms.size() - start + count;
-
- long lastDocStartFP = 0;
- long lastPosStartFP = 0;
- long lastPayStartFP = 0;
-      for(int idx=limit-count; idx<limit; idx++) {
    storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
storePayloads = fieldInfo.hasPayloads();
+ lastState = emptyState;
//System.out.println(" set init blockFreqStart=" + freqStart);
//System.out.println(" set init blockProxStart=" + proxStart);
+ return 0;
}
int lastDocID;
@@ -265,94 +276,48 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
public void finishDoc() {
}
- private static class PendingTerm {
- public final long freqStart;
- public final long proxStart;
- public final long skipOffset;
-
- public PendingTerm(long freqStart, long proxStart, long skipOffset) {
- this.freqStart = freqStart;
- this.proxStart = proxStart;
- this.skipOffset = skipOffset;
- }
+ private static class StandardTermState extends BlockTermState {
+ public long freqStart;
+ public long proxStart;
+ public long skipOffset;
}
-  private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
-
/** Called when we are done adding docs to this term */
@Override
- public void finishTerm(TermStats stats) throws IOException {
-
+ public void finishTerm(BlockTermState _state) throws IOException {
+ StandardTermState state = (StandardTermState)_state;
// if (DEBUG) System.out.println("SPW: finishTerm seg=" + segment + " freqStart=" + freqStart);
- assert stats.docFreq > 0;
+ assert state.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
- assert stats.docFreq == df;
-
- final long skipOffset;
+ assert state.docFreq == df;
+ state.freqStart = freqStart;
+ state.proxStart = proxStart;
if (df >= skipMinimum) {
- skipOffset = skipListWriter.writeSkip(freqOut)-freqStart;
+ state.skipOffset = skipListWriter.writeSkip(freqOut)-freqStart;
} else {
- skipOffset = -1;
+ state.skipOffset = -1;
}
-
- pendingTerms.add(new PendingTerm(freqStart, proxStart, skipOffset));
-
lastDocID = 0;
df = 0;
}
- private final RAMOutputStream bytesWriter = new RAMOutputStream();
-
@Override
- public void flushTermsBlock(int start, int count) throws IOException {
- //if (DEBUG) System.out.println("SPW: flushTermsBlock start=" + start + " count=" + count + " left=" + (pendingTerms.size()-count) + " pendingTerms.size()=" + pendingTerms.size());
-
- if (count == 0) {
- termsOut.writeByte((byte) 0);
- return;
+ public void encodeTerm(long[] empty, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
+ StandardTermState state = (StandardTermState)_state;
+ if (absolute) {
+ lastState = emptyState;
}
-
- assert start <= pendingTerms.size();
- assert count <= start;
-
- final int limit = pendingTerms.size() - start + count;
- final PendingTerm firstTerm = pendingTerms.get(limit - count);
- // First term in block is abs coded:
- bytesWriter.writeVLong(firstTerm.freqStart);
-
- if (firstTerm.skipOffset != -1) {
- assert firstTerm.skipOffset > 0;
- bytesWriter.writeVLong(firstTerm.skipOffset);
+ out.writeVLong(state.freqStart - lastState.freqStart);
+ if (state.skipOffset != -1) {
+ assert state.skipOffset > 0;
+ out.writeVLong(state.skipOffset);
}
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
- bytesWriter.writeVLong(firstTerm.proxStart);
+ out.writeVLong(state.proxStart - lastState.proxStart);
}
- long lastFreqStart = firstTerm.freqStart;
- long lastProxStart = firstTerm.proxStart;
-      for(int idx=limit-count+1; idx<limit; idx++) {