Mirror of https://github.com/apache/lucene.git

LUCENE-3069: merge 'temp' codes back

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1516860 13f79535-47bb-0310-9956-ffa450edef68

Parent: 2fc580e715
Commit: 1621816d81
@@ -142,6 +142,7 @@ public class BlockTermsReader extends FieldsProducer {
final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
final long sumDocFreq = in.readVLong();
final int docCount = in.readVInt();
final int longsSize = in.readVInt();
if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
}

@@ -151,7 +152,7 @@ public class BlockTermsReader extends FieldsProducer {
if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + in + ")");
}
FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount));
FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount, longsSize));
if (previous != null) {
throw new CorruptIndexException("duplicate fields: " + fieldInfo.name + " (resource=" + in + ")");
}
@@ -230,8 +231,9 @@ public class BlockTermsReader extends FieldsProducer {
final long sumTotalTermFreq;
final long sumDocFreq;
final int docCount;
final int longsSize;

FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) {
FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;

@@ -239,6 +241,7 @@ public class BlockTermsReader extends FieldsProducer {
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.longsSize = longsSize;
}

@Override
@@ -326,6 +329,10 @@ public class BlockTermsReader extends FieldsProducer {
private final ByteArrayDataInput freqReader = new ByteArrayDataInput();
private int metaDataUpto;

private long[] longs;
private byte[] bytes;
private ByteArrayDataInput bytesReader;

public SegmentTermsEnum() throws IOException {
in = BlockTermsReader.this.in.clone();
in.seek(termsStartPointer);

@@ -339,6 +346,7 @@ public class BlockTermsReader extends FieldsProducer {
termSuffixes = new byte[128];
docFreqBytes = new byte[64];
//System.out.println("BTR.enum init this=" + this + " postingsReader=" + postingsReader);
longs = new long[longsSize];
}

@Override
@@ -415,7 +423,7 @@ public class BlockTermsReader extends FieldsProducer {
assert result;

indexIsCurrent = true;
didIndexNext = false;
didIndexNext = false;

if (doOrd) {
state.ord = indexEnum.ord()-1;

@@ -789,12 +797,21 @@ public class BlockTermsReader extends FieldsProducer {
//System.out.println(" freq bytes len=" + len);
in.readBytes(docFreqBytes, 0, len);
freqReader.reset(docFreqBytes, 0, len);

// metadata
len = in.readVInt();
if (bytes == null) {
bytes = new byte[ArrayUtil.oversize(len, 1)];
bytesReader = new ByteArrayDataInput();
} else if (bytes.length < len) {
bytes = new byte[ArrayUtil.oversize(len, 1)];
}
in.readBytes(bytes, 0, len);
bytesReader.reset(bytes, 0, len);

metaDataUpto = 0;

state.termBlockOrd = 0;

postingsReader.readTermsBlock(in, fieldInfo, state);

indexIsCurrent = false;
//System.out.println(" indexIsCurrent=" + indexIsCurrent);
@@ -811,9 +828,7 @@ public class BlockTermsReader extends FieldsProducer {

// lazily catch up on metadata decode:
final int limit = state.termBlockOrd;
// We must set/incr state.termCount because
// postings impl can look at this
state.termBlockOrd = metaDataUpto;
boolean absolute = metaDataUpto == 0;
// TODO: better API would be "jump straight to term=N"???
while (metaDataUpto < limit) {
//System.out.println(" decode mdUpto=" + metaDataUpto);

@@ -825,16 +840,21 @@ public class BlockTermsReader extends FieldsProducer {

// TODO: if docFreq were bulk decoded we could
// just skipN here:

// docFreq, totalTermFreq
state.docFreq = freqReader.readVInt();
//System.out.println(" dF=" + state.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
state.totalTermFreq = state.docFreq + freqReader.readVLong();
//System.out.println(" totTF=" + state.totalTermFreq);
}

postingsReader.nextTerm(fieldInfo, state);
// metadata
for (int i = 0; i < longs.length; i++) {
longs[i] = bytesReader.readVLong();
}
postingsReader.decodeTerm(longs, bytesReader, fieldInfo, state, absolute);
metaDataUpto++;
state.termBlockOrd++;
absolute = false;
}
} else {
//System.out.println(" skip! seekPending");
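
The reader-side hunks above are the core of this change for the BlockTerms dictionary: each field's summary now records a longsSize, every term block carries a metadata section, and the lazy decode loop hands longsSize VLongs plus the remaining metadata bytes to the postings reader through the new decodeTerm(...) call instead of readTermsBlock/nextTerm. A minimal sketch of that contract follows; it is illustrative only, with hypothetical names (docStartFP, skipOffset) standing in for whatever a concrete postings format actually stores, and a simplified signature rather than the real PostingsReaderBase one.

import java.io.IOException;
import org.apache.lucene.store.DataInput;

// Sketch only: hypothetical per-term metadata for some postings format.
final class SketchTermMeta {
  long docStartFP;   // assumed monotonic value, carried in longs[0]
  long skipOffset;   // assumed non-monotonic value, carried in the byte blob
}

final class SketchDecoder {
  // Simplified shape of the new decodeTerm contract.
  void decodeTerm(long[] longs, DataInput in, SketchTermMeta term, boolean absolute) throws IOException {
    if (absolute) {
      term.docStartFP = longs[0];      // first term after a block load: value is absolute
    } else {
      term.docStartFP += longs[0];     // later terms: value is a delta from the previous term
    }
    term.skipOffset = in.readVLong();  // everything non-monotonic stays in the byte stream
  }
}

The absolute flag is true only for the first term decoded after a block load (metaDataUpto == 0), which is what lets monotonic values such as file pointers be stored as per-term deltas.
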
@ -27,6 +27,7 @@ import org.apache.lucene.codecs.FieldsConsumer;
|
|||
import org.apache.lucene.codecs.PostingsConsumer;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.TermStats;
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.TermsConsumer;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
|
@ -77,8 +78,9 @@ public class BlockTermsWriter extends FieldsConsumer {
|
|||
public final long sumTotalTermFreq;
|
||||
public final long sumDocFreq;
|
||||
public final int docCount;
|
||||
public final int longsSize;
|
||||
|
||||
public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) {
|
||||
public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) {
|
||||
assert numTerms > 0;
|
||||
this.fieldInfo = fieldInfo;
|
||||
this.termsStartPointer = termsStartPointer;
|
||||
|
@ -86,6 +88,7 @@ public class BlockTermsWriter extends FieldsConsumer {
|
|||
this.sumTotalTermFreq = sumTotalTermFreq;
|
||||
this.sumDocFreq = sumDocFreq;
|
||||
this.docCount = docCount;
|
||||
this.longsSize = longsSize;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -109,7 +112,7 @@ public class BlockTermsWriter extends FieldsConsumer {
|
|||
|
||||
//System.out.println("BTW.init seg=" + state.segmentName);
|
||||
|
||||
postingsWriter.start(out); // have consumer write its format/header
|
||||
postingsWriter.init(out); // have consumer write its format/header
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
|
@ -148,6 +151,7 @@ public class BlockTermsWriter extends FieldsConsumer {
|
|||
}
|
||||
out.writeVLong(field.sumDocFreq);
|
||||
out.writeVInt(field.docCount);
|
||||
out.writeVInt(field.longsSize);
|
||||
}
|
||||
writeTrailer(dirStart);
|
||||
} finally {
|
||||
|
@ -161,7 +165,7 @@ public class BlockTermsWriter extends FieldsConsumer {
|
|||
|
||||
private static class TermEntry {
|
||||
public final BytesRef term = new BytesRef();
|
||||
public TermStats stats;
|
||||
public BlockTermState state;
|
||||
}
|
||||
|
||||
class TermsWriter extends TermsConsumer {
|
||||
|
@ -173,6 +177,7 @@ public class BlockTermsWriter extends FieldsConsumer {
|
|||
long sumTotalTermFreq;
|
||||
long sumDocFreq;
|
||||
int docCount;
|
||||
int longsSize;
|
||||
|
||||
private TermEntry[] pendingTerms;
|
||||
|
||||
|
@ -190,8 +195,8 @@ public class BlockTermsWriter extends FieldsConsumer {
|
|||
pendingTerms[i] = new TermEntry();
|
||||
}
|
||||
termsStartPointer = out.getFilePointer();
|
||||
postingsWriter.setField(fieldInfo);
|
||||
this.postingsWriter = postingsWriter;
|
||||
this.longsSize = postingsWriter.setField(fieldInfo);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -237,11 +242,12 @@ public class BlockTermsWriter extends FieldsConsumer {
|
|||
}
|
||||
final TermEntry te = pendingTerms[pendingCount];
|
||||
te.term.copyBytes(text);
|
||||
te.stats = stats;
|
||||
te.state = postingsWriter.newTermState();
|
||||
te.state.docFreq = stats.docFreq;
|
||||
te.state.totalTermFreq = stats.totalTermFreq;
|
||||
postingsWriter.finishTerm(te.state);
|
||||
|
||||
pendingCount++;
|
||||
|
||||
postingsWriter.finishTerm(stats);
|
||||
numTerms++;
|
||||
}
|
||||
|
||||
|
@ -264,7 +270,8 @@ public class BlockTermsWriter extends FieldsConsumer {
|
|||
termsStartPointer,
|
||||
sumTotalTermFreq,
|
||||
sumDocFreq,
|
||||
docCount));
|
||||
docCount,
|
||||
longsSize));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -285,6 +292,7 @@ public class BlockTermsWriter extends FieldsConsumer {
|
|||
}
|
||||
|
||||
private final RAMOutputStream bytesWriter = new RAMOutputStream();
|
||||
private final RAMOutputStream bufferWriter = new RAMOutputStream();
|
||||
|
||||
private void flushBlock() throws IOException {
|
||||
//System.out.println("BTW.flushBlock seg=" + segment + " pendingCount=" + pendingCount + " fp=" + out.getFilePointer());
|
||||
|
@@ -318,19 +326,34 @@ public class BlockTermsWriter extends FieldsConsumer {
// TODO: cutover to better intblock codec. simple64?
// write prefix, suffix first:
for(int termCount=0;termCount<pendingCount;termCount++) {
final TermStats stats = pendingTerms[termCount].stats;
assert stats != null;
bytesWriter.writeVInt(stats.docFreq);
final BlockTermState state = pendingTerms[termCount].state;
assert state != null;
bytesWriter.writeVInt(state.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
bytesWriter.writeVLong(stats.totalTermFreq-stats.docFreq);
bytesWriter.writeVLong(state.totalTermFreq-state.docFreq);
}
}

out.writeVInt((int) bytesWriter.getFilePointer());
bytesWriter.writeTo(out);
bytesWriter.reset();

postingsWriter.flushTermsBlock(pendingCount, pendingCount);
// 4th pass: write the metadata
long[] longs = new long[longsSize];
boolean absolute = true;
for(int termCount=0;termCount<pendingCount;termCount++) {
final BlockTermState state = pendingTerms[termCount].state;
postingsWriter.encodeTerm(longs, bufferWriter, fieldInfo, state, absolute);
for (int i = 0; i < longsSize; i++) {
bytesWriter.writeVLong(longs[i]);
}
bufferWriter.writeTo(bytesWriter);
bufferWriter.reset();
absolute = false;
}
out.writeVInt((int) bytesWriter.getFilePointer());
bytesWriter.writeTo(out);
bytesWriter.reset();

lastPrevTerm.copyBytes(pendingTerms[pendingCount-1].term);
pendingCount = 0;
}
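
On the write side, flushBlock above now makes a fourth pass over the pending terms: encodeTerm fills a shared long[longsSize] and may buffer extra bytes into bufferWriter, and the terms dictionary serializes both per term, with absolute set only for the first term of the block. Informally, the tail of one block ends up laid out as sketched below; the names are informal and not taken from the file format docs.

// Informal sketch of the block tail written by flushBlock (not normative):
//
//   StatsLength (VInt)
//     per term: DocFreq (VInt) [, TotalTermFreq - DocFreq (VLong) unless DOCS_ONLY]
//   MetaLength (VInt)
//     per term: longs[0..longsSize) as VLongs, then that term's opaque metadata bytes
//
// The postings writer is expected to emit absolute values for the first term
// (absolute == true) and deltas for the rest, mirroring decodeTerm on the read side.
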
@@ -79,7 +79,7 @@ public abstract class PulsingPostingsFormat extends PostingsFormat {

// Terms that have <= freqCutoff number of docs are
// "pulsed" (inlined):
pulsingWriter = new PulsingPostingsWriter(freqCutoff, docsWriter);
pulsingWriter = new PulsingPostingsWriter(state, freqCutoff, docsWriter);
FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter, minBlockSize, maxBlockSize);
success = true;
return ret;

@@ -98,7 +98,7 @@ public abstract class PulsingPostingsFormat extends PostingsFormat {
boolean success = false;
try {
docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
pulsingReader = new PulsingPostingsReader(docsReader);
pulsingReader = new PulsingPostingsReader(state, docsReader);
FieldsProducer ret = new BlockTreeTermsReader(
state.directory, state.fieldInfos, state.segmentInfo,
pulsingReader,
|
|
@ -20,16 +20,20 @@ package org.apache.lucene.codecs.pulsing;
|
|||
import java.io.IOException;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
@ -37,6 +41,7 @@ import org.apache.lucene.util.AttributeImpl;
|
|||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/** Concrete class that reads the current doc/freq/skip
|
||||
* postings format
|
||||
|
@@ -50,28 +55,53 @@ public class PulsingPostingsReader extends PostingsReaderBase {

// Fallback reader for non-pulsed terms:
final PostingsReaderBase wrappedPostingsReader;
final SegmentReadState segmentState;
int maxPositions;
int version;
TreeMap<Integer, Integer> fields;

public PulsingPostingsReader(PostingsReaderBase wrappedPostingsReader) {
public PulsingPostingsReader(SegmentReadState state, PostingsReaderBase wrappedPostingsReader) {
this.wrappedPostingsReader = wrappedPostingsReader;
this.segmentState = state;
}

@Override
public void init(IndexInput termsIn) throws IOException {
CodecUtil.checkHeader(termsIn, PulsingPostingsWriter.CODEC,
PulsingPostingsWriter.VERSION_START, PulsingPostingsWriter.VERSION_START);
version = CodecUtil.checkHeader(termsIn, PulsingPostingsWriter.CODEC,
PulsingPostingsWriter.VERSION_START,
PulsingPostingsWriter.VERSION_CURRENT);
maxPositions = termsIn.readVInt();
wrappedPostingsReader.init(termsIn);
if (wrappedPostingsReader instanceof PulsingPostingsReader ||
version < PulsingPostingsWriter.VERSION_META_ARRAY) {
fields = null;
} else {
fields = new TreeMap<Integer, Integer>();
String summaryFileName = IndexFileNames.segmentFileName(segmentState.segmentInfo.name, segmentState.segmentSuffix, PulsingPostingsWriter.SUMMARY_EXTENSION);
IndexInput in = null;
try {
in = segmentState.directory.openInput(summaryFileName, segmentState.context);
CodecUtil.checkHeader(in, PulsingPostingsWriter.CODEC, version,
PulsingPostingsWriter.VERSION_CURRENT);
int numField = in.readVInt();
for (int i = 0; i < numField; i++) {
int fieldNum = in.readVInt();
int longsSize = in.readVInt();
fields.put(fieldNum, longsSize);
}
} finally {
IOUtils.closeWhileHandlingException(in);
}
}
}

private static class PulsingTermState extends BlockTermState {
private boolean absolute = false;
private long[] longs;
private byte[] postings;
private int postingsSize; // -1 if this term was not inlined
private BlockTermState wrappedTermState;

ByteArrayDataInput inlinedBytesReader;
private byte[] inlinedBytes;

@Override
public PulsingTermState clone() {
PulsingTermState clone;
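
Because the wrapped format's longsSize can no longer be recovered from the terms dictionary alone, the pulsing reader now loads a small per-segment field summary (SUMMARY_EXTENSION, "smy") mapping each field number to the wrapped writer's longsSize; init() above reads it whenever the header version is at least VERSION_META_ARRAY. The write side, which appears later in this diff inside PulsingPostingsWriter, is the mirror image; roughly:

// Rough mirror of the summary file written by PulsingPostingsWriter (see the writer
// hunks later in this commit); 'fields' holds one (fieldNumber, longsSize) entry per field.
String summaryFileName = IndexFileNames.segmentFileName(
    segmentState.segmentInfo.name, segmentState.segmentSuffix, SUMMARY_EXTENSION);
IndexOutput out = null;
try {
  out = segmentState.directory.createOutput(summaryFileName, segmentState.context);
  CodecUtil.writeHeader(out, CODEC, VERSION_CURRENT);
  out.writeVInt(fields.size());
  for (FieldMetaData field : fields) {
    out.writeVInt(field.fieldNumber);   // which field
    out.writeVInt(field.longsSize);     // how many metadata longs its wrapped terms use
  }
} finally {
  IOUtils.closeWhileHandlingException(out);
}
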
@ -82,6 +112,11 @@ public class PulsingPostingsReader extends PostingsReaderBase {
|
|||
} else {
|
||||
assert wrappedTermState != null;
|
||||
clone.wrappedTermState = (BlockTermState) wrappedTermState.clone();
|
||||
clone.absolute = absolute;
|
||||
if (longs != null) {
|
||||
clone.longs = new long[longs.length];
|
||||
System.arraycopy(longs, 0, clone.longs, 0, longs.length);
|
||||
}
|
||||
}
|
||||
return clone;
|
||||
}
|
||||
|
@ -99,11 +134,6 @@ public class PulsingPostingsReader extends PostingsReaderBase {
|
|||
} else {
|
||||
wrappedTermState.copyFrom(other.wrappedTermState);
|
||||
}
|
||||
|
||||
// NOTE: we do not copy the
|
||||
// inlinedBytes/inlinedBytesReader; these are only
|
||||
// stored on the "primary" TermState. They are
|
||||
// "transient" to cloned term states.
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -116,25 +146,6 @@ public class PulsingPostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
|
||||
//System.out.println("PR.readTermsBlock state=" + _termState);
|
||||
final PulsingTermState termState = (PulsingTermState) _termState;
|
||||
if (termState.inlinedBytes == null) {
|
||||
termState.inlinedBytes = new byte[128];
|
||||
termState.inlinedBytesReader = new ByteArrayDataInput();
|
||||
}
|
||||
int len = termsIn.readVInt();
|
||||
//System.out.println(" len=" + len + " fp=" + termsIn.getFilePointer());
|
||||
if (termState.inlinedBytes.length < len) {
|
||||
termState.inlinedBytes = new byte[ArrayUtil.oversize(len, 1)];
|
||||
}
|
||||
termsIn.readBytes(termState.inlinedBytes, 0, len);
|
||||
termState.inlinedBytesReader.reset(termState.inlinedBytes);
|
||||
termState.wrappedTermState.termBlockOrd = 0;
|
||||
wrappedPostingsReader.readTermsBlock(termsIn, fieldInfo, termState.wrappedTermState);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BlockTermState newTermState() throws IOException {
|
||||
PulsingTermState state = new PulsingTermState();
|
||||
|
@ -143,20 +154,20 @@ public class PulsingPostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
|
||||
public void decodeTerm(long[] empty, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException {
|
||||
//System.out.println("PR nextTerm");
|
||||
PulsingTermState termState = (PulsingTermState) _termState;
|
||||
|
||||
assert empty.length == 0;
|
||||
termState.absolute = termState.absolute || absolute;
|
||||
// if we have positions, its total TF, otherwise its computed based on docFreq.
|
||||
long count = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 ? termState.totalTermFreq : termState.docFreq;
|
||||
//System.out.println(" count=" + count + " threshold=" + maxPositions);
|
||||
|
||||
if (count <= maxPositions) {
|
||||
|
||||
// Inlined into terms dict -- just read the byte[] blob in,
|
||||
// but don't decode it now (we only decode when a DocsEnum
|
||||
// or D&PEnum is pulled):
|
||||
termState.postingsSize = termState.inlinedBytesReader.readVInt();
|
||||
termState.postingsSize = in.readVInt();
|
||||
if (termState.postings == null || termState.postings.length < termState.postingsSize) {
|
||||
termState.postings = new byte[ArrayUtil.oversize(termState.postingsSize, 1)];
|
||||
}
|
||||
|
@ -164,16 +175,23 @@ public class PulsingPostingsReader extends PostingsReaderBase {
|
|||
// (the blob holding all inlined terms' blobs for
|
||||
// current term block) into another byte[] (just the
|
||||
// blob for this term)...
|
||||
termState.inlinedBytesReader.readBytes(termState.postings, 0, termState.postingsSize);
|
||||
in.readBytes(termState.postings, 0, termState.postingsSize);
|
||||
//System.out.println(" inlined bytes=" + termState.postingsSize);
|
||||
termState.absolute = termState.absolute || absolute;
|
||||
} else {
|
||||
//System.out.println(" not inlined");
|
||||
final int longsSize = fields == null ? 0 : fields.get(fieldInfo.number);
|
||||
if (termState.longs == null) {
|
||||
termState.longs = new long[longsSize];
|
||||
}
|
||||
for (int i = 0; i < longsSize; i++) {
|
||||
termState.longs[i] = in.readVLong();
|
||||
}
|
||||
termState.postingsSize = -1;
|
||||
// TODO: should we do full copyFrom? much heavier...?
|
||||
termState.wrappedTermState.docFreq = termState.docFreq;
|
||||
termState.wrappedTermState.totalTermFreq = termState.totalTermFreq;
|
||||
wrappedPostingsReader.nextTerm(fieldInfo, termState.wrappedTermState);
|
||||
termState.wrappedTermState.termBlockOrd++;
|
||||
wrappedPostingsReader.decodeTerm(termState.longs, in, fieldInfo, termState.wrappedTermState, termState.absolute);
|
||||
termState.absolute = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -21,14 +21,19 @@ import java.io.IOException;
|
|||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.TermStats;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
// TODO: we now inline based on total TF of the term,
|
||||
// but it might be better to inline by "net bytes used"
|
||||
|
@ -49,26 +54,43 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
|
|||
|
||||
final static String CODEC = "PulsedPostingsWriter";
|
||||
|
||||
// recording field summary
|
||||
final static String SUMMARY_EXTENSION = "smy";
|
||||
|
||||
// To add a new version, increment from the last one, and
|
||||
// change VERSION_CURRENT to point to your new version:
|
||||
final static int VERSION_START = 0;
|
||||
|
||||
final static int VERSION_CURRENT = VERSION_START;
|
||||
final static int VERSION_META_ARRAY = 0;
|
||||
|
||||
final static int VERSION_CURRENT = VERSION_META_ARRAY;
|
||||
|
||||
private SegmentWriteState segmentState;
|
||||
private IndexOutput termsOut;
|
||||
|
||||
private List<FieldMetaData> fields;
|
||||
|
||||
private IndexOptions indexOptions;
|
||||
private boolean storePayloads;
|
||||
|
||||
private static class PendingTerm {
|
||||
private final byte[] bytes;
|
||||
public PendingTerm(byte[] bytes) {
|
||||
this.bytes = bytes;
|
||||
// information for wrapped PF, in current field
|
||||
private int longsSize;
|
||||
private long[] longs;
|
||||
boolean absolute;
|
||||
|
||||
private static class PulsingTermState extends BlockTermState {
|
||||
private byte[] bytes;
|
||||
private BlockTermState wrappedState;
|
||||
@Override
|
||||
public String toString() {
|
||||
if (bytes != null) {
|
||||
return "inlined";
|
||||
} else {
|
||||
return "not inlined wrapped=" + wrappedState;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
|
||||
|
||||
// one entry per position
|
||||
private final Position[] pending;
|
||||
private int pendingCount = 0; // -1 once we've hit too many positions
|
||||
|
@ -83,6 +105,15 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
|
|||
int endOffset;
|
||||
}
|
||||
|
||||
private static final class FieldMetaData {
|
||||
int fieldNumber;
|
||||
int longsSize;
|
||||
FieldMetaData(int number, int size) {
|
||||
fieldNumber = number;
|
||||
longsSize = size;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: -- lazy init this? ie, if every single term
|
||||
// was inlined (eg for a "primary key" field) then we
|
||||
// never need to use this fallback? Fallback writer for
|
||||
|
@ -92,23 +123,33 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
|
|||
/** If the total number of positions (summed across all docs
|
||||
* for this term) is <= maxPositions, then the postings are
|
||||
* inlined into terms dict */
|
||||
public PulsingPostingsWriter(int maxPositions, PostingsWriterBase wrappedPostingsWriter) {
|
||||
public PulsingPostingsWriter(SegmentWriteState state, int maxPositions, PostingsWriterBase wrappedPostingsWriter) {
|
||||
|
||||
pending = new Position[maxPositions];
|
||||
for(int i=0;i<maxPositions;i++) {
|
||||
pending[i] = new Position();
|
||||
}
|
||||
fields = new ArrayList<FieldMetaData>();
|
||||
|
||||
// We simply wrap another postings writer, but only call
|
||||
// on it when tot positions is >= the cutoff:
|
||||
this.wrappedPostingsWriter = wrappedPostingsWriter;
|
||||
this.segmentState = state;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void start(IndexOutput termsOut) throws IOException {
|
||||
public void init(IndexOutput termsOut) throws IOException {
|
||||
this.termsOut = termsOut;
|
||||
CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
|
||||
termsOut.writeVInt(pending.length); // encode maxPositions in header
|
||||
wrappedPostingsWriter.start(termsOut);
|
||||
wrappedPostingsWriter.init(termsOut);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BlockTermState newTermState() throws IOException {
|
||||
PulsingTermState state = new PulsingTermState();
|
||||
state.wrappedState = wrappedPostingsWriter.newTermState();
|
||||
return state;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@@ -123,11 +164,15 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
// Currently, this instance is re-used across fields, so
// our parent calls setField whenever the field changes
@Override
public void setField(FieldInfo fieldInfo) {
public int setField(FieldInfo fieldInfo) {
this.indexOptions = fieldInfo.getIndexOptions();
//if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
storePayloads = fieldInfo.hasPayloads();
wrappedPostingsWriter.setField(fieldInfo);
absolute = false;
longsSize = wrappedPostingsWriter.setField(fieldInfo);
longs = new long[longsSize];
fields.add(new FieldMetaData(fieldInfo.number, longsSize));
return 0;
//DEBUG = BlockTreeTermsWriter.DEBUG;
}
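
Across the PostingsWriterBase API, setField now returns an int: the number of metadata longs the writer needs per term. The pulsing writer above asks its wrapped writer for that count, keeps it for its own buffering and for the field summary, but reports 0 upward because the wrapped longs are folded into pulsing's opaque byte blob (see its encodeTerm later in this diff). For comparison, a hypothetical pass-through wrapper that adds no metadata of its own would simply forward the count:

// Hypothetical delegating writer (not part of this commit): forwards the wrapped
// longsSize so the terms dictionary allocates long[longsSize] for it directly.
@Override
public int setField(FieldInfo fieldInfo) {
  this.fieldInfo = fieldInfo;
  return wrappedPostingsWriter.setField(fieldInfo);
}
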
@ -219,18 +264,19 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
|
|||
|
||||
/** Called when we are done adding docs to this term */
|
||||
@Override
|
||||
public void finishTerm(TermStats stats) throws IOException {
|
||||
public void finishTerm(BlockTermState _state) throws IOException {
|
||||
PulsingTermState state = (PulsingTermState) _state;
|
||||
|
||||
// if (DEBUG) System.out.println("PW finishTerm docCount=" + stats.docFreq + " pendingCount=" + pendingCount + " pendingTerms.size()=" + pendingTerms.size());
|
||||
|
||||
assert pendingCount > 0 || pendingCount == -1;
|
||||
|
||||
if (pendingCount == -1) {
|
||||
wrappedPostingsWriter.finishTerm(stats);
|
||||
// Must add null entry to record terms that our
|
||||
// wrapped postings impl added
|
||||
pendingTerms.add(null);
|
||||
state.wrappedState.docFreq = state.docFreq;
|
||||
state.wrappedState.totalTermFreq = state.totalTermFreq;
|
||||
state.bytes = null;
|
||||
wrappedPostingsWriter.finishTerm(state.wrappedState);
|
||||
} else {
|
||||
|
||||
// There were few enough total occurrences for this
|
||||
// term, so we fully inline our postings data into
|
||||
// terms dict, now:
|
||||
|
@ -325,61 +371,54 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
|
|||
}
|
||||
}
|
||||
|
||||
final byte[] bytes = new byte[(int) buffer.getFilePointer()];
|
||||
buffer.writeTo(bytes, 0);
|
||||
pendingTerms.add(new PendingTerm(bytes));
|
||||
state.bytes = new byte[(int) buffer.getFilePointer()];
|
||||
buffer.writeTo(state.bytes, 0);
|
||||
buffer.reset();
|
||||
}
|
||||
|
||||
pendingCount = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void encodeTerm(long[] empty, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
|
||||
PulsingTermState state = (PulsingTermState)_state;
|
||||
assert empty.length == 0;
|
||||
this.absolute = this.absolute || absolute;
|
||||
if (state.bytes == null) {
|
||||
wrappedPostingsWriter.encodeTerm(longs, buffer, fieldInfo, state.wrappedState, this.absolute);
|
||||
for (int i = 0; i < longsSize; i++) {
|
||||
out.writeVLong(longs[i]);
|
||||
}
|
||||
buffer.writeTo(out);
|
||||
buffer.reset();
|
||||
this.absolute = false;
|
||||
} else {
|
||||
out.writeVInt(state.bytes.length);
|
||||
out.writeBytes(state.bytes, 0, state.bytes.length);
|
||||
this.absolute = this.absolute || absolute;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
wrappedPostingsWriter.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void flushTermsBlock(int start, int count) throws IOException {
|
||||
// if (DEBUG) System.out.println("PW: flushTermsBlock start=" + start + " count=" + count + " pendingTerms.size()=" + pendingTerms.size());
|
||||
int wrappedCount = 0;
|
||||
assert buffer.getFilePointer() == 0;
|
||||
assert start >= count;
|
||||
|
||||
final int limit = pendingTerms.size() - start + count;
|
||||
|
||||
for(int idx=pendingTerms.size()-start; idx<limit; idx++) {
|
||||
final PendingTerm term = pendingTerms.get(idx);
|
||||
if (term == null) {
|
||||
wrappedCount++;
|
||||
} else {
|
||||
buffer.writeVInt(term.bytes.length);
|
||||
buffer.writeBytes(term.bytes, 0, term.bytes.length);
|
||||
}
|
||||
if (wrappedPostingsWriter instanceof PulsingPostingsWriter ||
|
||||
VERSION_CURRENT < VERSION_META_ARRAY) {
|
||||
return;
|
||||
}
|
||||
|
||||
termsOut.writeVInt((int) buffer.getFilePointer());
|
||||
buffer.writeTo(termsOut);
|
||||
buffer.reset();
|
||||
|
||||
// TDOO: this could be somewhat costly since
|
||||
// pendingTerms.size() could be biggish?
|
||||
int futureWrappedCount = 0;
|
||||
final int limit2 = pendingTerms.size();
|
||||
for(int idx=limit;idx<limit2;idx++) {
|
||||
if (pendingTerms.get(idx) == null) {
|
||||
futureWrappedCount++;
|
||||
String summaryFileName = IndexFileNames.segmentFileName(segmentState.segmentInfo.name, segmentState.segmentSuffix, SUMMARY_EXTENSION);
|
||||
IndexOutput out = null;
|
||||
try {
|
||||
out = segmentState.directory.createOutput(summaryFileName, segmentState.context);
|
||||
CodecUtil.writeHeader(out, CODEC, VERSION_CURRENT);
|
||||
out.writeVInt(fields.size());
|
||||
for (FieldMetaData field : fields) {
|
||||
out.writeVInt(field.fieldNumber);
|
||||
out.writeVInt(field.longsSize);
|
||||
}
|
||||
out.close();
|
||||
} finally {
|
||||
IOUtils.closeWhileHandlingException(out);
|
||||
}
|
||||
|
||||
// Remove the terms we just wrote:
|
||||
pendingTerms.subList(pendingTerms.size()-start, limit).clear();
|
||||
|
||||
// if (DEBUG) System.out.println("PW: len=" + buffer.getFilePointer() + " fp=" + termsOut.getFilePointer() + " futureWrappedCount=" + futureWrappedCount + " wrappedCount=" + wrappedCount);
|
||||
// TODO: can we avoid calling this if all terms
|
||||
// were inlined...? Eg for a "primary key" field, the
|
||||
// wrapped codec is never invoked...
|
||||
wrappedPostingsWriter.flushTermsBlock(futureWrappedCount+wrappedCount, wrappedCount);
|
||||
}
|
||||
|
||||
// Pushes pending positions to the wrapped codec
|
||||
|
|
|
@ -31,6 +31,7 @@ import org.apache.lucene.index.IndexFileNames;
|
|||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
|
@ -115,15 +116,6 @@ public class SepPostingsReader extends PostingsReaderBase {
|
|||
long payloadFP;
|
||||
long skipFP;
|
||||
|
||||
// Only used for "primary" term state; these are never
|
||||
// copied on clone:
|
||||
|
||||
// TODO: these should somehow be stored per-TermsEnum
|
||||
// not per TermState; maybe somehow the terms dict
|
||||
// should load/manage the byte[]/DataReader for us?
|
||||
byte[] bytes;
|
||||
ByteArrayDataInput bytesReader;
|
||||
|
||||
@Override
|
||||
public SepTermState clone() {
|
||||
SepTermState other = new SepTermState();
|
||||
|
@ -182,40 +174,21 @@ public class SepPostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
|
||||
public void decodeTerm(long[] empty, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute)
|
||||
throws IOException {
|
||||
final SepTermState termState = (SepTermState) _termState;
|
||||
//System.out.println("SEPR: readTermsBlock termsIn.fp=" + termsIn.getFilePointer());
|
||||
final int len = termsIn.readVInt();
|
||||
//System.out.println(" numBytes=" + len);
|
||||
if (termState.bytes == null) {
|
||||
termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
|
||||
termState.bytesReader = new ByteArrayDataInput(termState.bytes);
|
||||
} else if (termState.bytes.length < len) {
|
||||
termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
|
||||
}
|
||||
termState.bytesReader.reset(termState.bytes, 0, len);
|
||||
termsIn.readBytes(termState.bytes, 0, len);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
|
||||
final SepTermState termState = (SepTermState) _termState;
|
||||
final boolean isFirstTerm = termState.termBlockOrd == 0;
|
||||
//System.out.println("SEPR.nextTerm termCount=" + termState.termBlockOrd + " isFirstTerm=" + isFirstTerm + " bytesReader.pos=" + termState.bytesReader.getPosition());
|
||||
//System.out.println(" docFreq=" + termState.docFreq);
|
||||
termState.docIndex.read(termState.bytesReader, isFirstTerm);
|
||||
//System.out.println(" docIndex=" + termState.docIndex);
|
||||
termState.docIndex.read(in, absolute);
|
||||
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
|
||||
termState.freqIndex.read(termState.bytesReader, isFirstTerm);
|
||||
termState.freqIndex.read(in, absolute);
|
||||
if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
//System.out.println(" freqIndex=" + termState.freqIndex);
|
||||
termState.posIndex.read(termState.bytesReader, isFirstTerm);
|
||||
termState.posIndex.read(in, absolute);
|
||||
//System.out.println(" posIndex=" + termState.posIndex);
|
||||
if (fieldInfo.hasPayloads()) {
|
||||
if (isFirstTerm) {
|
||||
termState.payloadFP = termState.bytesReader.readVLong();
|
||||
if (absolute) {
|
||||
termState.payloadFP = in.readVLong();
|
||||
} else {
|
||||
termState.payloadFP += termState.bytesReader.readVLong();
|
||||
termState.payloadFP += in.readVLong();
|
||||
}
|
||||
//System.out.println(" payloadFP=" + termState.payloadFP);
|
||||
}
|
||||
|
@ -223,14 +196,14 @@ public class SepPostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
if (termState.docFreq >= skipMinimum) {
|
||||
//System.out.println(" readSkip @ " + termState.bytesReader.getPosition());
|
||||
if (isFirstTerm) {
|
||||
termState.skipFP = termState.bytesReader.readVLong();
|
||||
//System.out.println(" readSkip @ " + in.getPosition());
|
||||
if (absolute) {
|
||||
termState.skipFP = in.readVLong();
|
||||
} else {
|
||||
termState.skipFP += termState.bytesReader.readVLong();
|
||||
termState.skipFP += in.readVLong();
|
||||
}
|
||||
//System.out.println(" skipFP=" + termState.skipFP);
|
||||
} else if (isFirstTerm) {
|
||||
} else if (absolute) {
|
||||
termState.skipFP = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,18 +18,17 @@ package org.apache.lucene.codecs.sep;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.TermStats;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -64,7 +63,6 @@ public final class SepPostingsWriter extends PostingsWriterBase {
|
|||
IndexOutput payloadOut;
|
||||
|
||||
IndexOutput skipOut;
|
||||
IndexOutput termsOut;
|
||||
|
||||
final SepSkipListWriter skipListWriter;
|
||||
/** Expert: The fraction of TermDocs entries stored in skip tables,
|
||||
|
@ -87,8 +85,6 @@ public final class SepPostingsWriter extends PostingsWriterBase {
|
|||
|
||||
final int totalNumDocs;
|
||||
|
||||
PendingTerm lastState;
|
||||
|
||||
boolean storePayloads;
|
||||
IndexOptions indexOptions;
|
||||
|
||||
|
@ -100,8 +96,9 @@ public final class SepPostingsWriter extends PostingsWriterBase {
|
|||
int lastDocID;
|
||||
int df;
|
||||
|
||||
// Holds pending byte[] blob for the current terms block
|
||||
private final RAMOutputStream indexBytesWriter = new RAMOutputStream();
|
||||
SepTermState lastState;
|
||||
long lastPayloadFP;
|
||||
long lastSkipFP;
|
||||
|
||||
public SepPostingsWriter(SegmentWriteState state, IntStreamFactory factory) throws IOException {
|
||||
this(state, factory, DEFAULT_SKIP_INTERVAL);
|
||||
|
@ -121,6 +118,7 @@ public final class SepPostingsWriter extends PostingsWriterBase {
|
|||
|
||||
docOut = factory.createOutput(state.directory, docFileName, state.context);
|
||||
docIndex = docOut.index();
|
||||
|
||||
if (state.fieldInfos.hasFreq()) {
|
||||
final String frqFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FREQ_EXTENSION);
|
||||
freqOut = factory.createOutput(state.directory, frqFileName, state.context);
|
||||
|
@ -157,8 +155,7 @@ public final class SepPostingsWriter extends PostingsWriterBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void start(IndexOutput termsOut) throws IOException {
|
||||
this.termsOut = termsOut;
|
||||
public void init(IndexOutput termsOut) throws IOException {
|
||||
CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
|
||||
// TODO: -- just ask skipper to "start" here
|
||||
termsOut.writeInt(skipInterval); // write skipInterval
|
||||
|
@ -166,6 +163,11 @@ public final class SepPostingsWriter extends PostingsWriterBase {
|
|||
termsOut.writeInt(skipMinimum); // write skipMinimum
|
||||
}
|
||||
|
||||
@Override
|
||||
public SepTermState newTermState() {
|
||||
return new SepTermState();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startTerm() throws IOException {
|
||||
docIndex.mark();
|
||||
|
@ -187,7 +189,7 @@ public final class SepPostingsWriter extends PostingsWriterBase {
|
|||
// Currently, this instance is re-used across fields, so
|
||||
// our parent calls setField whenever the field changes
|
||||
@Override
|
||||
public void setField(FieldInfo fieldInfo) {
|
||||
public int setField(FieldInfo fieldInfo) {
|
||||
this.fieldInfo = fieldInfo;
|
||||
this.indexOptions = fieldInfo.getIndexOptions();
|
||||
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
|
||||
|
@ -195,6 +197,24 @@ public final class SepPostingsWriter extends PostingsWriterBase {
|
|||
}
|
||||
skipListWriter.setIndexOptions(indexOptions);
|
||||
storePayloads = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && fieldInfo.hasPayloads();
|
||||
lastPayloadFP = 0;
|
||||
lastSkipFP = 0;
|
||||
lastState = setEmptyState();
|
||||
return 0;
|
||||
}
|
||||
|
||||
private SepTermState setEmptyState() {
|
||||
SepTermState emptyState = new SepTermState();
|
||||
emptyState.docIndex = docOut.index();
|
||||
if (indexOptions != IndexOptions.DOCS_ONLY) {
|
||||
emptyState.freqIndex = freqOut.index();
|
||||
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
emptyState.posIndex = posOut.index();
|
||||
}
|
||||
}
|
||||
emptyState.payloadFP = 0;
|
||||
emptyState.skipFP = 0;
|
||||
return emptyState;
|
||||
}
|
||||
|
||||
/** Adds a new doc in this term. If this returns null
|
||||
|
@ -262,135 +282,86 @@ public final class SepPostingsWriter extends PostingsWriterBase {
|
|||
lastPosition = 0;
|
||||
}
|
||||
|
||||
private static class PendingTerm {
|
||||
public final IntIndexOutput.Index docIndex;
|
||||
public final IntIndexOutput.Index freqIndex;
|
||||
public final IntIndexOutput.Index posIndex;
|
||||
private static class SepTermState extends BlockTermState {
|
||||
public IntIndexOutput.Index docIndex;
|
||||
public IntIndexOutput.Index freqIndex;
|
||||
public IntIndexOutput.Index posIndex;
|
||||
public long payloadFP;
|
||||
public long skipFP;
|
||||
|
||||
public PendingTerm(IntIndexOutput.Index docIndex, IntIndexOutput.Index freqIndex, IntIndexOutput.Index posIndex, long payloadFP, long skipFP) {
|
||||
this.docIndex = docIndex;
|
||||
this.freqIndex = freqIndex;
|
||||
this.posIndex = posIndex;
|
||||
this.payloadFP = payloadFP;
|
||||
this.skipFP = skipFP;
|
||||
}
|
||||
}
|
||||
|
||||
private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
|
||||
|
||||
/** Called when we are done adding docs to this term */
|
||||
@Override
|
||||
public void finishTerm(TermStats stats) throws IOException {
|
||||
public void finishTerm(BlockTermState _state) throws IOException {
|
||||
SepTermState state = (SepTermState)_state;
|
||||
// TODO: -- wasteful we are counting this in two places?
|
||||
assert stats.docFreq > 0;
|
||||
assert stats.docFreq == df;
|
||||
assert state.docFreq > 0;
|
||||
assert state.docFreq == df;
|
||||
|
||||
final IntIndexOutput.Index docIndexCopy = docOut.index();
|
||||
docIndexCopy.copyFrom(docIndex, false);
|
||||
|
||||
final IntIndexOutput.Index freqIndexCopy;
|
||||
final IntIndexOutput.Index posIndexCopy;
|
||||
state.docIndex = docOut.index();
|
||||
state.docIndex.copyFrom(docIndex, false);
|
||||
if (indexOptions != IndexOptions.DOCS_ONLY) {
|
||||
freqIndexCopy = freqOut.index();
|
||||
freqIndexCopy.copyFrom(freqIndex, false);
|
||||
state.freqIndex = freqOut.index();
|
||||
state.freqIndex.copyFrom(freqIndex, false);
|
||||
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
posIndexCopy = posOut.index();
|
||||
posIndexCopy.copyFrom(posIndex, false);
|
||||
state.posIndex = posOut.index();
|
||||
state.posIndex.copyFrom(posIndex, false);
|
||||
} else {
|
||||
posIndexCopy = null;
|
||||
state.posIndex = null;
|
||||
}
|
||||
} else {
|
||||
freqIndexCopy = null;
|
||||
posIndexCopy = null;
|
||||
state.freqIndex = null;
|
||||
state.posIndex = null;
|
||||
}
|
||||
|
||||
final long skipFP;
|
||||
if (df >= skipMinimum) {
|
||||
skipFP = skipOut.getFilePointer();
|
||||
state.skipFP = skipOut.getFilePointer();
|
||||
//System.out.println(" skipFP=" + skipFP);
|
||||
skipListWriter.writeSkip(skipOut);
|
||||
//System.out.println(" numBytes=" + (skipOut.getFilePointer()-skipFP));
|
||||
} else {
|
||||
skipFP = -1;
|
||||
state.skipFP = -1;
|
||||
}
|
||||
state.payloadFP = payloadStart;
|
||||
|
||||
lastDocID = 0;
|
||||
df = 0;
|
||||
|
||||
pendingTerms.add(new PendingTerm(docIndexCopy,
|
||||
freqIndexCopy,
|
||||
posIndexCopy,
|
||||
payloadStart,
|
||||
skipFP));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void flushTermsBlock(int start, int count) throws IOException {
|
||||
//System.out.println("SEPW: flushTermsBlock: start=" + start + " count=" + count + " pendingTerms.size()=" + pendingTerms.size() + " termsOut.fp=" + termsOut.getFilePointer());
|
||||
assert indexBytesWriter.getFilePointer() == 0;
|
||||
final int absStart = pendingTerms.size() - start;
|
||||
final List<PendingTerm> slice = pendingTerms.subList(absStart, absStart+count);
|
||||
|
||||
if (count == 0) {
|
||||
termsOut.writeByte((byte) 0);
|
||||
return;
|
||||
public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
|
||||
SepTermState state = (SepTermState)_state;
|
||||
if (absolute) {
|
||||
lastSkipFP = 0;
|
||||
lastPayloadFP = 0;
|
||||
lastState = state;
|
||||
}
|
||||
|
||||
long lastSkipFP = 0;
|
||||
long lastPayloadFP = 0;
|
||||
|
||||
boolean isFirstTerm = true;
|
||||
|
||||
for(int idx=0;idx<slice.size();idx++) {
|
||||
if (isFirstTerm) {
|
||||
lastState = slice.get(idx);
|
||||
}
|
||||
final PendingTerm t = slice.get(idx);
|
||||
//System.out.println(" last(pure): doc="+lastState.docIndex +" frq=" + lastState.freqIndex+" pos="+lastState.posIndex);
|
||||
lastState.docIndex.copyFrom(t.docIndex, false);
|
||||
lastState.docIndex.write(indexBytesWriter, isFirstTerm);
|
||||
//System.out.print(" doc=" + lastState.docIndex + " 1FP=" + indexBytesWriter.getFilePointer());
|
||||
if (indexOptions != IndexOptions.DOCS_ONLY) {
|
||||
lastState.freqIndex.copyFrom(t.freqIndex, false);
|
||||
lastState.freqIndex.write(indexBytesWriter, isFirstTerm);
|
||||
//System.out.print(" frq=" + lastState.freqIndex + " 2FP=" + indexBytesWriter.getFilePointer());
|
||||
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
lastState.posIndex.copyFrom(t.posIndex, false);
|
||||
lastState.posIndex.write(indexBytesWriter, isFirstTerm);
|
||||
//System.out.print(" pos=" + lastState.posIndex + " 3FP=" + indexBytesWriter.getFilePointer());
|
||||
if (storePayloads) {
|
||||
if (isFirstTerm) {
|
||||
//System.out.print(" payFP=" + (t.payloadFP));
|
||||
indexBytesWriter.writeVLong(t.payloadFP);
|
||||
} else {
|
||||
//System.out.print(" payFP=" + (t.payloadFP - lastPayloadFP));
|
||||
indexBytesWriter.writeVLong(t.payloadFP - lastPayloadFP);
|
||||
}
|
||||
lastPayloadFP = t.payloadFP;
|
||||
lastState.docIndex.copyFrom(state.docIndex, false);
|
||||
lastState.docIndex.write(out, absolute);
|
||||
if (indexOptions != IndexOptions.DOCS_ONLY) {
|
||||
lastState.freqIndex.copyFrom(state.freqIndex, false);
|
||||
lastState.freqIndex.write(out, absolute);
|
||||
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
lastState.posIndex.copyFrom(state.posIndex, false);
|
||||
lastState.posIndex.write(out, absolute);
|
||||
if (storePayloads) {
|
||||
if (absolute) {
|
||||
out.writeVLong(state.payloadFP);
|
||||
} else {
|
||||
out.writeVLong(state.payloadFP - lastPayloadFP);
|
||||
}
|
||||
lastPayloadFP = state.payloadFP;
|
||||
}
|
||||
}
|
||||
if (t.skipFP != -1) {
|
||||
if (isFirstTerm) {
|
||||
//System.out.print(" a.skipFP=" + (t.skipFP));
|
||||
indexBytesWriter.writeVLong(t.skipFP);
|
||||
} else {
|
||||
//System.out.print(" b.skipFP=" + (t.skipFP - lastSkipFP));
|
||||
indexBytesWriter.writeVLong(t.skipFP - lastSkipFP);
|
||||
}
|
||||
lastSkipFP = t.skipFP;
|
||||
}
|
||||
//System.out.println();
|
||||
//System.out.println(" last(copy): doc="+lastState.docIndex +" frq=" + lastState.freqIndex+" pos="+lastState.posIndex);
|
||||
isFirstTerm = false;
|
||||
}
|
||||
|
||||
termsOut.writeVLong((int) indexBytesWriter.getFilePointer());
|
||||
indexBytesWriter.writeTo(termsOut);
|
||||
indexBytesWriter.reset();
|
||||
slice.clear();
|
||||
if (state.skipFP != -1) {
|
||||
if (absolute) {
|
||||
out.writeVLong(state.skipFP);
|
||||
} else {
|
||||
out.writeVLong(state.skipFP - lastSkipFP);
|
||||
}
|
||||
lastSkipFP = state.skipFP;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@@ -158,6 +158,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
final long sumDocFreq = in.readVLong();
final int docCount = in.readVInt();
final int longsSize = in.readVInt();
if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
}

@@ -168,7 +169,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + in + ")");
}
final long indexStartFP = indexIn.readVLong();
FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, indexStartFP, indexIn));
FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, indexStartFP, longsSize, indexIn));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")");
}
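
BlockTreeTermsReader gets the same treatment as BlockTermsReader at the top of this commit: the per-field summary gains a LongsSize entry, FieldReader carries it, and each enum frame later allocates a longs buffer and decodes metadata through decodeTerm. The reads visible in this hunk, in order (earlier fields of the summary entry are elided here):

// Per-field summary reads shown above (sketch; only the fields visible in this hunk):
//   SumTotalTermFreq = DOCS_ONLY ? -1 : in.readVLong()
//   SumDocFreq       = in.readVLong()
//   DocCount         = in.readVInt()
//   LongsSize        = in.readVInt()        // new in this commit
//   IndexStartFP     = indexIn.readVLong()  // from the terms index file
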
@ -448,11 +449,12 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
final long indexStartFP;
|
||||
final long rootBlockFP;
|
||||
final BytesRef rootCode;
|
||||
private final FST<BytesRef> index;
|
||||
final int longsSize;
|
||||
|
||||
private final FST<BytesRef> index;
|
||||
//private boolean DEBUG;
|
||||
|
||||
FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, IndexInput indexIn) throws IOException {
|
||||
FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, int longsSize, IndexInput indexIn) throws IOException {
|
||||
assert numTerms > 0;
|
||||
this.fieldInfo = fieldInfo;
|
||||
//DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id");
|
||||
|
@ -462,6 +464,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
this.docCount = docCount;
|
||||
this.indexStartFP = indexStartFP;
|
||||
this.rootCode = rootCode;
|
||||
this.longsSize = longsSize;
|
||||
// if (DEBUG) {
|
||||
// System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor);
|
||||
// }
|
||||
|
@ -612,6 +615,12 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
FST.Arc<BytesRef> arc;
|
||||
|
||||
final BlockTermState termState;
|
||||
|
||||
// metadata buffer, holding monotonical values
|
||||
public long[] longs;
|
||||
// metadata buffer, holding general values
|
||||
public byte[] bytes;
|
||||
ByteArrayDataInput bytesReader;
|
||||
|
||||
// Cumulative output so far
|
||||
BytesRef outputPrefix;
|
||||
|
@ -621,8 +630,9 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
|
||||
public Frame(int ord) throws IOException {
|
||||
this.ord = ord;
|
||||
termState = postingsReader.newTermState();
|
||||
termState.totalTermFreq = -1;
|
||||
this.termState = postingsReader.newTermState();
|
||||
this.termState.totalTermFreq = -1;
|
||||
this.longs = new long[longsSize];
|
||||
}
|
||||
|
||||
void loadNextFloorBlock() throws IOException {
|
||||
|
@ -720,8 +730,17 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
|
||||
termState.termBlockOrd = 0;
|
||||
nextEnt = 0;
|
||||
|
||||
postingsReader.readTermsBlock(in, fieldInfo, termState);
|
||||
|
||||
// metadata
|
||||
numBytes = in.readVInt();
|
||||
if (bytes == null) {
|
||||
bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
|
||||
bytesReader = new ByteArrayDataInput();
|
||||
} else if (bytes.length < numBytes) {
|
||||
bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
|
||||
}
|
||||
in.readBytes(bytes, 0, numBytes);
|
||||
bytesReader.reset(bytes, 0, numBytes);
|
||||
|
||||
if (!isLastInFloor) {
|
||||
// Sub-blocks of a single floor block are always
|
||||
|
@ -774,12 +793,9 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
|
||||
// lazily catch up on metadata decode:
|
||||
final int limit = getTermBlockOrd();
|
||||
boolean absolute = metaDataUpto == 0;
|
||||
assert limit > 0;
|
||||
|
||||
// We must set/incr state.termCount because
|
||||
// postings impl can look at this
|
||||
termState.termBlockOrd = metaDataUpto;
|
||||
|
||||
// TODO: better API would be "jump straight to term=N"???
|
||||
while (metaDataUpto < limit) {
|
||||
|
||||
|
@ -791,17 +807,24 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
|
||||
// TODO: if docFreq were bulk decoded we could
|
||||
// just skipN here:
|
||||
|
||||
// stats
|
||||
termState.docFreq = statsReader.readVInt();
|
||||
//if (DEBUG) System.out.println(" dF=" + state.docFreq);
|
||||
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
|
||||
termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
|
||||
//if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
|
||||
}
|
||||
// metadata
|
||||
for (int i = 0; i < longsSize; i++) {
|
||||
longs[i] = bytesReader.readVLong();
|
||||
}
|
||||
postingsReader.decodeTerm(longs, bytesReader, fieldInfo, termState, absolute);
|
||||
|
||||
postingsReader.nextTerm(fieldInfo, termState);
|
||||
metaDataUpto++;
|
||||
termState.termBlockOrd++;
|
||||
absolute = false;
|
||||
}
|
||||
termState.termBlockOrd = metaDataUpto;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1707,6 +1730,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
if (arc.output != NO_OUTPUT) {
|
||||
output = fstOutputs.add(output, arc.output);
|
||||
}
|
||||
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
|
||||
// }
|
||||
|
@ -2290,10 +2314,17 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
|
||||
final BlockTermState state;
|
||||
|
||||
// metadata buffer, holding monotonical values
|
||||
public long[] longs;
|
||||
// metadata buffer, holding general values
|
||||
public byte[] bytes;
|
||||
ByteArrayDataInput bytesReader;
|
||||
|
||||
public Frame(int ord) throws IOException {
|
||||
this.ord = ord;
|
||||
state = postingsReader.newTermState();
|
||||
state.totalTermFreq = -1;
|
||||
this.state = postingsReader.newTermState();
|
||||
this.state.totalTermFreq = -1;
|
||||
this.longs = new long[longsSize];
|
||||
}
|
||||
|
||||
public void setFloorData(ByteArrayDataInput in, BytesRef source) {
|
||||
|
@ -2391,7 +2422,17 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
|
||||
// TODO: we could skip this if !hasTerms; but
|
||||
// that's rare so won't help much
|
||||
postingsReader.readTermsBlock(in, fieldInfo, state);
|
||||
// metadata
|
||||
numBytes = in.readVInt();
|
||||
if (bytes == null) {
|
||||
bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
|
||||
bytesReader = new ByteArrayDataInput();
|
||||
} else if (bytes.length < numBytes) {
|
||||
bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
|
||||
}
|
||||
in.readBytes(bytes, 0, numBytes);
|
||||
bytesReader.reset(bytes, 0, numBytes);
|
||||
|
||||
|
||||
// Sub-blocks of a single floor block are always
|
||||
// written one after another -- tail recurse:
|
||||
|
@ -2575,12 +2616,9 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
|
||||
// lazily catch up on metadata decode:
|
||||
final int limit = getTermBlockOrd();
|
||||
boolean absolute = metaDataUpto == 0;
|
||||
assert limit > 0;
|
||||
|
||||
// We must set/incr state.termCount because
|
||||
// postings impl can look at this
|
||||
state.termBlockOrd = metaDataUpto;
|
||||
|
||||
// TODO: better API would be "jump straight to term=N"???
|
||||
while (metaDataUpto < limit) {
|
||||
|
||||
|
@ -2592,17 +2630,24 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
|
||||
// TODO: if docFreq were bulk decoded we could
|
||||
// just skipN here:
|
||||
|
||||
// stats
|
||||
state.docFreq = statsReader.readVInt();
|
||||
//if (DEBUG) System.out.println(" dF=" + state.docFreq);
|
||||
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
|
||||
state.totalTermFreq = state.docFreq + statsReader.readVLong();
|
||||
//if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
|
||||
}
|
||||
// metadata
|
||||
for (int i = 0; i < longsSize; i++) {
|
||||
longs[i] = bytesReader.readVLong();
|
||||
}
|
||||
postingsReader.decodeTerm(longs, bytesReader, fieldInfo, state, absolute);
|
||||
|
||||
postingsReader.nextTerm(fieldInfo, state);
|
||||
metaDataUpto++;
|
||||
state.termBlockOrd++;
|
||||
absolute = false;
|
||||
}
|
||||
state.termBlockOrd = metaDataUpto;
|
||||
}
|
||||
|
||||
// Used only by assert
|
||||
|
|
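The metadata catch-up loops above are the reader-side half of the new contract: each term contributes longsSize delta-coded longs plus an opaque byte region, and values are absolute only for the first term decoded after a seek. A minimal, self-contained sketch of that accumulation (plain Java; the names and numbers are illustrative, not code from this patch):

// Illustrative only: accumulate per-term metadata longs the way decodeTerm()
// is expected to.  'deltas' stands in for the VLongs read from the metadata blob.
final class LazyMetadataDecode {
  static long[] decodeBlock(long[][] deltas, int longsSize) {
    long[] state = new long[longsSize];      // e.g. doc/pos file pointers
    boolean absolute = true;                 // first term after a seek is absolute
    for (long[] termDeltas : deltas) {
      for (int i = 0; i < longsSize; i++) {
        state[i] = (absolute ? 0 : state[i]) + termDeltas[i];
      }
      absolute = false;                      // later terms are deltas against the previous one
    }
    return state;                            // metadata of the last term caught up to
  }

  public static void main(String[] args) {
    long[][] deltas = { {100, 40}, {25, 8}, {3, 0} };   // three terms, longsSize == 2
    long[] last = decodeBlock(deltas, 2);
    System.out.println(last[0] + " " + last[1]);        // prints "128 48"
  }
}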
|
@ -104,13 +104,12 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* and decoding the Postings Metadata and Term Metadata sections.</p>
|
||||
*
|
||||
* <ul>
|
||||
 * <!-- TODO: expand on this, it's not really correct and doesn't explain sub-blocks etc -->
|
||||
* <li>TermsDict (.tim) --> Header, <i>Postings Metadata</i>, Block<sup>NumBlocks</sup>,
|
||||
* <li>TermsDict (.tim) --> Header, <i>Postings Header</i>, NodeBlock<sup>NumBlocks</sup>,
|
||||
* FieldSummary, DirOffset</li>
|
||||
* <li>Block --> SuffixBlock, StatsBlock, MetadataBlock</li>
|
||||
* <li>SuffixBlock --> EntryCount, SuffixLength, Byte<sup>SuffixLength</sup></li>
|
||||
* <li>StatsBlock --> StatsLength, <DocFreq, TotalTermFreq><sup>EntryCount</sup></li>
|
||||
* <li>MetadataBlock --> MetaLength, <<i>Term Metadata</i>><sup>EntryCount</sup></li>
|
||||
* <li>NodeBlock --> (OuterNode | InnerNode)</li>
|
||||
* <li>OuterNode --> EntryCount, SuffixLength, Byte<sup>SuffixLength</sup>, StatsLength, < TermStats ><sup>EntryCount</sup>, MetaLength, <<i>Term Metadata</i>><sup>EntryCount</sup></li>
|
||||
* <li>InnerNode --> EntryCount, SuffixLength[,Sub?], Byte<sup>SuffixLength</sup>, StatsLength, < TermStats ? ><sup>EntryCount</sup>, MetaLength, <<i>Term Metadata ? </i>><sup>EntryCount</sup></li>
|
||||
* <li>TermStats --> DocFreq, TotalTermFreq </li>
|
||||
* <li>FieldSummary --> NumFields, <FieldNumber, NumTerms, RootCodeLength, Byte<sup>RootCodeLength</sup>,
|
||||
* SumDocFreq, DocCount><sup>NumFields</sup></li>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
|
@ -136,7 +135,9 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li>DocCount is the number of documents that have at least one posting for this field.</li>
|
||||
* <li>PostingsMetadata and TermMetadata are plugged into by the specific postings implementation:
|
||||
* these contain arbitrary per-file data (such as parameters or versioning information)
|
||||
* and per-term data (such as pointers to inverted files).
|
||||
* and per-term data (such as pointers to inverted files).</li>
|
||||
* <li>For inner nodes of the tree, every entry will steal one bit to mark whether it points
|
||||
 * to child nodes (sub-block). If so, the corresponding TermStats and TermMetadata are omitted.</li>
|
||||
* </ul>
|
||||
* <a name="Termindex" id="Termindex"></a>
|
||||
* <h3>Term Index</h3>
|
||||
|
@ -237,8 +238,9 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
public final long sumTotalTermFreq;
|
||||
public final long sumDocFreq;
|
||||
public final int docCount;
|
||||
private final int longsSize;
|
||||
|
||||
public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount) {
|
||||
public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) {
|
||||
assert numTerms > 0;
|
||||
this.fieldInfo = fieldInfo;
|
||||
assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms;
|
||||
|
@ -248,6 +250,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
this.sumTotalTermFreq = sumTotalTermFreq;
|
||||
this.sumDocFreq = sumDocFreq;
|
||||
this.docCount = docCount;
|
||||
this.longsSize = longsSize;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -300,7 +303,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
|
||||
// System.out.println("BTW.init seg=" + state.segmentName);
|
||||
|
||||
postingsWriter.start(out); // have consumer write its format/header
|
||||
postingsWriter.init(out); // have consumer write its format/header
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
|
@ -354,12 +357,13 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
|
||||
private static final class PendingTerm extends PendingEntry {
|
||||
public final BytesRef term;
|
||||
public final TermStats stats;
|
||||
// stats + metadata
|
||||
public final BlockTermState state;
|
||||
|
||||
public PendingTerm(BytesRef term, TermStats stats) {
|
||||
public PendingTerm(BytesRef term, BlockTermState state) {
|
||||
super(true);
|
||||
this.term = term;
|
||||
this.stats = stats;
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -480,6 +484,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
|
||||
class TermsWriter extends TermsConsumer {
|
||||
private final FieldInfo fieldInfo;
|
||||
private final int longsSize;
|
||||
private long numTerms;
|
||||
long sumTotalTermFreq;
|
||||
long sumDocFreq;
|
||||
|
@ -839,11 +844,16 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
final List<FST<BytesRef>> subIndices;
|
||||
|
||||
int termCount;
|
||||
|
||||
long[] longs = new long[longsSize];
|
||||
boolean absolute = true;
|
||||
|
||||
if (isLeafBlock) {
|
||||
subIndices = null;
|
||||
for (PendingEntry ent : slice) {
|
||||
assert ent.isTerm;
|
||||
PendingTerm term = (PendingTerm) ent;
|
||||
BlockTermState state = term.state;
|
||||
final int suffix = term.term.length - prefixLength;
|
||||
// if (DEBUG) {
|
||||
// BytesRef suffixBytes = new BytesRef(suffix);
|
||||
|
@ -852,15 +862,25 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
// System.out.println(" write term suffix=" + suffixBytes);
|
||||
// }
|
||||
// For leaf block we write suffix straight
|
||||
bytesWriter.writeVInt(suffix);
|
||||
bytesWriter.writeBytes(term.term.bytes, prefixLength, suffix);
|
||||
suffixWriter.writeVInt(suffix);
|
||||
suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix);
|
||||
|
||||
// Write term stats, to separate byte[] blob:
|
||||
bytesWriter2.writeVInt(term.stats.docFreq);
|
||||
statsWriter.writeVInt(state.docFreq);
|
||||
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
|
||||
assert term.stats.totalTermFreq >= term.stats.docFreq: term.stats.totalTermFreq + " vs " + term.stats.docFreq;
|
||||
bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq);
|
||||
assert state.totalTermFreq >= state.docFreq: state.totalTermFreq + " vs " + state.docFreq;
|
||||
statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
|
||||
}
|
||||
|
||||
// Write term meta data
|
||||
postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute);
|
||||
for (int pos = 0; pos < longsSize; pos++) {
|
||||
assert longs[pos] >= 0;
|
||||
metaWriter.writeVLong(longs[pos]);
|
||||
}
|
||||
bytesWriter.writeTo(metaWriter);
|
||||
bytesWriter.reset();
|
||||
absolute = false;
|
||||
}
|
||||
termCount = length;
|
||||
} else {
|
||||
|
@ -869,6 +889,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
for (PendingEntry ent : slice) {
|
||||
if (ent.isTerm) {
|
||||
PendingTerm term = (PendingTerm) ent;
|
||||
BlockTermState state = term.state;
|
||||
final int suffix = term.term.length - prefixLength;
|
||||
// if (DEBUG) {
|
||||
// BytesRef suffixBytes = new BytesRef(suffix);
|
||||
|
@ -878,16 +899,34 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
// }
|
||||
// For non-leaf block we borrow 1 bit to record
|
||||
// if entry is term or sub-block
|
||||
bytesWriter.writeVInt(suffix<<1);
|
||||
bytesWriter.writeBytes(term.term.bytes, prefixLength, suffix);
|
||||
suffixWriter.writeVInt(suffix<<1);
|
||||
suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix);
|
||||
|
||||
// Write term stats, to separate byte[] blob:
|
||||
bytesWriter2.writeVInt(term.stats.docFreq);
|
||||
statsWriter.writeVInt(state.docFreq);
|
||||
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
|
||||
assert term.stats.totalTermFreq >= term.stats.docFreq;
|
||||
bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq);
|
||||
assert state.totalTermFreq >= state.docFreq;
|
||||
statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
|
||||
}
|
||||
|
||||
// TODO: now that terms dict "sees" these longs,
|
||||
// we can explore better column-stride encodings
|
||||
// to encode all long[0]s for this block at
|
||||
// once, all long[1]s, etc., e.g. using
|
||||
// Simple64. Alternatively, we could interleave
|
||||
// stats + meta ... no reason to have them
|
||||
// separate anymore:
|
||||
|
||||
// Write term meta data
|
||||
postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute);
|
||||
for (int pos = 0; pos < longsSize; pos++) {
|
||||
assert longs[pos] >= 0;
|
||||
metaWriter.writeVLong(longs[pos]);
|
||||
}
|
||||
bytesWriter.writeTo(metaWriter);
|
||||
bytesWriter.reset();
|
||||
absolute = false;
|
||||
|
||||
termCount++;
|
||||
} else {
|
||||
PendingBlock block = (PendingBlock) ent;
|
||||
|
@ -897,8 +936,8 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
|
||||
// For non-leaf block we borrow 1 bit to record
|
||||
// if entry is term or sub-block
|
||||
bytesWriter.writeVInt((suffix<<1)|1);
|
||||
bytesWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
|
||||
suffixWriter.writeVInt((suffix<<1)|1);
|
||||
suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
|
||||
assert block.fp < startFP;
|
||||
|
||||
// if (DEBUG) {
|
||||
|
@ -908,7 +947,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
// System.out.println(" write sub-block suffix=" + toString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
|
||||
// }
|
||||
|
||||
bytesWriter.writeVLong(startFP - block.fp);
|
||||
suffixWriter.writeVLong(startFP - block.fp);
|
||||
subIndices.add(block.index);
|
||||
}
|
||||
}
|
||||
|
@ -921,17 +960,19 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
// search on lookup
|
||||
|
||||
// Write suffixes byte[] blob to terms dict output:
|
||||
out.writeVInt((int) (bytesWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0));
|
||||
bytesWriter.writeTo(out);
|
||||
bytesWriter.reset();
|
||||
out.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0));
|
||||
suffixWriter.writeTo(out);
|
||||
suffixWriter.reset();
|
||||
|
||||
// Write term stats byte[] blob
|
||||
out.writeVInt((int) bytesWriter2.getFilePointer());
|
||||
bytesWriter2.writeTo(out);
|
||||
bytesWriter2.reset();
|
||||
out.writeVInt((int) statsWriter.getFilePointer());
|
||||
statsWriter.writeTo(out);
|
||||
statsWriter.reset();
|
||||
|
||||
// Have postings writer write block
|
||||
postingsWriter.flushTermsBlock(futureTermCount+termCount, termCount);
|
||||
// Write term meta data byte[] blob
|
||||
out.writeVInt((int) metaWriter.getFilePointer());
|
||||
metaWriter.writeTo(out);
|
||||
metaWriter.reset();
|
||||
|
||||
// Remove slice replaced by block:
|
||||
slice.clear();
|
||||
|
@ -967,7 +1008,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
PackedInts.COMPACT,
|
||||
true, 15);
|
||||
|
||||
postingsWriter.setField(fieldInfo);
|
||||
this.longsSize = postingsWriter.setField(fieldInfo);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -998,8 +1039,13 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
//if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment + " df=" + stats.docFreq);
|
||||
|
||||
blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput());
|
||||
pending.add(new PendingTerm(BytesRef.deepCopyOf(text), stats));
|
||||
postingsWriter.finishTerm(stats);
|
||||
BlockTermState state = postingsWriter.newTermState();
|
||||
state.docFreq = stats.docFreq;
|
||||
state.totalTermFreq = stats.totalTermFreq;
|
||||
postingsWriter.finishTerm(state);
|
||||
|
||||
PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state);
|
||||
pending.add(term);
|
||||
numTerms++;
|
||||
}
|
||||
|
||||
|
@ -1038,7 +1084,8 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
indexStartFP,
|
||||
sumTotalTermFreq,
|
||||
sumDocFreq,
|
||||
docCount));
|
||||
docCount,
|
||||
longsSize));
|
||||
} else {
|
||||
assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1;
|
||||
assert sumDocFreq == 0;
|
||||
|
@ -1046,8 +1093,10 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
}
|
||||
}
|
||||
|
||||
private final RAMOutputStream suffixWriter = new RAMOutputStream();
|
||||
private final RAMOutputStream statsWriter = new RAMOutputStream();
|
||||
private final RAMOutputStream metaWriter = new RAMOutputStream();
|
||||
private final RAMOutputStream bytesWriter = new RAMOutputStream();
|
||||
private final RAMOutputStream bytesWriter2 = new RAMOutputStream();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1072,6 +1121,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
}
|
||||
out.writeVLong(field.sumDocFreq);
|
||||
out.writeVInt(field.docCount);
|
||||
out.writeVInt(field.longsSize);
|
||||
indexOut.writeVLong(field.indexStartFP);
|
||||
}
|
||||
writeTrailer(out, dirStart);
|
||||
|
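With the separate suffixWriter, statsWriter and metaWriter above, each on-disk block becomes three length-prefixed regions instead of a postings-writer-managed terms block. A rough sketch of that framing, assuming the layout read back by the Frame code earlier (plain java.io, using fixed-width ints where the real code uses VInts):

import java.io.DataOutput;
import java.io.IOException;

// Illustrative framing only; writeInt/write are simplifications of the
// VInt-prefixed blobs the writer actually emits.
final class BlockFramingSketch {
  static void writeBlock(DataOutput out, boolean isLeafBlock,
                         byte[] suffixes, byte[] stats, byte[] meta) throws IOException {
    out.writeInt((suffixes.length << 1) | (isLeafBlock ? 1 : 0)); // suffix blob, leaf bit
    out.write(suffixes);
    out.writeInt(stats.length);                                   // docFreq/totalTermFreq per term
    out.write(stats);
    out.writeInt(meta.length);                                    // longs[] + byte[] metadata per term
    out.write(meta);
  }
}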
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.index.DocsEnum;
|
|||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
||||
/** The core terms dictionaries (BlockTermsReader,
|
||||
|
@ -55,7 +56,7 @@ public abstract class PostingsReaderBase implements Closeable {
|
|||
public abstract BlockTermState newTermState() throws IOException;
|
||||
|
||||
/** Actually decode metadata for next term */
|
||||
public abstract void nextTerm(FieldInfo fieldInfo, BlockTermState state) throws IOException;
|
||||
public abstract void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
|
||||
|
||||
/** Must fully consume state, since after this call that
|
||||
* TermState may be reused. */
|
||||
|
@ -68,9 +69,4 @@ public abstract class PostingsReaderBase implements Closeable {
|
|||
|
||||
@Override
|
||||
public abstract void close() throws IOException;
|
||||
|
||||
/** Reads data for all terms in the next block; this
|
||||
* method should merely load the byte[] blob but not
|
||||
* decode, which is done in {@link #nextTerm}. */
|
||||
public abstract void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState termState) throws IOException;
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.codecs;
|
|||
import java.io.IOException;
|
||||
import java.io.Closeable;
|
||||
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
|
||||
|
@ -48,25 +49,31 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Clo
|
|||
/** Called once after startup, before any terms have been
|
||||
* added. Implementations typically write a header to
|
||||
* the provided {@code termsOut}. */
|
||||
public abstract void start(IndexOutput termsOut) throws IOException;
|
||||
public abstract void init(IndexOutput termsOut) throws IOException;
|
||||
|
||||
/** Return a newly created empty TermState */
|
||||
public abstract BlockTermState newTermState() throws IOException;
|
||||
|
||||
/** Start a new term. Note that a matching call to {@link
|
||||
* #finishTerm(TermStats)} is done, only if the term has at least one
|
||||
   * #finishTerm(BlockTermState)} is done only if the term has at least one
|
||||
* document. */
|
||||
public abstract void startTerm() throws IOException;
|
||||
|
||||
/** Flush count terms starting at start "backwards", as a
|
||||
* block. start is a negative offset from the end of the
|
||||
* terms stack, ie bigger start means further back in
|
||||
* the stack. */
|
||||
public abstract void flushTermsBlock(int start, int count) throws IOException;
|
||||
|
||||
/** Finishes the current term. The provided {@link
|
||||
* TermStats} contains the term's summary statistics. */
|
||||
public abstract void finishTerm(TermStats stats) throws IOException;
|
||||
* BlockTermState} contains the term's summary statistics,
|
||||
   * and will hold metadata from the PBF when returned. */
|
||||
public abstract void finishTerm(BlockTermState state) throws IOException;
|
||||
|
||||
/** Called when the writing switches to another field. */
|
||||
public abstract void setField(FieldInfo fieldInfo);
|
||||
/**
|
||||
* Encode metadata as long[] and byte[]. {@code absolute} controls
|
||||
   * whether the current term is written with absolute values or delta encoded against the previous term.
|
||||
*/
|
||||
public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
|
||||
|
||||
/**
|
||||
   * Return the fixed number of metadata longs per term;
|
||||
* called when the writing switches to another field. */
|
||||
public abstract int setField(FieldInfo fieldInfo);
|
||||
|
||||
@Override
|
||||
public abstract void close() throws IOException;
|
||||
|
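The revised base classes split per-term metadata into a fixed number of longs (returned by setField) and an arbitrary byte stream, written by encodeTerm and consumed, in the same order, by decodeTerm. A hedged sketch of a matching pair, with invented names and plain java.io streams standing in for Lucene's DataOutput/DataInput:

import java.io.*;

// Not an actual codec: a toy writer/reader pair that honors the symmetry the
// new API requires.  Monotonic values go into longs[], everything else into
// the byte stream; absolute == true resets the delta base (first term of a block).
final class SketchTermCodec {
  static int setField() { return 1; }                        // one long per term: the doc file pointer

  static void encodeTerm(long[] longs, DataOutput out, long docFP, long lastDocFP,
                         long skipOffset, boolean absolute) throws IOException {
    longs[0] = docFP - (absolute ? 0 : lastDocFP);           // monotonic value -> longs[]
    if (skipOffset != -1) {
      out.writeLong(skipOffset);                             // optional value -> byte stream
    }
  }

  static long decodeTerm(long[] longs, DataInput in, long lastDocFP,
                         boolean hasSkip, boolean absolute) throws IOException {
    long docFP = (absolute ? 0 : lastDocFP) + longs[0];
    if (hasSkip) {
      in.readLong();                                         // skipOffset; real codecs infer its presence from docFreq
    }
    return docFP;
  }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    long[] longs = new long[setField()];
    encodeTerm(longs, new DataOutputStream(bytes), 4096, 4000, 777, false);
    DataInput in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
    System.out.println(decodeTerm(longs, in, 4000, true, false));  // prints 4096
  }
}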
|
|
@ -32,6 +32,7 @@ import org.apache.lucene.index.IndexFileNames;
|
|||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
|
@ -121,11 +122,6 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
long proxOffset;
|
||||
long skipOffset;
|
||||
|
||||
// Only used by the "primary" TermState -- clones don't
|
||||
// copy this (basically they are "transient"):
|
||||
ByteArrayDataInput bytesReader; // TODO: should this NOT be in the TermState...?
|
||||
byte[] bytes;
|
||||
|
||||
@Override
|
||||
public StandardTermState clone() {
|
||||
StandardTermState other = new StandardTermState();
|
||||
|
@ -140,11 +136,6 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
freqOffset = other.freqOffset;
|
||||
proxOffset = other.proxOffset;
|
||||
skipOffset = other.skipOffset;
|
||||
|
||||
// Do not copy bytes, bytesReader (else TermState is
|
||||
// very heavy, ie drags around the entire block's
|
||||
// byte[]). On seek back, if next() is in fact used
|
||||
// (rare!), they will be re-read from disk.
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -171,38 +162,18 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
}
|
||||
|
||||
/* Reads but does not decode the byte[] blob holding
|
||||
metadata for the current terms block */
|
||||
@Override
|
||||
public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
|
||||
final StandardTermState termState = (StandardTermState) _termState;
|
||||
|
||||
final int len = termsIn.readVInt();
|
||||
|
||||
// if (DEBUG) System.out.println(" SPR.readTermsBlock bytes=" + len + " ts=" + _termState);
|
||||
if (termState.bytes == null) {
|
||||
termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
|
||||
termState.bytesReader = new ByteArrayDataInput();
|
||||
} else if (termState.bytes.length < len) {
|
||||
termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
|
||||
}
|
||||
|
||||
termsIn.readBytes(termState.bytes, 0, len);
|
||||
termState.bytesReader.reset(termState.bytes, 0, len);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState)
|
||||
public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute)
|
||||
throws IOException {
|
||||
final StandardTermState termState = (StandardTermState) _termState;
|
||||
// if (DEBUG) System.out.println("SPR: nextTerm seg=" + segment + " tbOrd=" + termState.termBlockOrd + " bytesReader.fp=" + termState.bytesReader.getPosition());
|
||||
final boolean isFirstTerm = termState.termBlockOrd == 0;
|
||||
|
||||
if (isFirstTerm) {
|
||||
termState.freqOffset = termState.bytesReader.readVLong();
|
||||
} else {
|
||||
termState.freqOffset += termState.bytesReader.readVLong();
|
||||
if (absolute) {
|
||||
termState.freqOffset = 0;
|
||||
termState.proxOffset = 0;
|
||||
}
|
||||
|
||||
termState.freqOffset += in.readVLong();
|
||||
/*
|
||||
if (DEBUG) {
|
||||
System.out.println(" dF=" + termState.docFreq);
|
||||
|
@ -212,7 +183,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
assert termState.freqOffset < freqIn.length();
|
||||
|
||||
if (termState.docFreq >= skipMinimum) {
|
||||
termState.skipOffset = termState.bytesReader.readVLong();
|
||||
termState.skipOffset = in.readVLong();
|
||||
// if (DEBUG) System.out.println(" skipOffset=" + termState.skipOffset + " vs freqIn.length=" + freqIn.length());
|
||||
assert termState.freqOffset + termState.skipOffset < freqIn.length();
|
||||
} else {
|
||||
|
@ -220,11 +191,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
|
||||
if (isFirstTerm) {
|
||||
termState.proxOffset = termState.bytesReader.readVLong();
|
||||
} else {
|
||||
termState.proxOffset += termState.bytesReader.readVLong();
|
||||
}
|
||||
termState.proxOffset += in.readVLong();
|
||||
// if (DEBUG) System.out.println(" proxFP=" + termState.proxOffset);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -152,11 +152,6 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
|
|||
// freq is always implicitly totalTermFreq in this case.
|
||||
int singletonDocID;
|
||||
|
||||
// Only used by the "primary" TermState -- clones don't
|
||||
// copy this (basically they are "transient"):
|
||||
ByteArrayDataInput bytesReader; // TODO: should this NOT be in the TermState...?
|
||||
byte[] bytes;
|
||||
|
||||
@Override
|
||||
public IntBlockTermState clone() {
|
||||
IntBlockTermState other = new IntBlockTermState();
|
||||
|
@ -174,11 +169,6 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
|
|||
lastPosBlockOffset = other.lastPosBlockOffset;
|
||||
skipOffset = other.skipOffset;
|
||||
singletonDocID = other.singletonDocID;
|
||||
|
||||
// Do not copy bytes, bytesReader (else TermState is
|
||||
// very heavy, ie drags around the entire block's
|
||||
// byte[]). On seek back, if next() is in fact used
|
||||
// (rare!), they will be re-read from disk.
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -197,78 +187,37 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
|
|||
IOUtils.close(docIn, posIn, payIn);
|
||||
}
|
||||
|
||||
/* Reads but does not decode the byte[] blob holding
|
||||
metadata for the current terms block */
|
||||
@Override
|
||||
public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
|
||||
final IntBlockTermState termState = (IntBlockTermState) _termState;
|
||||
|
||||
final int numBytes = termsIn.readVInt();
|
||||
|
||||
if (termState.bytes == null) {
|
||||
termState.bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
|
||||
termState.bytesReader = new ByteArrayDataInput();
|
||||
} else if (termState.bytes.length < numBytes) {
|
||||
termState.bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
|
||||
}
|
||||
|
||||
termsIn.readBytes(termState.bytes, 0, numBytes);
|
||||
termState.bytesReader.reset(termState.bytes, 0, numBytes);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState)
|
||||
public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute)
|
||||
throws IOException {
|
||||
final IntBlockTermState termState = (IntBlockTermState) _termState;
|
||||
final boolean isFirstTerm = termState.termBlockOrd == 0;
|
||||
final boolean fieldHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
final boolean fieldHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
final boolean fieldHasPayloads = fieldInfo.hasPayloads();
|
||||
|
||||
final DataInput in = termState.bytesReader;
|
||||
if (isFirstTerm) {
|
||||
if (termState.docFreq == 1) {
|
||||
termState.singletonDocID = in.readVInt();
|
||||
termState.docStartFP = 0;
|
||||
} else {
|
||||
termState.singletonDocID = -1;
|
||||
termState.docStartFP = in.readVLong();
|
||||
}
|
||||
if (fieldHasPositions) {
|
||||
termState.posStartFP = in.readVLong();
|
||||
if (termState.totalTermFreq > BLOCK_SIZE) {
|
||||
termState.lastPosBlockOffset = in.readVLong();
|
||||
} else {
|
||||
termState.lastPosBlockOffset = -1;
|
||||
}
|
||||
if ((fieldHasPayloads || fieldHasOffsets) && termState.totalTermFreq >= BLOCK_SIZE) {
|
||||
termState.payStartFP = in.readVLong();
|
||||
} else {
|
||||
termState.payStartFP = -1;
|
||||
}
|
||||
// nocommit: use old version
|
||||
if (absolute) {
|
||||
termState.docStartFP = 0;
|
||||
termState.posStartFP = 0;
|
||||
termState.payStartFP = 0;
|
||||
}
|
||||
termState.docStartFP += longs[0];
|
||||
if (fieldHasPositions) {
|
||||
termState.posStartFP += longs[1];
|
||||
if (fieldHasOffsets || fieldHasPayloads) {
|
||||
termState.payStartFP += longs[2];
|
||||
}
|
||||
}
|
||||
if (termState.docFreq == 1) {
|
||||
termState.singletonDocID = in.readVInt();
|
||||
} else {
|
||||
if (termState.docFreq == 1) {
|
||||
termState.singletonDocID = in.readVInt();
|
||||
termState.singletonDocID = -1;
|
||||
}
|
||||
if (fieldHasPositions) {
|
||||
if (termState.totalTermFreq > BLOCK_SIZE) {
|
||||
termState.lastPosBlockOffset = in.readVLong();
|
||||
} else {
|
||||
termState.singletonDocID = -1;
|
||||
termState.docStartFP += in.readVLong();
|
||||
}
|
||||
if (fieldHasPositions) {
|
||||
termState.posStartFP += in.readVLong();
|
||||
if (termState.totalTermFreq > BLOCK_SIZE) {
|
||||
termState.lastPosBlockOffset = in.readVLong();
|
||||
} else {
|
||||
termState.lastPosBlockOffset = -1;
|
||||
}
|
||||
if ((fieldHasPayloads || fieldHasOffsets) && termState.totalTermFreq >= BLOCK_SIZE) {
|
||||
long delta = in.readVLong();
|
||||
if (termState.payStartFP == -1) {
|
||||
termState.payStartFP = delta;
|
||||
} else {
|
||||
termState.payStartFP += delta;
|
||||
}
|
||||
}
|
||||
termState.lastPosBlockOffset = -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -25,14 +25,15 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.TermStats;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
@ -71,7 +72,8 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
|
|||
final IndexOutput posOut;
|
||||
final IndexOutput payOut;
|
||||
|
||||
private IndexOutput termsOut;
|
||||
final static IntBlockTermState emptyState = new IntBlockTermState();
|
||||
IntBlockTermState lastState;
|
||||
|
||||
// How current field indexes postings:
|
||||
private boolean fieldHasFreqs;
|
||||
|
@ -79,7 +81,7 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
|
|||
private boolean fieldHasOffsets;
|
||||
private boolean fieldHasPayloads;
|
||||
|
||||
// Holds starting file pointers for each term:
|
||||
// Holds starting file pointers for current term:
|
||||
private long docTermStartFP;
|
||||
private long posTermStartFP;
|
||||
private long payTermStartFP;
|
||||
|
@ -188,21 +190,50 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
|
|||
this(state, PackedInts.COMPACT);
|
||||
}
|
||||
|
||||
private final static class IntBlockTermState extends BlockTermState {
|
||||
long docTermStartFP = 0;
|
||||
long posTermStartFP = 0;
|
||||
long payTermStartFP = 0;
|
||||
long skipOffset = -1;
|
||||
long lastPosBlockOffset = -1;
|
||||
int singletonDocID = -1;
|
||||
@Override
|
||||
public String toString() {
|
||||
return super.toString() + " docStartFP=" + docTermStartFP + " posStartFP=" + posTermStartFP + " payStartFP=" + payTermStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void start(IndexOutput termsOut) throws IOException {
|
||||
this.termsOut = termsOut;
|
||||
public IntBlockTermState newTermState() {
|
||||
return new IntBlockTermState();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void init(IndexOutput termsOut) throws IOException {
|
||||
CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
|
||||
termsOut.writeVInt(BLOCK_SIZE);
|
||||
}
|
||||
|
||||
// nocommit better name?
|
||||
|
||||
@Override
|
||||
public void setField(FieldInfo fieldInfo) {
|
||||
public int setField(FieldInfo fieldInfo) {
|
||||
IndexOptions indexOptions = fieldInfo.getIndexOptions();
|
||||
fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||
fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
fieldHasPayloads = fieldInfo.hasPayloads();
|
||||
skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
|
||||
lastState = emptyState;
|
||||
if (fieldHasPositions) {
|
||||
if (fieldHasPayloads || fieldHasOffsets) {
|
||||
return 3; // doc + pos + pay FP
|
||||
} else {
|
||||
return 2; // doc + pos FP
|
||||
}
|
||||
} else {
|
||||
return 1; // doc FP
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -348,37 +379,18 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
|
|||
}
|
||||
}
|
||||
|
||||
private static class PendingTerm {
|
||||
public final long docStartFP;
|
||||
public final long posStartFP;
|
||||
public final long payStartFP;
|
||||
public final long skipOffset;
|
||||
public final long lastPosBlockOffset;
|
||||
public final int singletonDocID;
|
||||
|
||||
public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset, int singletonDocID) {
|
||||
this.docStartFP = docStartFP;
|
||||
this.posStartFP = posStartFP;
|
||||
this.payStartFP = payStartFP;
|
||||
this.skipOffset = skipOffset;
|
||||
this.lastPosBlockOffset = lastPosBlockOffset;
|
||||
this.singletonDocID = singletonDocID;
|
||||
}
|
||||
}
|
||||
|
||||
private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
|
||||
|
||||
/** Called when we are done adding docs to this term */
|
||||
@Override
|
||||
public void finishTerm(TermStats stats) throws IOException {
|
||||
assert stats.docFreq > 0;
|
||||
public void finishTerm(BlockTermState _state) throws IOException {
|
||||
IntBlockTermState state = (IntBlockTermState) _state;
|
||||
assert state.docFreq > 0;
|
||||
|
||||
// TODO: wasteful we are counting this (counting # docs
|
||||
// for this term) in two places?
|
||||
assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount;
|
||||
assert state.docFreq == docCount: state.docFreq + " vs " + docCount;
|
||||
|
||||
// if (DEBUG) {
|
||||
// System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
|
||||
// System.out.println("FPW.finishTerm docFreq=" + state.docFreq);
|
||||
// }
|
||||
|
||||
// if (DEBUG) {
|
||||
|
@ -389,7 +401,7 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
|
|||
|
||||
// docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to it.
|
||||
final int singletonDocID;
|
||||
if (stats.docFreq == 1) {
|
||||
if (state.docFreq == 1) {
|
||||
// pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq
|
||||
singletonDocID = docDeltaBuffer[0];
|
||||
} else {
|
||||
|
@ -420,8 +432,8 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
|
|||
|
||||
    // totalTermFreq is just the total number of positions (or payloads, or offsets)
|
||||
// associated with current term.
|
||||
assert stats.totalTermFreq != -1;
|
||||
if (stats.totalTermFreq > BLOCK_SIZE) {
|
||||
assert state.totalTermFreq != -1;
|
||||
if (state.totalTermFreq > BLOCK_SIZE) {
|
||||
// record file offset for last pos in last block
|
||||
lastPosBlockOffset = posOut.getFilePointer() - posTermStartFP;
|
||||
} else {
|
||||
|
@ -486,7 +498,7 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
|
|||
}
|
||||
}
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" totalTermFreq=" + stats.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
|
||||
// System.out.println(" totalTermFreq=" + state.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
|
||||
// }
|
||||
} else {
|
||||
lastPosBlockOffset = -1;
|
||||
|
@ -505,76 +517,48 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
|
|||
// System.out.println(" no skip: docCount=" + docCount);
|
||||
// }
|
||||
}
|
||||
|
||||
long payStartFP;
|
||||
if (stats.totalTermFreq >= BLOCK_SIZE) {
|
||||
payStartFP = payTermStartFP;
|
||||
} else {
|
||||
payStartFP = -1;
|
||||
}
|
||||
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" payStartFP=" + payStartFP);
|
||||
// }
|
||||
|
||||
pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset, singletonDocID));
|
||||
state.docTermStartFP = docTermStartFP;
|
||||
state.posTermStartFP = posTermStartFP;
|
||||
state.payTermStartFP = payTermStartFP;
|
||||
state.singletonDocID = singletonDocID;
|
||||
state.skipOffset = skipOffset;
|
||||
state.lastPosBlockOffset = lastPosBlockOffset;
|
||||
docBufferUpto = 0;
|
||||
posBufferUpto = 0;
|
||||
lastDocID = 0;
|
||||
docCount = 0;
|
||||
}
|
||||
|
||||
private final RAMOutputStream bytesWriter = new RAMOutputStream();
|
||||
|
||||
// nocommit explain about the "don't care" values
|
||||
|
||||
@Override
|
||||
public void flushTermsBlock(int start, int count) throws IOException {
|
||||
|
||||
if (count == 0) {
|
||||
termsOut.writeByte((byte) 0);
|
||||
return;
|
||||
public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
|
||||
IntBlockTermState state = (IntBlockTermState)_state;
|
||||
if (absolute) {
|
||||
lastState = emptyState;
|
||||
}
|
||||
|
||||
assert start <= pendingTerms.size();
|
||||
assert count <= start;
|
||||
|
||||
final int limit = pendingTerms.size() - start + count;
|
||||
|
||||
long lastDocStartFP = 0;
|
||||
long lastPosStartFP = 0;
|
||||
long lastPayStartFP = 0;
|
||||
for(int idx=limit-count; idx<limit; idx++) {
|
||||
PendingTerm term = pendingTerms.get(idx);
|
||||
|
||||
if (term.singletonDocID == -1) {
|
||||
bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
|
||||
lastDocStartFP = term.docStartFP;
|
||||
} else {
|
||||
bytesWriter.writeVInt(term.singletonDocID);
|
||||
}
|
||||
|
||||
if (fieldHasPositions) {
|
||||
bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
|
||||
lastPosStartFP = term.posStartFP;
|
||||
if (term.lastPosBlockOffset != -1) {
|
||||
bytesWriter.writeVLong(term.lastPosBlockOffset);
|
||||
}
|
||||
if ((fieldHasPayloads || fieldHasOffsets) && term.payStartFP != -1) {
|
||||
bytesWriter.writeVLong(term.payStartFP - lastPayStartFP);
|
||||
lastPayStartFP = term.payStartFP;
|
||||
}
|
||||
}
|
||||
|
||||
if (term.skipOffset != -1) {
|
||||
bytesWriter.writeVLong(term.skipOffset);
|
||||
longs[0] = state.docTermStartFP - lastState.docTermStartFP;
|
||||
if (fieldHasPositions) {
|
||||
longs[1] = state.posTermStartFP - lastState.posTermStartFP;
|
||||
if (fieldHasPayloads || fieldHasOffsets) {
|
||||
longs[2] = state.payTermStartFP - lastState.payTermStartFP;
|
||||
}
|
||||
}
|
||||
|
||||
termsOut.writeVInt((int) bytesWriter.getFilePointer());
|
||||
bytesWriter.writeTo(termsOut);
|
||||
bytesWriter.reset();
|
||||
|
||||
// Remove the terms we just wrote:
|
||||
pendingTerms.subList(limit-count, limit).clear();
|
||||
if (state.singletonDocID != -1) {
|
||||
out.writeVInt(state.singletonDocID);
|
||||
}
|
||||
if (fieldHasPositions) {
|
||||
if (state.lastPosBlockOffset != -1) {
|
||||
out.writeVLong(state.lastPosBlockOffset);
|
||||
}
|
||||
}
|
||||
if (state.skipOffset != -1) {
|
||||
out.writeVLong(state.skipOffset);
|
||||
}
|
||||
lastState = state;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
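For this codec the mapping is concrete: longs[0] carries the doc file pointer, longs[1] the positions file pointer and longs[2] the payload/offset file pointer, each delta-coded against the previous term (lastState), while singletonDocID, lastPosBlockOffset and skipOffset go to the byte stream. A small worked example of the longs[] deltas across a block, under assumed file-pointer values:

// Assumed numbers only; longsSize == 3 (field with positions and payloads/offsets).
final class Lucene41DeltaExample {
  public static void main(String[] args) {
    long[][] termFPs = {               // {docStartFP, posStartFP, payStartFP} per term
        {1000, 5000, 9000},
        {1100, 5030, 9000},
        {1250, 5100, 9040},
    };
    long[] last = new long[3];         // emptyState: term 0 comes out as absolute values
    for (long[] fp : termFPs) {
      long[] longs = new long[3];
      for (int i = 0; i < 3; i++) {
        longs[i] = fp[i] - last[i];    // what encodeTerm() writes for this term
      }
      last = fp;                       // lastState = state
      System.out.println(java.util.Arrays.toString(longs));
    }
    // prints [1000, 5000, 9000], [100, 30, 0], [150, 70, 40]
  }
}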
|
|
@ -24,6 +24,7 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.TermStats;
|
||||
|
@ -33,6 +34,7 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
|
|||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -67,7 +69,6 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
*/
|
||||
final int maxSkipLevels = 10;
|
||||
final int totalNumDocs;
|
||||
IndexOutput termsOut;
|
||||
|
||||
IndexOptions indexOptions;
|
||||
boolean storePayloads;
|
||||
|
@ -81,6 +82,9 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
int lastPosition;
|
||||
int lastOffset;
|
||||
|
||||
final static StandardTermState emptyState = new StandardTermState();
|
||||
StandardTermState lastState;
|
||||
|
||||
// private String segment;
|
||||
|
||||
/** Creates a {@link Lucene40PostingsWriter}, with the
|
||||
|
@ -134,14 +138,19 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void start(IndexOutput termsOut) throws IOException {
|
||||
this.termsOut = termsOut;
|
||||
public void init(IndexOutput termsOut) throws IOException {
|
||||
CodecUtil.writeHeader(termsOut, Lucene40PostingsReader.TERMS_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
|
||||
termsOut.writeInt(skipInterval); // write skipInterval
|
||||
termsOut.writeInt(maxSkipLevels); // write maxSkipLevels
|
||||
termsOut.writeInt(skipMinimum); // write skipMinimum
|
||||
}
|
||||
|
||||
@Override
|
||||
public BlockTermState newTermState() {
|
||||
return new StandardTermState();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void startTerm() {
|
||||
freqStart = freqOut.getFilePointer();
|
||||
|
@ -159,7 +168,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
// Currently, this instance is re-used across fields, so
|
||||
// our parent calls setField whenever the field changes
|
||||
@Override
|
||||
public void setField(FieldInfo fieldInfo) {
|
||||
public int setField(FieldInfo fieldInfo) {
|
||||
//System.out.println("SPW: setField");
|
||||
/*
|
||||
if (BlockTreeTermsWriter.DEBUG && fieldInfo.name.equals("id")) {
|
||||
|
@ -173,8 +182,10 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
|
||||
storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
storePayloads = fieldInfo.hasPayloads();
|
||||
lastState = emptyState;
|
||||
//System.out.println(" set init blockFreqStart=" + freqStart);
|
||||
//System.out.println(" set init blockProxStart=" + proxStart);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int lastDocID;
|
||||
|
@ -265,94 +276,48 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
public void finishDoc() {
|
||||
}
|
||||
|
||||
private static class PendingTerm {
|
||||
public final long freqStart;
|
||||
public final long proxStart;
|
||||
public final long skipOffset;
|
||||
|
||||
public PendingTerm(long freqStart, long proxStart, long skipOffset) {
|
||||
this.freqStart = freqStart;
|
||||
this.proxStart = proxStart;
|
||||
this.skipOffset = skipOffset;
|
||||
}
|
||||
private static class StandardTermState extends BlockTermState {
|
||||
public long freqStart;
|
||||
public long proxStart;
|
||||
public long skipOffset;
|
||||
}
|
||||
|
||||
private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
|
||||
|
||||
/** Called when we are done adding docs to this term */
|
||||
@Override
|
||||
public void finishTerm(TermStats stats) throws IOException {
|
||||
|
||||
public void finishTerm(BlockTermState _state) throws IOException {
|
||||
StandardTermState state = (StandardTermState)_state;
|
||||
// if (DEBUG) System.out.println("SPW: finishTerm seg=" + segment + " freqStart=" + freqStart);
|
||||
assert stats.docFreq > 0;
|
||||
assert state.docFreq > 0;
|
||||
|
||||
// TODO: wasteful we are counting this (counting # docs
|
||||
// for this term) in two places?
|
||||
assert stats.docFreq == df;
|
||||
|
||||
final long skipOffset;
|
||||
assert state.docFreq == df;
|
||||
state.freqStart = freqStart;
|
||||
state.proxStart = proxStart;
|
||||
if (df >= skipMinimum) {
|
||||
skipOffset = skipListWriter.writeSkip(freqOut)-freqStart;
|
||||
state.skipOffset = skipListWriter.writeSkip(freqOut)-freqStart;
|
||||
} else {
|
||||
skipOffset = -1;
|
||||
state.skipOffset = -1;
|
||||
}
|
||||
|
||||
pendingTerms.add(new PendingTerm(freqStart, proxStart, skipOffset));
|
||||
|
||||
lastDocID = 0;
|
||||
df = 0;
|
||||
}
|
||||
|
||||
private final RAMOutputStream bytesWriter = new RAMOutputStream();
|
||||
|
||||
@Override
|
||||
public void flushTermsBlock(int start, int count) throws IOException {
|
||||
//if (DEBUG) System.out.println("SPW: flushTermsBlock start=" + start + " count=" + count + " left=" + (pendingTerms.size()-count) + " pendingTerms.size()=" + pendingTerms.size());
|
||||
|
||||
if (count == 0) {
|
||||
termsOut.writeByte((byte) 0);
|
||||
return;
|
||||
public void encodeTerm(long[] empty, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
|
||||
StandardTermState state = (StandardTermState)_state;
|
||||
if (absolute) {
|
||||
lastState = emptyState;
|
||||
}
|
||||
|
||||
assert start <= pendingTerms.size();
|
||||
assert count <= start;
|
||||
|
||||
final int limit = pendingTerms.size() - start + count;
|
||||
final PendingTerm firstTerm = pendingTerms.get(limit - count);
|
||||
// First term in block is abs coded:
|
||||
bytesWriter.writeVLong(firstTerm.freqStart);
|
||||
|
||||
if (firstTerm.skipOffset != -1) {
|
||||
assert firstTerm.skipOffset > 0;
|
||||
bytesWriter.writeVLong(firstTerm.skipOffset);
|
||||
out.writeVLong(state.freqStart - lastState.freqStart);
|
||||
if (state.skipOffset != -1) {
|
||||
assert state.skipOffset > 0;
|
||||
out.writeVLong(state.skipOffset);
|
||||
}
|
||||
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
|
||||
bytesWriter.writeVLong(firstTerm.proxStart);
|
||||
out.writeVLong(state.proxStart - lastState.proxStart);
|
||||
}
|
||||
long lastFreqStart = firstTerm.freqStart;
|
||||
long lastProxStart = firstTerm.proxStart;
|
||||
for(int idx=limit-count+1; idx<limit; idx++) {
|
||||
final PendingTerm term = pendingTerms.get(idx);
|
||||
//if (DEBUG) System.out.println(" write term freqStart=" + term.freqStart);
|
||||
// The rest of the terms term are delta coded:
|
||||
bytesWriter.writeVLong(term.freqStart - lastFreqStart);
|
||||
lastFreqStart = term.freqStart;
|
||||
if (term.skipOffset != -1) {
|
||||
assert term.skipOffset > 0;
|
||||
bytesWriter.writeVLong(term.skipOffset);
|
||||
}
|
||||
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
|
||||
bytesWriter.writeVLong(term.proxStart - lastProxStart);
|
||||
lastProxStart = term.proxStart;
|
||||
}
|
||||
}
|
||||
|
||||
termsOut.writeVInt((int) bytesWriter.getFilePointer());
|
||||
bytesWriter.writeTo(termsOut);
|
||||
bytesWriter.reset();
|
||||
|
||||
// Remove the terms we just wrote:
|
||||
pendingTerms.subList(limit-count, limit).clear();
|
||||
lastState = state;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -183,12 +183,37 @@ public final class MockRandomPostingsFormat extends PostingsFormat {
|
|||
if (LuceneTestCase.VERBOSE) {
|
||||
System.out.println("MockRandomCodec: writing pulsing postings with totTFCutoff=" + totTFCutoff);
|
||||
}
|
||||
postingsWriter = new PulsingPostingsWriter(totTFCutoff, postingsWriter);
|
||||
postingsWriter = new PulsingPostingsWriter(state, totTFCutoff, postingsWriter);
|
||||
}
|
||||
|
||||
final FieldsConsumer fields;
|
||||
final int t1 = random.nextInt(2);
|
||||
|
||||
if (random.nextBoolean()) {
|
||||
/*
|
||||
final int t1 = random.nextInt(4);
|
||||
if (t1 == 0) {
|
||||
boolean success = false;
|
||||
try {
|
||||
fields = new TempFSTTermsWriter(state, postingsWriter);
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
postingsWriter.close();
|
||||
}
|
||||
}
|
||||
} else if (t1 == 1) {
|
||||
boolean success = false;
|
||||
try {
|
||||
fields = new TempFSTOrdTermsWriter(state, postingsWriter);
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
postingsWriter.close();
|
||||
}
|
||||
}
|
||||
} else if (t1 == 2) {
|
||||
*/
|
||||
if (t1 == 0) {
|
||||
// Use BlockTree terms dict
|
||||
|
||||
if (LuceneTestCase.VERBOSE) {
|
||||
|
@ -322,12 +347,36 @@ public final class MockRandomPostingsFormat extends PostingsFormat {
|
|||
if (LuceneTestCase.VERBOSE) {
|
||||
System.out.println("MockRandomCodec: reading pulsing postings with totTFCutoff=" + totTFCutoff);
|
||||
}
|
||||
postingsReader = new PulsingPostingsReader(postingsReader);
|
||||
postingsReader = new PulsingPostingsReader(state, postingsReader);
|
||||
}
|
||||
|
||||
final FieldsProducer fields;
|
||||
|
||||
if (random.nextBoolean()) {
|
||||
final int t1 = random.nextInt(2);
|
||||
/*
|
||||
final int t1 = random.nextInt(4);
|
||||
if (t1 == 0) {
|
||||
boolean success = false;
|
||||
try {
|
||||
fields = new TempFSTTermsReader(state, postingsReader);
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
postingsReader.close();
|
||||
}
|
||||
}
|
||||
} else if (t1 == 1) {
|
||||
boolean success = false;
|
||||
try {
|
||||
fields = new TempFSTOrdTermsReader(state, postingsReader);
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
postingsReader.close();
|
||||
}
|
||||
}
|
||||
} else if (t1 == 2) {
|
||||
*/
|
||||
if (t1 == 0) {
|
||||
// Use BlockTree terms dict
|
||||
if (LuceneTestCase.VERBOSE) {
|
||||
System.out.println("MockRandomCodec: reading BlockTree terms dict");
|
||||
|
|
|
@ -57,8 +57,8 @@ public final class NestedPulsingPostingsFormat extends PostingsFormat {
|
|||
try {
|
||||
docsWriter = new Lucene41PostingsWriter(state);
|
||||
|
||||
pulsingWriterInner = new PulsingPostingsWriter(2, docsWriter);
|
||||
pulsingWriter = new PulsingPostingsWriter(1, pulsingWriterInner);
|
||||
pulsingWriterInner = new PulsingPostingsWriter(state, 2, docsWriter);
|
||||
pulsingWriter = new PulsingPostingsWriter(state, 1, pulsingWriterInner);
|
||||
FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter,
|
||||
BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
|
||||
success = true;
|
||||
|
@ -78,8 +78,8 @@ public final class NestedPulsingPostingsFormat extends PostingsFormat {
|
|||
boolean success = false;
|
||||
try {
|
||||
docsReader = new Lucene41PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
|
||||
pulsingReaderInner = new PulsingPostingsReader(docsReader);
|
||||
pulsingReader = new PulsingPostingsReader(pulsingReaderInner);
|
||||
pulsingReaderInner = new PulsingPostingsReader(state, docsReader);
|
||||
pulsingReader = new PulsingPostingsReader(state, pulsingReaderInner);
|
||||
FieldsProducer ret = new BlockTreeTermsReader(
|
||||
state.directory, state.fieldInfos, state.segmentInfo,
|
||||
pulsingReader,
|
||||
|
|
|
@ -169,7 +169,7 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
|
|||
final PostingsFormat format;
|
||||
if ("random".equals(TEST_POSTINGSFORMAT)) {
|
||||
format = PostingsFormat.forName("Lucene41");
|
||||
} else if ("TempRandom".equals(TEST_POSTINGSFORMAT)) {
|
||||
} else if ("MockRandom".equals(TEST_POSTINGSFORMAT)) {
|
||||
format = new MockRandomPostingsFormat(new Random(random.nextLong()));
|
||||
} else {
|
||||
format = PostingsFormat.forName(TEST_POSTINGSFORMAT);
|
||||
|
|