LUCENE-5029: remove block based API from PBF

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1493494 13f79535-47bb-0310-9956-ffa450edef68
Han Jiang 2013-06-16 10:53:51 +00:00
parent 13df813541
commit 9a1ae3fe4a
7 changed files with 185 additions and 173 deletions

TempPostingsReaderBase.java

@@ -24,6 +24,7 @@ import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.DataInput;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.codecs.temp.TempTermState;
@@ -56,7 +57,11 @@ public abstract class TempPostingsReaderBase implements Closeable {
   public abstract TempTermState newTermState() throws IOException;

   /** Actually decode metadata for next term */
-  public abstract void nextTerm(FieldInfo fieldInfo, TempTermState state) throws IOException;
+  // nocommit: remove the 'fieldInfo'? I suppose for a given postings reader, this should be fixed?
+  public abstract void nextTerm(long[] longs, DataInput in, FieldInfo fieldInfo, TempTermState state) throws IOException;
+
+  /** Return the fixed length of longs */
+  public abstract int longsSize(FieldInfo fieldInfo);

   /** Must fully consume state, since after this call that
    *  TermState may be reused. */
@@ -69,9 +74,4 @@ public abstract class TempPostingsReaderBase implements Closeable {
   @Override
   public abstract void close() throws IOException;
-
-  /** Reads data for all terms in the next block; this
-   *  method should merely load the byte[] blob but not
-   *  decode, which is done in {@link #nextTerm}. */
-  public abstract void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, TempTermState termState) throws IOException;
 }
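
Note: the sketch below is not part of this commit; it is a hypothetical minimal implementation of the two new hooks, just to make the contract concrete. The terms dictionary delta-decodes the monotonic values into longs[] before calling nextTerm, and positions 'in' on this term's leftover bytes. SimplePostingsReader and SimpleTermState (a TempTermState subclass assumed to carry docStartFP/singletonDocID fields) are invented names; the class is left abstract so the remaining TempPostingsReaderBase methods need not be shown.

  abstract class SimplePostingsReader extends TempPostingsReaderBase {
    @Override
    public int longsSize(FieldInfo fieldInfo) {
      return 1;  // this codec keeps exactly one monotonic value per term
    }

    @Override
    public void nextTerm(long[] longs, DataInput in, FieldInfo fieldInfo, TempTermState state) throws IOException {
      SimpleTermState termState = (SimpleTermState) state;
      termState.docStartFP = longs[0];  // already absolute: the caller summed the deltas
      if (termState.docFreq == 1) {
        termState.singletonDocID = in.readVInt();  // non-monotonic leftovers live in the byte blob
      } else {
        termState.singletonDocID = -1;
      }
    }
  }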

TempPostingsWriterBase.java

@@ -20,6 +20,7 @@ package org.apache.lucene.codecs;
 import java.io.IOException;
 import java.io.Closeable;

+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.index.FieldInfo;
@@ -55,15 +56,12 @@ public abstract class TempPostingsWriterBase extends PostingsConsumer implements Closeable {
    *  document. */
   public abstract void startTerm() throws IOException;

-  /** Flush count terms starting at start "backwards", as a
-   *  block. start is a negative offset from the end of the
-   *  terms stack, ie bigger start means further back in
-   *  the stack. */
-  public abstract void flushTermsBlock(int start, int count) throws IOException;
-
   /** Finishes the current term. The provided {@link
    *  TermStats} contains the term's summary statistics. */
-  public abstract void finishTerm(TermStats stats) throws IOException;
+  public abstract void finishTerm(long[] longs, DataOutput out, TermStats stats) throws IOException;
+
+  /** Return the fixed length of longs */
+  public abstract int longsSize(FieldInfo fieldInfo);

   /** Called when the writing switches to another field. */
   public abstract void setField(FieldInfo fieldInfo);
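
Note: the writer side mirrors the reader contract. A hypothetical sketch (again not this commit's code; field names are invented, and the class is left abstract to omit the remaining methods): finishTerm fills longs[] with the monotonic values, which the terms dictionary later delta-encodes across the block, and writes everything else to 'out'.

  abstract class SimplePostingsWriter extends TempPostingsWriterBase {
    private long docStartFP;     // hypothetical: tracked while writing the current term
    private int singletonDocID;  // hypothetical: the doc id when docFreq == 1, else -1

    @Override
    public int longsSize(FieldInfo fieldInfo) {
      return 1;  // must agree with the reader side
    }

    @Override
    public void finishTerm(long[] longs, DataOutput out, TermStats stats) throws IOException {
      longs[0] = docStartFP;            // monotonic: delta-encoded by the terms dictionary
      if (singletonDocID != -1) {
        out.writeVInt(singletonDocID);  // general bytes: stored verbatim per term
      }
    }
  }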

TempBlockTermsReader.java

@@ -26,6 +26,7 @@ import java.util.Comparator;
 import java.util.Iterator;
 import java.util.Locale;
 import java.util.TreeMap;
+import java.util.Arrays;

 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.DocsAndPositionsEnum;
@@ -622,6 +623,12 @@ public class TempBlockTermsReader extends FieldsProducer {
       final TempTermState termState;

+      // metadata buffer, holding monotonic values
+      public long[] longs;
+      // metadata buffer, holding general values
+      public byte[] bytes;
+      ByteArrayDataInput bytesReader;
+
       // Cumulative output so far
       BytesRef outputPrefix;
@@ -630,8 +637,9 @@ public class TempBlockTermsReader extends FieldsProducer {
       public Frame(int ord) throws IOException {
         this.ord = ord;
-        termState = postingsReader.newTermState();
-        termState.totalTermFreq = -1;
+        this.termState = postingsReader.newTermState();
+        this.termState.totalTermFreq = -1;
+        this.longs = new long[postingsReader.longsSize(fieldInfo)];
       }

       void loadNextFloorBlock() throws IOException {
@@ -730,7 +738,16 @@ public class TempBlockTermsReader extends FieldsProducer {
         termState.termBlockOrd = 0;
         nextEnt = 0;

-        postingsReader.readTermsBlock(in, fieldInfo, termState);
+        // metadata
+        numBytes = in.readVInt();
+        if (bytes == null) {
+          bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+          bytesReader = new ByteArrayDataInput();
+        } else if (bytes.length < numBytes) {
+          bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+        }
+        in.readBytes(bytes, 0, numBytes);
+        bytesReader.reset(bytes, 0, numBytes);

         if (!isLastInFloor) {
           // Sub-blocks of a single floor block are always
@@ -785,9 +802,10 @@ public class TempBlockTermsReader extends FieldsProducer {
       final int limit = getTermBlockOrd();
       assert limit > 0;

-      // We must set/incr state.termCount because
-      // postings impl can look at this
-      termState.termBlockOrd = metaDataUpto;
+      if (metaDataUpto == 0) {
+        Arrays.fill(longs, 0);
+      }
+      final int longSize = longs.length;

       // TODO: better API would be "jump straight to term=N"???
       while (metaDataUpto < limit) {
@@ -800,17 +818,21 @@ public class TempBlockTermsReader extends FieldsProducer {
         // TODO: if docFreq were bulk decoded we could
         // just skipN here:

+        // stats
         termState.docFreq = statsReader.readVInt();
         //if (DEBUG) System.out.println(" dF=" + state.docFreq);
         if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
           termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
           //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
         }
-        postingsReader.nextTerm(fieldInfo, termState);
+        // metadata
+        for (int i = 0; i < longSize; i++) {
+          longs[i] += bytesReader.readVLong();
+        }
+        postingsReader.nextTerm(longs, bytesReader, fieldInfo, termState);
         metaDataUpto++;
+        termState.termBlockOrd++;
       }
-      termState.termBlockOrd = metaDataUpto;
     }
   }
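
Note: a tiny worked example (hypothetical numbers) of the delta decoding above: if three consecutive terms in a block were written with docStartFP = 100, 250 and 250, the block stores the VLongs 100, 150 and 0, and the loop accumulates them back into longs[0] as 0 + 100 = 100, then 100 + 150 = 250, then 250 + 0 = 250. Because longs[] is zeroed only when metaDataUpto == 0, each call resumes the running sums where the previous call stopped.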
@@ -2300,10 +2322,17 @@ public class TempBlockTermsReader extends FieldsProducer {
       final TempTermState state;

+      // metadata buffer, holding monotonic values
+      public long[] longs;
+      // metadata buffer, holding general values
+      public byte[] bytes;
+      ByteArrayDataInput bytesReader;
+
       public Frame(int ord) throws IOException {
         this.ord = ord;
-        state = postingsReader.newTermState();
-        state.totalTermFreq = -1;
+        this.state = postingsReader.newTermState();
+        this.state.totalTermFreq = -1;
+        this.longs = new long[postingsReader.longsSize(fieldInfo)];
       }

       public void setFloorData(ByteArrayDataInput in, BytesRef source) {
@@ -2401,7 +2430,17 @@ public class TempBlockTermsReader extends FieldsProducer {
         // TODO: we could skip this if !hasTerms; but
         // that's rare so won't help much
-        postingsReader.readTermsBlock(in, fieldInfo, state);
+        // metadata
+        numBytes = in.readVInt();
+        if (bytes == null) {
+          bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+          bytesReader = new ByteArrayDataInput();
+        } else if (bytes.length < numBytes) {
+          bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+        }
+        in.readBytes(bytes, 0, numBytes);
+        bytesReader.reset(bytes, 0, numBytes);

         // Sub-blocks of a single floor block are always
         // written one after another -- tail recurse:
@@ -2587,9 +2626,10 @@ public class TempBlockTermsReader extends FieldsProducer {
       final int limit = getTermBlockOrd();
       assert limit > 0;

-      // We must set/incr state.termCount because
-      // postings impl can look at this
-      state.termBlockOrd = metaDataUpto;
+      if (metaDataUpto == 0) {
+        Arrays.fill(longs, 0);
+      }
+      final int longSize = longs.length;

       // TODO: better API would be "jump straight to term=N"???
       while (metaDataUpto < limit) {
@@ -2602,17 +2642,21 @@ public class TempBlockTermsReader extends FieldsProducer {
         // TODO: if docFreq were bulk decoded we could
         // just skipN here:

+        // stats
         state.docFreq = statsReader.readVInt();
         //if (DEBUG) System.out.println(" dF=" + state.docFreq);
         if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
           state.totalTermFreq = state.docFreq + statsReader.readVLong();
           //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
         }
-        postingsReader.nextTerm(fieldInfo, state);
+        // metadata
+        for (int i = 0; i < longSize; i++) {
+          longs[i] += bytesReader.readVLong();
+        }
+        postingsReader.nextTerm(longs, bytesReader, fieldInfo, state);
         metaDataUpto++;
+        state.termBlockOrd++;
       }
-      state.termBlockOrd = metaDataUpto;
     }

     // Used only by assert

TempBlockTermsWriter.java

@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.List;
+import java.util.Arrays;

 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.FieldInfo;
@@ -482,6 +483,15 @@ public class TempBlockTermsWriter extends FieldsConsumer {
     }
   }

+  private static final class PendingMetaData {
+    public long[] longs;
+    public RAMOutputStream bytesWriter;
+    public PendingMetaData(int length) {
+      longs = new long[length];
+      bytesWriter = new RAMOutputStream();
+    }
+  }
+
   final RAMOutputStream scratchBytes = new RAMOutputStream();

   class TermsWriter extends TermsConsumer {
@@ -936,8 +946,8 @@ public class TempBlockTermsWriter extends FieldsConsumer {
         bytesWriter2.writeTo(out);
         bytesWriter2.reset();

-        // Have postings writer write block
-        postingsWriter.flushTermsBlock(futureTermCount+termCount, termCount);
+        // Write term metadata block
+        flushTermsBlock(futureTermCount+termCount, termCount);

         // Remove slice replaced by block:
         slice.clear();
@@ -957,6 +967,46 @@ public class TempBlockTermsWriter extends FieldsConsumer {
       return new PendingBlock(prefix, startFP, termCount != 0, isFloor, floorLeadByte, subIndices);
     }

+    /** Flush count terms starting at start "backwards", as a
+     *  block. start is a negative offset from the end of the
+     *  terms stack, ie bigger start means further back in
+     *  the stack. */
+    void flushTermsBlock(int start, int count) throws IOException {
+      if (count == 0) {
+        out.writeByte((byte) 0);
+        return;
+      }
+
+      assert start <= pendingMetaData.size();
+      assert count <= start;
+
+      final int limit = pendingMetaData.size() - start + count;
+      final int size = postingsWriter.longsSize(fieldInfo);
+
+      long[] lastLongs = new long[size];
+      Arrays.fill(lastLongs, 0);
+
+      for(int idx=limit-count; idx<limit; idx++) {
+        PendingMetaData meta = pendingMetaData.get(idx);
+        for (int pos = 0; pos < size; pos++) {
+          if (meta.longs[pos] < 0) {
+            // nocommit: this -1 padding is implicit (maybe we need javadocs, or better,
+            // an API telling the PostingsBase: whenever you meet a 'don't care', just put -1 in it?)
+            meta.longs[pos] = lastLongs[pos];
+          }
+          bytesWriter3.writeVLong(meta.longs[pos] - lastLongs[pos]);
+        }
+        lastLongs = meta.longs;
+        meta.bytesWriter.writeTo(bytesWriter3);
+      }
+
+      out.writeVInt((int) bytesWriter3.getFilePointer());
+      bytesWriter3.writeTo(out);
+      bytesWriter3.reset();
+
+      // Remove the terms we just wrote:
+      pendingMetaData.subList(limit-count, limit).clear();
+    }
+
     TermsWriter(FieldInfo fieldInfo) {
       this.fieldInfo = fieldInfo;
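
Note: a worked example (hypothetical numbers) of the '-1 padding' handled above: suppose size == 3 and the previous term left lastLongs = {100, 400, 300}. A term with no payload data can leave meta.longs[2] == -1; the loop rewrites that slot to 300 and emits a delta of 0, so every per-slot sequence stays monotonic and the reader's running sum simply carries the old payStartFP forward.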
@@ -997,6 +1047,9 @@ public class TempBlockTermsWriter extends FieldsConsumer {
     private final IntsRef scratchIntsRef = new IntsRef();

+    private final List<PendingMetaData> pendingMetaData = new ArrayList<PendingMetaData>();
+    private final RAMOutputStream bytesWriter3 = new RAMOutputStream();
+
     @Override
     public void finishTerm(BytesRef text, TermStats stats) throws IOException {
@@ -1004,8 +1057,11 @@ public class TempBlockTermsWriter extends FieldsConsumer {
       //if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment + " df=" + stats.docFreq);

       blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput());
-      pending.add(new PendingTerm(BytesRef.deepCopyOf(text), stats));
-      postingsWriter.finishTerm(stats);
+      PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), stats);
+      PendingMetaData meta = new PendingMetaData(postingsWriter.longsSize(fieldInfo));
+      pending.add(term);
+      postingsWriter.finishTerm(meta.longs, meta.bytesWriter, stats);
+      pendingMetaData.add(meta);
       numTerms++;
     }

TempPostingsReader.java

@@ -199,78 +199,40 @@ public final class TempPostingsReader extends TempPostingsReaderBase {
     IOUtils.close(docIn, posIn, payIn);
   }

-  /* Reads but does not decode the byte[] blob holding
-     metadata for the current terms block */
   @Override
-  public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, TempTermState _termState) throws IOException {
-    final IntBlockTermState termState = (IntBlockTermState) _termState;
-    final int numBytes = termsIn.readVInt();
-
-    if (termState.bytes == null) {
-      termState.bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
-      termState.bytesReader = new ByteArrayDataInput();
-    } else if (termState.bytes.length < numBytes) {
-      termState.bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+  public int longsSize(FieldInfo fieldInfo) {
+    final boolean fieldHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+    if (fieldHasPositions) {
+      return 3;
+    } else {
+      return 1;
     }
-
-    termsIn.readBytes(termState.bytes, 0, numBytes);
-    termState.bytesReader.reset(termState.bytes, 0, numBytes);
   }

   @Override
-  public void nextTerm(FieldInfo fieldInfo, TempTermState _termState)
+  public void nextTerm(long[] longs, DataInput in, FieldInfo fieldInfo, TempTermState _termState)
     throws IOException {
     final IntBlockTermState termState = (IntBlockTermState) _termState;
-    final boolean isFirstTerm = termState.termBlockOrd == 0;
     final boolean fieldHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
     final boolean fieldHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
     final boolean fieldHasPayloads = fieldInfo.hasPayloads();

-    final DataInput in = termState.bytesReader;
-    if (isFirstTerm) {
-      if (termState.docFreq == 1) {
-        termState.singletonDocID = in.readVInt();
-        termState.docStartFP = 0;
-      } else {
-        termState.singletonDocID = -1;
-        termState.docStartFP = in.readVLong();
-      }
-      if (fieldHasPositions) {
-        termState.posStartFP = in.readVLong();
-        if (termState.totalTermFreq > BLOCK_SIZE) {
-          termState.lastPosBlockOffset = in.readVLong();
-        } else {
-          termState.lastPosBlockOffset = -1;
-        }
-        if ((fieldHasPayloads || fieldHasOffsets) && termState.totalTermFreq >= BLOCK_SIZE) {
-          termState.payStartFP = in.readVLong();
-        } else {
-          termState.payStartFP = -1;
-        }
-      }
-    } else {
-      if (termState.docFreq == 1) {
-        termState.singletonDocID = in.readVInt();
-      } else {
-        termState.singletonDocID = -1;
-        termState.docStartFP += in.readVLong();
-      }
-      if (fieldHasPositions) {
-        termState.posStartFP += in.readVLong();
-        if (termState.totalTermFreq > BLOCK_SIZE) {
-          termState.lastPosBlockOffset = in.readVLong();
-        } else {
-          termState.lastPosBlockOffset = -1;
-        }
-        if ((fieldHasPayloads || fieldHasOffsets) && termState.totalTermFreq >= BLOCK_SIZE) {
-          long delta = in.readVLong();
-          if (termState.payStartFP == -1) {
-            termState.payStartFP = delta;
-          } else {
-            termState.payStartFP += delta;
-          }
-        }
-      }
-    }
+    termState.docStartFP = longs[0];
+    if (fieldHasPositions) {
+      termState.posStartFP = longs[1];
+      termState.payStartFP = longs[2];
+    }
+    if (termState.docFreq == 1) {
+      termState.singletonDocID = in.readVInt();
+    } else {
+      termState.singletonDocID = -1;
+    }
+    if (fieldHasPositions) {
+      if (termState.totalTermFreq > BLOCK_SIZE) {
+        termState.lastPosBlockOffset = in.readVLong();
+      } else {
+        termState.lastPosBlockOffset = -1;
+      }
+    }
   }
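
Note: a summary of the resulting per-term metadata layout for this codec (derived from the reader above and the writer's finishTerm below; each blob field is written only when present):

  longs[] (monotonic, delta-encoded per block by the terms dictionary):
    longs[0] = docStartFP
    longs[1] = posStartFP   (only for fields with positions)
    longs[2] = payStartFP   (only for fields with positions)
  byte blob (general, stored verbatim per term):
    VInt  singletonDocID      (only when docFreq == 1)
    VLong lastPosBlockOffset  (only when totalTermFreq > BLOCK_SIZE)
    VLong skipOffset          (only when a skip list was written, i.e. skipOffset != -1)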

TempPostingsWriter.java

@@ -35,6 +35,7 @@ import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.ArrayUtil;
@@ -73,8 +74,6 @@ public final class TempPostingsWriter extends TempPostingsWriterBase {
   final IndexOutput posOut;
   final IndexOutput payOut;

-  private IndexOutput termsOut;
-
   // How current field indexes postings:
   private boolean fieldHasFreqs;
   private boolean fieldHasPositions;
@@ -192,7 +191,6 @@ public final class TempPostingsWriter extends TempPostingsWriterBase {
   @Override
   public void start(IndexOutput termsOut) throws IOException {
-    this.termsOut = termsOut;
     CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
     termsOut.writeVInt(BLOCK_SIZE);
   }
@@ -350,29 +348,17 @@ public final class TempPostingsWriter extends TempPostingsWriterBase {
     }
   }

-  private static class PendingTerm {
-    public final long docStartFP;
-    public final long posStartFP;
-    public final long payStartFP;
-    public final long skipOffset;
-    public final long lastPosBlockOffset;
-    public final int singletonDocID;
-
-    public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset, int singletonDocID) {
-      this.docStartFP = docStartFP;
-      this.posStartFP = posStartFP;
-      this.payStartFP = payStartFP;
-      this.skipOffset = skipOffset;
-      this.lastPosBlockOffset = lastPosBlockOffset;
-      this.singletonDocID = singletonDocID;
+  public int longsSize(FieldInfo info) {
+    if (fieldHasPositions) {
+      return 3;  // doc + pos + pay FP
+    } else {
+      return 1;  // docFP
     }
   }

-  private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
-
   /** Called when we are done adding docs to this term */
   @Override
-  public void finishTerm(TermStats stats) throws IOException {
+  public void finishTerm(long[] longs, DataOutput out, TermStats stats) throws IOException {
     assert stats.docFreq > 0;

     // TODO: wasteful we are counting this (counting # docs
@@ -514,71 +500,34 @@ public final class TempPostingsWriter extends TempPostingsWriterBase {
     } else {
       payStartFP = -1;
     }

     // if (DEBUG) {
     //   System.out.println(" payStartFP=" + payStartFP);
     // }

-    pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset, singletonDocID));
+    // write metadata
+    longs[0] = docTermStartFP;
+    if (fieldHasPositions) {
+      longs[1] = posTermStartFP;
+      longs[2] = payStartFP;
+    }
+    if (singletonDocID != -1) {
+      out.writeVInt(singletonDocID);
+    }
+    if (fieldHasPositions) {
+      if (lastPosBlockOffset != -1) {
+        out.writeVLong(lastPosBlockOffset);
+      }
+    }
+    if (skipOffset != -1) {
+      out.writeVLong(skipOffset);
+    }
+
     docBufferUpto = 0;
     posBufferUpto = 0;
     lastDocID = 0;
     docCount = 0;
   }

-  private final RAMOutputStream bytesWriter = new RAMOutputStream();
-
-  @Override
-  public void flushTermsBlock(int start, int count) throws IOException {
-    if (count == 0) {
-      termsOut.writeByte((byte) 0);
-      return;
-    }
-
-    assert start <= pendingTerms.size();
-    assert count <= start;
-
-    final int limit = pendingTerms.size() - start + count;
-
-    long lastDocStartFP = 0;
-    long lastPosStartFP = 0;
-    long lastPayStartFP = 0;
-    for(int idx=limit-count; idx<limit; idx++) {
-      PendingTerm term = pendingTerms.get(idx);
-
-      if (term.singletonDocID == -1) {
-        bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
-        lastDocStartFP = term.docStartFP;
-      } else {
-        bytesWriter.writeVInt(term.singletonDocID);
-      }
-
-      if (fieldHasPositions) {
-        bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
-        lastPosStartFP = term.posStartFP;
-        if (term.lastPosBlockOffset != -1) {
-          bytesWriter.writeVLong(term.lastPosBlockOffset);
-        }
-        if ((fieldHasPayloads || fieldHasOffsets) && term.payStartFP != -1) {
-          bytesWriter.writeVLong(term.payStartFP - lastPayStartFP);
-          lastPayStartFP = term.payStartFP;
-        }
-      }
-
-      if (term.skipOffset != -1) {
-        bytesWriter.writeVLong(term.skipOffset);
-      }
-    }
-
-    termsOut.writeVInt((int) bytesWriter.getFilePointer());
-    bytesWriter.writeTo(termsOut);
-    bytesWriter.reset();
-
-    // Remove the terms we just wrote:
-    pendingTerms.subList(limit-count, limit).clear();
-  }
-
   @Override
   public void close() throws IOException {
     IOUtils.close(docOut, posOut, payOut);

TempTermState.java

@@ -16,8 +16,11 @@ package org.apache.lucene.codecs.temp;
  *  limitations under the License.
  */

+import java.util.Arrays;
+
 import org.apache.lucene.index.DocsEnum; // javadocs
 import org.apache.lucene.index.TermState;
+import org.apache.lucene.store.ByteArrayDataInput;

 /**
  * Holds all state required for {@link PostingsReaderBase}