LUCENE-2722: fix sep codec to store less in the terms dict

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1029012 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2010-10-30 10:17:20 +00:00
parent 096538ed82
commit 7a5901805b
10 changed files with 168 additions and 98 deletions

View File

@ -167,6 +167,25 @@ public abstract class FixedIntBlockIndexInput extends IntIndexInput {
assert upto < blockSize;
}
@Override
public void read(final IntIndexInput.Reader indexIn, final boolean absolute) throws IOException {
if (absolute) {
fp = indexIn.readVLong();
upto = indexIn.next();
} else {
final long delta = indexIn.readVLong();
if (delta == 0) {
// same block
upto += indexIn.next();
} else {
// new block
fp += delta;
upto = indexIn.next();
}
}
assert upto < blockSize;
}
@Override
public void seek(final IntIndexInput.Reader other) throws IOException {
((Reader) other).seek(fp, upto);

View File

@ -83,11 +83,30 @@ public abstract class FixedIntBlockIndexOutput extends IntIndexOutput {
// same block
indexOut.writeVLong(0);
assert upto >= lastUpto;
indexOut.writeVLong(upto - lastUpto);
indexOut.writeVInt(upto - lastUpto);
} else {
// new block
indexOut.writeVLong(fp - lastFP);
indexOut.writeVLong(upto);
indexOut.writeVInt(upto);
}
lastUpto = upto;
lastFP = fp;
}
@Override
public void write(IntIndexOutput indexOut, boolean absolute) throws IOException {
if (absolute) {
indexOut.writeVLong(fp);
indexOut.write(upto);
} else if (fp == lastFP) {
// same block
indexOut.writeVLong(0);
assert upto >= lastUpto;
indexOut.write(upto - lastUpto);
} else {
// new block
indexOut.writeVLong(fp - lastFP);
indexOut.write(upto);
}
lastUpto = upto;
lastFP = fp;

View File

@ -188,6 +188,24 @@ public abstract class VariableIntBlockIndexInput extends IntIndexInput {
//assert upto < maxBlockSize: "upto=" + upto + " max=" + maxBlockSize;
}
@Override
public void read(final IntIndexInput.Reader indexIn, final boolean absolute) throws IOException {
if (absolute) {
fp = indexIn.readVLong();
upto = indexIn.next()&0xFF;
} else {
final long delta = indexIn.readVLong();
if (delta == 0) {
// same block
upto = indexIn.next()&0xFF;
} else {
// new block
fp += delta;
upto = indexIn.next()&0xFF;
}
}
}
@Override
public String toString() {
return "VarIntBlock.Index fp=" + fp + " upto=" + upto + " maxBlock=" + maxBlockSize;

View File

@ -103,6 +103,26 @@ public abstract class VariableIntBlockIndexOutput extends IntIndexOutput {
lastUpto = upto;
lastFP = fp;
}
@Override
public void write(IntIndexOutput indexOut, boolean absolute) throws IOException {
assert upto >= 0;
if (absolute) {
indexOut.writeVLong(fp);
indexOut.write(upto);
} else if (fp == lastFP) {
// same block
indexOut.writeVLong(0);
assert upto >= lastUpto;
indexOut.write(upto);
} else {
// new block
indexOut.writeVLong(fp - lastFP);
indexOut.write(upto);
}
lastUpto = upto;
lastFP = fp;
}
}
@Override

View File

@ -41,6 +41,8 @@ public abstract class IntIndexInput implements Closeable {
public abstract void read(IndexInput indexIn, boolean absolute) throws IOException;
public abstract void read(IntIndexInput.Reader indexIn, boolean absolute) throws IOException;
/** Seeks primary stream to the last read offset */
public abstract void seek(IntIndexInput.Reader stream) throws IOException;
@ -54,6 +56,18 @@ public abstract class IntIndexInput implements Closeable {
/** Reads next single int */
public abstract int next() throws IOException;
/** Encodes as 1 or 2 ints, and can only use 61 of the 64
* long bits. */
public long readVLong() throws IOException {
final int v = next();
if ((v & 1) == 0) {
return v >> 1;
} else {
final long v2 = next();
return (v2 << 30) | (v >> 1);
}
}
/** Reads next chunk of ints */
private IntsRef bulkResult;

View File

@ -34,9 +34,27 @@ import java.io.Closeable;
* @lucene.experimental */
public abstract class IntIndexOutput implements Closeable {
/** Write an int to the primary file */
/** Write an int to the primary file. The value must be
* >= 0. */
public abstract void write(int v) throws IOException;
public static final long MAX_SINGLE_INT_VLONG = Integer.MAX_VALUE - (1<<30);
public static final long MAX_VLONG = Long.MAX_VALUE - (1L<<62) - (1L<<61);
/** Encodes as 1 or 2 ints, and can only use 61 of the 64
* long bits. */
public void writeVLong(long v) throws IOException {
assert v >= 0: "v=" + v;
assert v < MAX_VLONG: "v=" + v;
// we cannot pass a negative int
if (v <= MAX_SINGLE_INT_VLONG) {
write(((int) v)<<1);
} else {
write(((int) ((v & MAX_SINGLE_INT_VLONG))<<1) | 1);
write(((int) (v >> 30)));
}
}
public abstract static class Index {
/** Internally records the current location */
@ -46,8 +64,10 @@ public abstract class IntIndexOutput implements Closeable {
public abstract void set(Index other) throws IOException;
/** Writes "location" of current output pointer of primary
* output to different output (out) */
* output to different output (out) */
public abstract void write(IndexOutput indexOut, boolean absolute) throws IOException;
public abstract void write(IntIndexOutput indexOut, boolean absolute) throws IOException;
}
/** If you are indexing the primary output file, call

View File

@ -130,21 +130,14 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
}
private static class SepTermState extends TermState {
// We store only the seek point to the docs file because
// the rest of the info (freqIndex, posIndex, etc.) is
// stored in the docs file:
IntIndexInput.Index docIndex;
IntIndexInput.Index freqIndex;
IntIndexInput.Index posIndex;
long skipOffset;
long payloadOffset;
public Object clone() {
SepTermState other = (SepTermState) super.clone();
other.docIndex = (IntIndexInput.Index) docIndex.clone();
if (freqIndex != null) {
other.freqIndex = (IntIndexInput.Index) freqIndex.clone();
}
if (posIndex != null) {
other.posIndex = (IntIndexInput.Index) posIndex.clone();
}
return other;
}
@ -152,22 +145,6 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
super.copy(_other);
SepTermState other = (SepTermState) _other;
docIndex.set(other.docIndex);
if (other.posIndex != null) {
if (posIndex == null) {
posIndex = (IntIndexInput.Index) other.posIndex.clone();
} else {
posIndex.set(other.posIndex);
}
}
if (other.freqIndex != null) {
if (freqIndex == null) {
freqIndex = (IntIndexInput.Index) other.freqIndex.clone();
} else {
freqIndex.set(other.freqIndex);
}
}
skipOffset = other.skipOffset;
payloadOffset = other.payloadOffset;
}
@Override
@ -184,39 +161,8 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
}
@Override
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState _termState, boolean isIndexTerm) throws IOException {
final SepTermState termState = (SepTermState) _termState;
// read freq index
if (!fieldInfo.omitTermFreqAndPositions) {
if (termState.freqIndex == null) {
assert isIndexTerm;
termState.freqIndex = freqIn.index();
termState.posIndex = posIn.index();
}
termState.freqIndex.read(termsIn, isIndexTerm);
}
// read doc index
termState.docIndex.read(termsIn, isIndexTerm);
// read skip index
if (isIndexTerm) {
termState.skipOffset = termsIn.readVLong();
} else if (termState.docFreq >= skipInterval) {
termState.skipOffset += termsIn.readVLong();
}
// read pos, payload index
if (!fieldInfo.omitTermFreqAndPositions) {
termState.posIndex.read(termsIn, isIndexTerm);
final long v = termsIn.readVLong();
if (isIndexTerm) {
termState.payloadOffset = v;
} else {
termState.payloadOffset += v;
}
}
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm) throws IOException {
((SepTermState) termState).docIndex.read(termsIn, isIndexTerm);
}
@Override
@ -311,14 +257,18 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
docIndex.set(termState.docIndex);
docIndex.seek(docReader);
skipOffset = termState.skipOffset;
if (!omitTF) {
freqIndex.set(termState.freqIndex);
freqIndex.read(docReader, true);
freqIndex.seek(freqReader);
posIndex.read(docReader, true);
// skip payload offset
docReader.readVLong();
} else {
freq = 1;
}
skipOffset = docReader.readVLong();
docFreq = termState.docFreq;
count = 0;
doc = 0;
@ -498,17 +448,15 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
docIndex.set(termState.docIndex);
docIndex.seek(docReader);
freqIndex.set(termState.freqIndex);
freqIndex.read(docReader, true);
freqIndex.seek(freqReader);
posIndex.set(termState.posIndex);
posIndex.read(docReader, true);
posSeekPending = true;
//posIndex.seek(posReader);
payloadPending = false;
skipOffset = termState.skipOffset;
payloadOffset = termState.payloadOffset;
//payloadIn.seek(payloadOffset);
payloadOffset = docReader.readVLong();
skipOffset = docReader.readVLong();
docFreq = termState.docFreq;
count = 0;

View File

@ -79,6 +79,7 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
long lastPayloadStart;
int lastDocID;
int df;
private boolean firstDoc;
public SepPostingsWriterImpl(SegmentWriteState state, IntStreamFactory factory) throws IOException {
super();
@ -147,6 +148,7 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
payloadStart = payloadOut.getFilePointer();
lastPayloadLength = -1;
}
firstDoc = true;
skipListWriter.resetSkip(docIndex, freqIndex, posIndex);
}
@ -169,6 +171,20 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
if (firstDoc) {
// TODO: we are writing absolute file pointers below,
// which is wasteful. It'd be better compression to
// write the "baseline" into each indexed term, then
// write only the delta here.
if (!omitTF) {
freqIndex.write(docOut, true);
posIndex.write(docOut, true);
docOut.writeVLong(payloadStart);
}
docOut.writeVLong(skipOut.getFilePointer());
firstDoc = false;
}
final int delta = docID - lastDocID;
if (docID < 0 || (df > 0 && delta <= 0)) {
@ -229,42 +245,16 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
@Override
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
long skipPos = skipOut.getFilePointer();
// TODO: -- wasteful we are counting this in two places?
assert docCount > 0;
assert docCount == df;
// TODO: -- only do this if once (consolidate the
// conditional things that are written)
if (!omitTF) {
freqIndex.write(termsOut, isIndexTerm);
}
docIndex.write(termsOut, isIndexTerm);
if (df >= skipInterval) {
skipListWriter.writeSkip(skipOut);
}
if (isIndexTerm) {
termsOut.writeVLong(skipPos);
lastSkipStart = skipPos;
} else if (df >= skipInterval) {
termsOut.writeVLong(skipPos-lastSkipStart);
lastSkipStart = skipPos;
}
if (!omitTF) {
posIndex.write(termsOut, isIndexTerm);
if (isIndexTerm) {
// Write absolute at seek points
termsOut.writeVLong(payloadStart);
} else {
termsOut.writeVLong(payloadStart-lastPayloadStart);
}
lastPayloadStart = payloadStart;
}
lastDocID = 0;
df = 0;
}

View File

@ -80,6 +80,16 @@ public class MockSingleIntIndexInput extends IntIndexInput {
}
}
@Override
public void read(IntIndexInput.Reader indexIn, boolean absolute)
throws IOException {
if (absolute) {
fp = indexIn.readVLong();
} else {
fp += indexIn.readVLong();
}
}
@Override
public void set(IntIndexInput.Index other) {
fp = ((Index) other).fp;

View File

@ -76,6 +76,18 @@ public class MockSingleIntIndexOutput extends IntIndexOutput {
}
lastFP = fp;
}
@Override
public void write(IntIndexOutput indexOut, boolean absolute)
throws IOException {
if (absolute) {
indexOut.writeVLong(fp);
} else {
indexOut.writeVLong(fp - lastFP);
}
lastFP = fp;
}
@Override
public String toString() {
return Long.toString(fp);