mirror of
https://github.com/apache/lucene.git
synced 2025-02-08 02:58:58 +00:00
LUCENE-2722: fix sep codec to store less in the terms dict
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1029012 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
096538ed82
commit
7a5901805b
@ -167,6 +167,25 @@ public abstract class FixedIntBlockIndexInput extends IntIndexInput {
|
||||
assert upto < blockSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void read(final IntIndexInput.Reader indexIn, final boolean absolute) throws IOException {
|
||||
if (absolute) {
|
||||
fp = indexIn.readVLong();
|
||||
upto = indexIn.next();
|
||||
} else {
|
||||
final long delta = indexIn.readVLong();
|
||||
if (delta == 0) {
|
||||
// same block
|
||||
upto += indexIn.next();
|
||||
} else {
|
||||
// new block
|
||||
fp += delta;
|
||||
upto = indexIn.next();
|
||||
}
|
||||
}
|
||||
assert upto < blockSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void seek(final IntIndexInput.Reader other) throws IOException {
|
||||
((Reader) other).seek(fp, upto);
|
||||
|
@ -83,11 +83,30 @@ public abstract class FixedIntBlockIndexOutput extends IntIndexOutput {
|
||||
// same block
|
||||
indexOut.writeVLong(0);
|
||||
assert upto >= lastUpto;
|
||||
indexOut.writeVLong(upto - lastUpto);
|
||||
indexOut.writeVInt(upto - lastUpto);
|
||||
} else {
|
||||
// new block
|
||||
indexOut.writeVLong(fp - lastFP);
|
||||
indexOut.writeVLong(upto);
|
||||
indexOut.writeVInt(upto);
|
||||
}
|
||||
lastUpto = upto;
|
||||
lastFP = fp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(IntIndexOutput indexOut, boolean absolute) throws IOException {
|
||||
if (absolute) {
|
||||
indexOut.writeVLong(fp);
|
||||
indexOut.write(upto);
|
||||
} else if (fp == lastFP) {
|
||||
// same block
|
||||
indexOut.writeVLong(0);
|
||||
assert upto >= lastUpto;
|
||||
indexOut.write(upto - lastUpto);
|
||||
} else {
|
||||
// new block
|
||||
indexOut.writeVLong(fp - lastFP);
|
||||
indexOut.write(upto);
|
||||
}
|
||||
lastUpto = upto;
|
||||
lastFP = fp;
|
||||
|
@ -188,6 +188,24 @@ public abstract class VariableIntBlockIndexInput extends IntIndexInput {
|
||||
//assert upto < maxBlockSize: "upto=" + upto + " max=" + maxBlockSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void read(final IntIndexInput.Reader indexIn, final boolean absolute) throws IOException {
|
||||
if (absolute) {
|
||||
fp = indexIn.readVLong();
|
||||
upto = indexIn.next()&0xFF;
|
||||
} else {
|
||||
final long delta = indexIn.readVLong();
|
||||
if (delta == 0) {
|
||||
// same block
|
||||
upto = indexIn.next()&0xFF;
|
||||
} else {
|
||||
// new block
|
||||
fp += delta;
|
||||
upto = indexIn.next()&0xFF;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "VarIntBlock.Index fp=" + fp + " upto=" + upto + " maxBlock=" + maxBlockSize;
|
||||
|
@ -103,6 +103,26 @@ public abstract class VariableIntBlockIndexOutput extends IntIndexOutput {
|
||||
lastUpto = upto;
|
||||
lastFP = fp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(IntIndexOutput indexOut, boolean absolute) throws IOException {
|
||||
assert upto >= 0;
|
||||
if (absolute) {
|
||||
indexOut.writeVLong(fp);
|
||||
indexOut.write(upto);
|
||||
} else if (fp == lastFP) {
|
||||
// same block
|
||||
indexOut.writeVLong(0);
|
||||
assert upto >= lastUpto;
|
||||
indexOut.write(upto);
|
||||
} else {
|
||||
// new block
|
||||
indexOut.writeVLong(fp - lastFP);
|
||||
indexOut.write(upto);
|
||||
}
|
||||
lastUpto = upto;
|
||||
lastFP = fp;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -41,6 +41,8 @@ public abstract class IntIndexInput implements Closeable {
|
||||
|
||||
public abstract void read(IndexInput indexIn, boolean absolute) throws IOException;
|
||||
|
||||
public abstract void read(IntIndexInput.Reader indexIn, boolean absolute) throws IOException;
|
||||
|
||||
/** Seeks primary stream to the last read offset */
|
||||
public abstract void seek(IntIndexInput.Reader stream) throws IOException;
|
||||
|
||||
@ -54,6 +56,18 @@ public abstract class IntIndexInput implements Closeable {
|
||||
/** Reads next single int */
|
||||
public abstract int next() throws IOException;
|
||||
|
||||
/** Encodes as 1 or 2 ints, and can only use 61 of the 64
|
||||
* long bits. */
|
||||
public long readVLong() throws IOException {
|
||||
final int v = next();
|
||||
if ((v & 1) == 0) {
|
||||
return v >> 1;
|
||||
} else {
|
||||
final long v2 = next();
|
||||
return (v2 << 30) | (v >> 1);
|
||||
}
|
||||
}
|
||||
|
||||
/** Reads next chunk of ints */
|
||||
private IntsRef bulkResult;
|
||||
|
||||
|
@ -34,9 +34,27 @@ import java.io.Closeable;
|
||||
* @lucene.experimental */
|
||||
public abstract class IntIndexOutput implements Closeable {
|
||||
|
||||
/** Write an int to the primary file */
|
||||
/** Write an int to the primary file. The value must be
|
||||
* >= 0. */
|
||||
public abstract void write(int v) throws IOException;
|
||||
|
||||
public static final long MAX_SINGLE_INT_VLONG = Integer.MAX_VALUE - (1<<30);
|
||||
public static final long MAX_VLONG = Long.MAX_VALUE - (1L<<62) - (1L<<61);
|
||||
|
||||
/** Encodes as 1 or 2 ints, and can only use 61 of the 64
|
||||
* long bits. */
|
||||
public void writeVLong(long v) throws IOException {
|
||||
assert v >= 0: "v=" + v;
|
||||
assert v < MAX_VLONG: "v=" + v;
|
||||
// we cannot pass a negative int
|
||||
if (v <= MAX_SINGLE_INT_VLONG) {
|
||||
write(((int) v)<<1);
|
||||
} else {
|
||||
write(((int) ((v & MAX_SINGLE_INT_VLONG))<<1) | 1);
|
||||
write(((int) (v >> 30)));
|
||||
}
|
||||
}
|
||||
|
||||
public abstract static class Index {
|
||||
|
||||
/** Internally records the current location */
|
||||
@ -46,8 +64,10 @@ public abstract class IntIndexOutput implements Closeable {
|
||||
public abstract void set(Index other) throws IOException;
|
||||
|
||||
/** Writes "location" of current output pointer of primary
|
||||
* output to different output (out) */
|
||||
* output to different output (out) */
|
||||
public abstract void write(IndexOutput indexOut, boolean absolute) throws IOException;
|
||||
|
||||
public abstract void write(IntIndexOutput indexOut, boolean absolute) throws IOException;
|
||||
}
|
||||
|
||||
/** If you are indexing the primary output file, call
|
||||
|
@ -130,21 +130,14 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
|
||||
}
|
||||
|
||||
private static class SepTermState extends TermState {
|
||||
// We store only the seek point to the docs file because
|
||||
// the rest of the info (freqIndex, posIndex, etc.) is
|
||||
// stored in the docs file:
|
||||
IntIndexInput.Index docIndex;
|
||||
IntIndexInput.Index freqIndex;
|
||||
IntIndexInput.Index posIndex;
|
||||
long skipOffset;
|
||||
long payloadOffset;
|
||||
|
||||
public Object clone() {
|
||||
SepTermState other = (SepTermState) super.clone();
|
||||
other.docIndex = (IntIndexInput.Index) docIndex.clone();
|
||||
if (freqIndex != null) {
|
||||
other.freqIndex = (IntIndexInput.Index) freqIndex.clone();
|
||||
}
|
||||
if (posIndex != null) {
|
||||
other.posIndex = (IntIndexInput.Index) posIndex.clone();
|
||||
}
|
||||
return other;
|
||||
}
|
||||
|
||||
@ -152,22 +145,6 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
|
||||
super.copy(_other);
|
||||
SepTermState other = (SepTermState) _other;
|
||||
docIndex.set(other.docIndex);
|
||||
if (other.posIndex != null) {
|
||||
if (posIndex == null) {
|
||||
posIndex = (IntIndexInput.Index) other.posIndex.clone();
|
||||
} else {
|
||||
posIndex.set(other.posIndex);
|
||||
}
|
||||
}
|
||||
if (other.freqIndex != null) {
|
||||
if (freqIndex == null) {
|
||||
freqIndex = (IntIndexInput.Index) other.freqIndex.clone();
|
||||
} else {
|
||||
freqIndex.set(other.freqIndex);
|
||||
}
|
||||
}
|
||||
skipOffset = other.skipOffset;
|
||||
payloadOffset = other.payloadOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -184,39 +161,8 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState _termState, boolean isIndexTerm) throws IOException {
|
||||
final SepTermState termState = (SepTermState) _termState;
|
||||
|
||||
// read freq index
|
||||
if (!fieldInfo.omitTermFreqAndPositions) {
|
||||
if (termState.freqIndex == null) {
|
||||
assert isIndexTerm;
|
||||
termState.freqIndex = freqIn.index();
|
||||
termState.posIndex = posIn.index();
|
||||
}
|
||||
termState.freqIndex.read(termsIn, isIndexTerm);
|
||||
}
|
||||
|
||||
// read doc index
|
||||
termState.docIndex.read(termsIn, isIndexTerm);
|
||||
|
||||
// read skip index
|
||||
if (isIndexTerm) {
|
||||
termState.skipOffset = termsIn.readVLong();
|
||||
} else if (termState.docFreq >= skipInterval) {
|
||||
termState.skipOffset += termsIn.readVLong();
|
||||
}
|
||||
|
||||
// read pos, payload index
|
||||
if (!fieldInfo.omitTermFreqAndPositions) {
|
||||
termState.posIndex.read(termsIn, isIndexTerm);
|
||||
final long v = termsIn.readVLong();
|
||||
if (isIndexTerm) {
|
||||
termState.payloadOffset = v;
|
||||
} else {
|
||||
termState.payloadOffset += v;
|
||||
}
|
||||
}
|
||||
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm) throws IOException {
|
||||
((SepTermState) termState).docIndex.read(termsIn, isIndexTerm);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -311,14 +257,18 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
|
||||
docIndex.set(termState.docIndex);
|
||||
docIndex.seek(docReader);
|
||||
|
||||
skipOffset = termState.skipOffset;
|
||||
|
||||
if (!omitTF) {
|
||||
freqIndex.set(termState.freqIndex);
|
||||
freqIndex.read(docReader, true);
|
||||
freqIndex.seek(freqReader);
|
||||
|
||||
posIndex.read(docReader, true);
|
||||
// skip payload offset
|
||||
docReader.readVLong();
|
||||
} else {
|
||||
freq = 1;
|
||||
}
|
||||
skipOffset = docReader.readVLong();
|
||||
|
||||
docFreq = termState.docFreq;
|
||||
count = 0;
|
||||
doc = 0;
|
||||
@ -498,17 +448,15 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
|
||||
docIndex.set(termState.docIndex);
|
||||
docIndex.seek(docReader);
|
||||
|
||||
freqIndex.set(termState.freqIndex);
|
||||
freqIndex.read(docReader, true);
|
||||
freqIndex.seek(freqReader);
|
||||
|
||||
posIndex.set(termState.posIndex);
|
||||
posIndex.read(docReader, true);
|
||||
posSeekPending = true;
|
||||
//posIndex.seek(posReader);
|
||||
payloadPending = false;
|
||||
|
||||
skipOffset = termState.skipOffset;
|
||||
payloadOffset = termState.payloadOffset;
|
||||
//payloadIn.seek(payloadOffset);
|
||||
payloadOffset = docReader.readVLong();
|
||||
skipOffset = docReader.readVLong();
|
||||
|
||||
docFreq = termState.docFreq;
|
||||
count = 0;
|
||||
|
@ -79,6 +79,7 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
|
||||
long lastPayloadStart;
|
||||
int lastDocID;
|
||||
int df;
|
||||
private boolean firstDoc;
|
||||
|
||||
public SepPostingsWriterImpl(SegmentWriteState state, IntStreamFactory factory) throws IOException {
|
||||
super();
|
||||
@ -147,6 +148,7 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
|
||||
payloadStart = payloadOut.getFilePointer();
|
||||
lastPayloadLength = -1;
|
||||
}
|
||||
firstDoc = true;
|
||||
skipListWriter.resetSkip(docIndex, freqIndex, posIndex);
|
||||
}
|
||||
|
||||
@ -169,6 +171,20 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
|
||||
@Override
|
||||
public void startDoc(int docID, int termDocFreq) throws IOException {
|
||||
|
||||
if (firstDoc) {
|
||||
// TODO: we are writing absolute file pointers below,
|
||||
// which is wasteful. It'd be better compression to
|
||||
// write the "baseline" into each indexed term, then
|
||||
// write only the delta here.
|
||||
if (!omitTF) {
|
||||
freqIndex.write(docOut, true);
|
||||
posIndex.write(docOut, true);
|
||||
docOut.writeVLong(payloadStart);
|
||||
}
|
||||
docOut.writeVLong(skipOut.getFilePointer());
|
||||
firstDoc = false;
|
||||
}
|
||||
|
||||
final int delta = docID - lastDocID;
|
||||
|
||||
if (docID < 0 || (df > 0 && delta <= 0)) {
|
||||
@ -229,42 +245,16 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
|
||||
@Override
|
||||
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
|
||||
|
||||
long skipPos = skipOut.getFilePointer();
|
||||
|
||||
// TODO: -- wasteful we are counting this in two places?
|
||||
assert docCount > 0;
|
||||
assert docCount == df;
|
||||
|
||||
// TODO: -- only do this if once (consolidate the
|
||||
// conditional things that are written)
|
||||
if (!omitTF) {
|
||||
freqIndex.write(termsOut, isIndexTerm);
|
||||
}
|
||||
docIndex.write(termsOut, isIndexTerm);
|
||||
|
||||
if (df >= skipInterval) {
|
||||
skipListWriter.writeSkip(skipOut);
|
||||
}
|
||||
|
||||
if (isIndexTerm) {
|
||||
termsOut.writeVLong(skipPos);
|
||||
lastSkipStart = skipPos;
|
||||
} else if (df >= skipInterval) {
|
||||
termsOut.writeVLong(skipPos-lastSkipStart);
|
||||
lastSkipStart = skipPos;
|
||||
}
|
||||
|
||||
if (!omitTF) {
|
||||
posIndex.write(termsOut, isIndexTerm);
|
||||
if (isIndexTerm) {
|
||||
// Write absolute at seek points
|
||||
termsOut.writeVLong(payloadStart);
|
||||
} else {
|
||||
termsOut.writeVLong(payloadStart-lastPayloadStart);
|
||||
}
|
||||
lastPayloadStart = payloadStart;
|
||||
}
|
||||
|
||||
lastDocID = 0;
|
||||
df = 0;
|
||||
}
|
||||
|
@ -80,6 +80,16 @@ public class MockSingleIntIndexInput extends IntIndexInput {
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void read(IntIndexInput.Reader indexIn, boolean absolute)
|
||||
throws IOException {
|
||||
if (absolute) {
|
||||
fp = indexIn.readVLong();
|
||||
} else {
|
||||
fp += indexIn.readVLong();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(IntIndexInput.Index other) {
|
||||
fp = ((Index) other).fp;
|
||||
|
@ -76,6 +76,18 @@ public class MockSingleIntIndexOutput extends IntIndexOutput {
|
||||
}
|
||||
lastFP = fp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(IntIndexOutput indexOut, boolean absolute)
|
||||
throws IOException {
|
||||
if (absolute) {
|
||||
indexOut.writeVLong(fp);
|
||||
} else {
|
||||
indexOut.writeVLong(fp - lastFP);
|
||||
}
|
||||
lastFP = fp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return Long.toString(fp);
|
||||
|
Loading…
x
Reference in New Issue
Block a user