mirror of https://github.com/apache/lucene.git
LUCENE-4702: Reduce terms dictionary compression overhead. (#1216)
Changes include:
- Removed LZ4 compression of suffix lengths, which didn't save much space anyway.
- For stats, LZ4 was only really used for run-length compression of terms whose docFreq is 1. This has been replaced by explicit run-length compression (see the sketch below).
- Since we only use LZ4 for suffix bytes if the compression ratio is < 75%, we now only try LZ4 if the average suffix length is greater than 6, in order to reduce index-time overhead.
parent 4773574578
commit 6eb8834a57
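For illustration, here is a small, self-contained sketch of the run-length scheme that the new StatsWriter in the diff below implements. The class name StatsRunLengthSketch and the List<Long> token buffer are made up for this example; the real class writes vInts/vLongs to a Lucene DataOutput. A token with the low bit set encodes a run of singleton terms (docFreq == 1 and, when frequencies are indexed, totalTermFreq == 1); a token with the low bit clear encodes docFreq directly, optionally followed by totalTermFreq - docFreq. On the read side (IntersectTermsEnumFrame / SegmentTermsEnumFrame below), a singleton token seeds statsSingletonRunLength, which the following terms consume before the next token is read.

import java.util.ArrayList;
import java.util.List;

// Illustration only, not the Lucene class: runs of "singleton" terms are
// collapsed into one token instead of relying on LZ4 to compress them.
class StatsRunLengthSketch {

  private final List<Long> tokens = new ArrayList<>(); // stands in for vInt/vLong output
  private final boolean hasFreqs;
  private int singletonCount;

  StatsRunLengthSketch(boolean hasFreqs) {
    this.hasFreqs = hasFreqs;
  }

  void add(int docFreq, long totalTermFreq) {
    if (docFreq == 1 && (hasFreqs == false || totalTermFreq == 1)) {
      singletonCount++; // defer; the run may be extended by the next term
    } else {
      finish(); // flush any pending singleton run first
      tokens.add((long) docFreq << 1); // low bit clear -> explicit stats
      if (hasFreqs) {
        tokens.add(totalTermFreq - docFreq);
      }
    }
  }

  void finish() {
    if (singletonCount > 0) {
      // low bit set -> "singletonCount terms with docFreq=1 and totalTermFreq=1"
      tokens.add((long) ((singletonCount - 1) << 1) | 1);
      singletonCount = 0;
    }
  }

  public static void main(String[] args) {
    StatsRunLengthSketch w = new StatsRunLengthSketch(true);
    w.add(1, 1);  // singleton
    w.add(1, 1);  // singleton
    w.add(1, 1);  // singleton
    w.add(5, 12); // explicit: token 5<<1 = 10, then 12-5 = 7
    w.finish();
    // Prints [5, 10, 7]: a run of 3 singletons ((3-1)<<1 | 1 = 5), then 10 and 7.
    System.out.println(w.tokens);
  }
}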
BlockTreeTermsWriter.java

@@ -509,6 +509,39 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
 
   static final BytesRef EMPTY_BYTES_REF = new BytesRef();
 
+  private static class StatsWriter {
+
+    private final DataOutput out;
+    private final boolean hasFreqs;
+    private int singletonCount;
+
+    StatsWriter(DataOutput out, boolean hasFreqs) {
+      this.out = out;
+      this.hasFreqs = hasFreqs;
+    }
+
+    void add(int df, long ttf) throws IOException {
+      // Singletons (DF==1, TTF==1) are run-length encoded
+      if (df == 1 && (hasFreqs == false || ttf == 1)) {
+        singletonCount++;
+      } else {
+        finish();
+        out.writeVInt(df << 1);
+        if (hasFreqs) {
+          out.writeVLong(ttf - df);
+        }
+      }
+    }
+
+    void finish() throws IOException {
+      if (singletonCount > 0) {
+        out.writeVInt(((singletonCount - 1) << 1) | 1);
+        singletonCount = 0;
+      }
+    }
+
+  }
+
   class TermsWriter {
     private final FieldInfo fieldInfo;
     private long numTerms;

@@ -700,6 +733,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
       if (isLeafBlock) {
         // Block contains only ordinary terms:
         subIndices = null;
+        StatsWriter statsWriter = new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS);
         for (int i=start;i<end;i++) {
           PendingEntry ent = pending.get(i);
           assert ent.isTerm: "i=" + i;

@@ -722,19 +756,17 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
           assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;
 
           // Write term stats, to separate byte[] blob:
-          statsWriter.writeVInt(state.docFreq);
-          if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
-            assert state.totalTermFreq >= state.docFreq: state.totalTermFreq + " vs " + state.docFreq;
-            statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
-          }
+          statsWriter.add(state.docFreq, state.totalTermFreq);
 
           // Write term meta data
           postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute);
           absolute = false;
         }
+        statsWriter.finish();
       } else {
         // Block has at least one prefix term or a sub block:
         subIndices = new ArrayList<>();
+        StatsWriter statsWriter = new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS);
         for (int i=start;i<end;i++) {
           PendingEntry ent = pending.get(i);
           if (ent.isTerm) {

@@ -759,11 +791,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
             suffixWriter.append(term.termBytes, prefixLength, suffix);
 
             // Write term stats, to separate byte[] blob:
-            statsWriter.writeVInt(state.docFreq);
-            if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
-              assert state.totalTermFreq >= state.docFreq;
-              statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
-            }
+            statsWriter.add(state.docFreq, state.totalTermFreq);
 
             // TODO: now that terms dict "sees" these longs,
             // we can explore better column-stride encodings

@@ -803,6 +831,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
             subIndices.add(block.index);
           }
         }
+        statsWriter.finish();
 
         assert subIndices.size() != 0;
       }

@@ -813,11 +842,16 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
       // makes the terms dictionary large, and it also tends to be frequently the case for dense IDs like
       // auto-increment IDs, so not compressing in that case helps not hurt ID lookups by too much.
       if (suffixWriter.length() > 2L * numEntries) {
-        LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable);
-        if (spareWriter.size() < suffixWriter.length() - (suffixWriter.length() >>> 2)) {
-          // LZ4 saved more than 25%, go for it
-          compressionAlg = CompressionAlgorithm.LZ4;
-        } else {
+        // LZ4 inserts references whenever it sees duplicate strings of 4 chars or more, so only try it out if the
+        // average suffix length is greater than 6.
+        if (suffixWriter.length() > 6L * numEntries) {
+          LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable);
+          if (spareWriter.size() < suffixWriter.length() - (suffixWriter.length() >>> 2)) {
+            // LZ4 saved more than 25%, go for it
+            compressionAlg = CompressionAlgorithm.LZ4;
+          }
+        }
+        if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) {
          spareWriter.reset();
          if (spareBytes.length < suffixWriter.length()) {
            spareBytes = new byte[ArrayUtil.oversize(suffixWriter.length(), 1)];

@@ -851,25 +885,15 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
         termsOut.writeVInt((numSuffixBytes << 1) | 1);
         termsOut.writeByte(spareBytes[0]);
       } else {
-        // Still give LZ4 a chance, there might be runs of terms with the same length
         termsOut.writeVInt(numSuffixBytes << 1);
-        LZ4.compress(spareBytes, 0, numSuffixBytes, termsOut, compressionHashTable);
+        termsOut.writeBytes(spareBytes, numSuffixBytes);
       }
 
       // Stats
       final int numStatsBytes = Math.toIntExact(statsWriter.size());
-      spareBytes = ArrayUtil.grow(spareBytes, numStatsBytes);
-      statsWriter.copyTo(new ByteArrayDataOutput(spareBytes));
+      termsOut.writeVInt(numStatsBytes);
+      statsWriter.copyTo(termsOut);
       statsWriter.reset();
-      if (allEqual(spareBytes, 0, numStatsBytes, (byte) 1)) {
-        // ID fields would typically have blocks full of ones
-        // LZ4 would optimize this as well but we keep explicit specialization because the decoding logic is a bit faster
-        termsOut.writeVInt((numStatsBytes << 1) | 1);
-      } else {
-        // Still give LZ4 a chance otherwise, there might be runs of ones even if not all values are ones
-        termsOut.writeVInt(numStatsBytes << 1);
-        LZ4.compress(spareBytes, 0, numStatsBytes, termsOut, compressionHashTable);
-      }
 
       // Write term meta data byte[] blob
      termsOut.writeVInt((int) metaWriter.size());
IntersectTermsEnumFrame.java

@@ -27,7 +27,6 @@ import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.Transition;
-import org.apache.lucene.util.compress.LZ4;
 import org.apache.lucene.util.fst.FST;
 
 // TODO: can we share this with the frame in STE?

@@ -55,6 +54,7 @@ final class IntersectTermsEnumFrame {
   final ByteArrayDataInput suffixLengthsReader;
 
   byte[] statBytes = new byte[64];
+  int statsSingletonRunLength = 0;
   final ByteArrayDataInput statsReader = new ByteArrayDataInput();
 
   byte[] floorData = new byte[32];

@@ -210,7 +210,7 @@ final class IntersectTermsEnumFrame {
       if (allEqual) {
         Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ite.in.readByte());
       } else {
-        LZ4.decompress(ite.in, numSuffixLengthBytes, suffixLengthBytes, 0);
+        ite.in.readBytes(suffixLengthBytes, 0, numSuffixLengthBytes);
       }
       suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes);
     } else {

@@ -226,24 +226,12 @@ final class IntersectTermsEnumFrame {
 
     // stats
     int numBytes = ite.in.readVInt();
-    if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
-      final boolean allOnes = (numBytes & 0x01) != 0;
-      numBytes >>>= 1;
-      if (statBytes.length < numBytes) {
-        statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
-      }
-      if (allOnes) {
-        Arrays.fill(statBytes, 0, numBytes, (byte) 1);
-      } else {
-        LZ4.decompress(ite.in, numBytes, statBytes, 0);
-      }
-    } else {
-      if (statBytes.length < numBytes) {
-        statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
-      }
-      ite.in.readBytes(statBytes, 0, numBytes);
+    if (statBytes.length < numBytes) {
+      statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
     }
+    ite.in.readBytes(statBytes, 0, numBytes);
     statsReader.reset(statBytes, 0, numBytes);
+    statsSingletonRunLength = 0;
     metaDataUpto = 0;
 
     termState.termBlockOrd = 0;

@@ -326,11 +314,35 @@ final class IntersectTermsEnumFrame {
      // just skipN here:
 
      // stats
-      termState.docFreq = statsReader.readVInt();
-      if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
-        termState.totalTermFreq = termState.docFreq; // all postings have freq=1
+      if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
+        if (statsSingletonRunLength > 0) {
+          termState.docFreq = 1;
+          termState.totalTermFreq = 1;
+          statsSingletonRunLength--;
+        } else {
+          int token = statsReader.readVInt();
+          if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES && (token & 1) == 1) {
+            termState.docFreq = 1;
+            termState.totalTermFreq = 1;
+            statsSingletonRunLength = token >>> 1;
+          } else {
+            termState.docFreq = token >>> 1;
+            if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
+              termState.totalTermFreq = termState.docFreq;
+            } else {
+              termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
+            }
+          }
+        }
      } else {
-        termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
+        termState.docFreq = statsReader.readVInt();
+        //if (DEBUG) System.out.println(" dF=" + state.docFreq);
+        if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
+          termState.totalTermFreq = termState.docFreq; // all postings have freq=1
+        } else {
+          termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
+          //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
+        }
      }
      // metadata
      ite.fr.parent.postingsReader.decodeTerm(bytesReader, ite.fr.fieldInfo, termState, absolute);
SegmentTermsEnumFrame.java

@@ -27,7 +27,6 @@ import org.apache.lucene.index.TermsEnum.SeekStatus;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.compress.LZ4;
 import org.apache.lucene.util.fst.FST;
 
 final class SegmentTermsEnumFrame {

@@ -46,7 +45,7 @@ final class SegmentTermsEnumFrame {
   long fp;
   long fpOrig;
   long fpEnd;
-  long totalSuffixBytes, totalStatsBytes; // for stats
+  long totalSuffixBytes; // for stats
 
   byte[] suffixBytes = new byte[128];
   final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();

@@ -55,6 +54,7 @@ final class SegmentTermsEnumFrame {
   final ByteArrayDataInput suffixLengthsReader;
 
   byte[] statBytes = new byte[64];
+  int statsSingletonRunLength = 0;
   final ByteArrayDataInput statsReader = new ByteArrayDataInput();
 
   byte[] floorData = new byte[32];

@@ -202,7 +202,7 @@ final class SegmentTermsEnumFrame {
       if (allEqual) {
         Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ste.in.readByte());
       } else {
-        LZ4.decompress(ste.in, numSuffixLengthBytes, suffixLengthBytes, 0);
+        ste.in.readBytes(suffixLengthBytes, 0, numSuffixLengthBytes);
       }
       suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes);
     } else {

@@ -226,27 +226,13 @@ final class SegmentTermsEnumFrame {
     }*/
 
     // stats
-    final long startStatsFP = ste.in.getFilePointer();
     int numBytes = ste.in.readVInt();
-    if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
-      final boolean allOnes = (numBytes & 0x01) != 0;
-      numBytes >>>= 1;
-      if (statBytes.length < numBytes) {
-        statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
-      }
-      if (allOnes) {
-        Arrays.fill(statBytes, 0, numBytes, (byte) 1);
-      } else {
-        LZ4.decompress(ste.in, numBytes, statBytes, 0);
-      }
-    } else {
-      if (statBytes.length < numBytes) {
-        statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
-      }
-      ste.in.readBytes(statBytes, 0, numBytes);
+    if (statBytes.length < numBytes) {
+      statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
     }
-    totalStatsBytes = ste.in.getFilePointer() - startStatsFP;
+    ste.in.readBytes(statBytes, 0, numBytes);
     statsReader.reset(statBytes, 0, numBytes);
+    statsSingletonRunLength = 0;
     metaDataUpto = 0;
 
     state.termBlockOrd = 0;

@@ -473,15 +459,38 @@ final class SegmentTermsEnumFrame {
      // TODO: if docFreq were bulk decoded we could
      // just skipN here:
 
      // stats
-      state.docFreq = statsReader.readVInt();
-      //if (DEBUG) System.out.println(" dF=" + state.docFreq);
-      if (ste.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
-        state.totalTermFreq = state.docFreq; // all postings have freq=1
+      if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
+        if (statsSingletonRunLength > 0) {
+          state.docFreq = 1;
+          state.totalTermFreq = 1;
+          statsSingletonRunLength--;
+        } else {
+          int token = statsReader.readVInt();
+          if ((token & 1) == 1) {
+            state.docFreq = 1;
+            state.totalTermFreq = 1;
+            statsSingletonRunLength = token >>> 1;
+          } else {
+            state.docFreq = token >>> 1;
+            if (ste.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
+              state.totalTermFreq = state.docFreq;
+            } else {
+              state.totalTermFreq = state.docFreq + statsReader.readVLong();
+            }
+          }
+        }
      } else {
-        state.totalTermFreq = state.docFreq + statsReader.readVLong();
-        //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
+        assert statsSingletonRunLength == 0;
+        state.docFreq = statsReader.readVInt();
+        //if (DEBUG) System.out.println(" dF=" + state.docFreq);
+        if (ste.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
+          state.totalTermFreq = state.docFreq; // all postings have freq=1
+        } else {
+          state.totalTermFreq = state.docFreq + statsReader.readVLong();
+          //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
+        }
      }
 
      // metadata
      ste.fr.parent.postingsReader.decodeTerm(bytesReader, ste.fr.fieldInfo, state, absolute);
Stats.java

@@ -91,9 +91,6 @@ public class Stats {
    * stores. */
   public long totalBlockStatsBytes;
 
-  /** Total number of bytes used to store stats. */
-  public long totalUncompressedBlockStatsBytes;
-
   /** Total bytes stored by the {@link PostingsReaderBase},
    * plus the other few vInts stored in the frame. */
   public long totalBlockOtherBytes;

@@ -130,9 +127,8 @@ public class Stats {
     if (frame.suffixesReader != frame.suffixLengthsReader) {
       totalUncompressedBlockSuffixBytes += frame.suffixLengthsReader.length();
     }
-    totalBlockStatsBytes += frame.totalStatsBytes;
+    totalBlockStatsBytes += frame.statsReader.length();
     compressionAlgorithms[frame.compressionAlg.code]++;
-    totalUncompressedBlockStatsBytes += frame.statsReader.length();
   }
 
   void endBlock(SegmentTermsEnumFrame frame) {

@@ -149,7 +145,7 @@ public class Stats {
     throw new IllegalStateException();
    }
    endBlockCount++;
-    final long otherBytes = frame.fpEnd - frame.fp - frame.totalSuffixBytes - frame.totalStatsBytes;
+    final long otherBytes = frame.fpEnd - frame.fp - frame.totalSuffixBytes - frame.statsReader.length();
    assert otherBytes > 0 : "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd;
    totalBlockOtherBytes += otherBytes;
   }

@@ -202,8 +198,7 @@ public class Stats {
    }
    out.println(" " + totalBlockSuffixBytes + " compressed term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.2f", ((double) totalBlockSuffixBytes)/totalUncompressedBlockSuffixBytes) +
        " compression ratio - compression count by algorithm: " + compressionCounts : "") + ")");
-    out.println(" " + totalUncompressedBlockStatsBytes + " term stats bytes before compression" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalUncompressedBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : ""));
-    out.println(" " + totalBlockStatsBytes + " compressed term stats bytes (" + String.format(Locale.ROOT, "%.2f", (double)totalBlockStatsBytes / totalUncompressedBlockStatsBytes) + " compression ratio)");
+    out.println(" " + totalBlockStatsBytes + " term stats bytes " + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : ""));
    out.println(" " + totalBlockOtherBytes + " other bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockOtherBytes)/totalBlockCount) + " other-bytes/block)" : ""));
    if (totalBlockCount != 0) {
      out.println(" by prefix length:");