diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
index 5f386e26a63..f77e44eba05 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
@@ -509,6 +509,39 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
 
   static final BytesRef EMPTY_BYTES_REF = new BytesRef();
 
+  private static class StatsWriter {
+
+    private final DataOutput out;
+    private final boolean hasFreqs;
+    private int singletonCount;
+
+    StatsWriter(DataOutput out, boolean hasFreqs) {
+      this.out = out;
+      this.hasFreqs = hasFreqs;
+    }
+
+    void add(int df, long ttf) throws IOException {
+      // Singletons (DF==1, TTF==1) are run-length encoded
+      if (df == 1 && (hasFreqs == false || ttf == 1)) {
+        singletonCount++;
+      } else {
+        finish();
+        out.writeVInt(df << 1);
+        if (hasFreqs) {
+          out.writeVLong(ttf - df);
+        }
+      }
+    }
+
+    void finish() throws IOException {
+      if (singletonCount > 0) {
+        out.writeVInt(((singletonCount - 1) << 1) | 1);
+        singletonCount = 0;
+      }
+    }
+
+  }
+
   class TermsWriter {
     private final FieldInfo fieldInfo;
     private long numTerms;
@@ -700,6 +733,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
         if (isLeafBlock) {
           // Block contains only ordinary terms:
           subIndices = null;
+          StatsWriter statsWriter = new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS);
           for (int i=start;i<end;i++) {
             PendingEntry ent = pending.get(i);
             assert ent.isTerm: "i=" + i;
@@ -721,20 +755,18 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
             suffixWriter.append(term.termBytes, prefixLength, suffix);
             assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;
 
             // Write term stats, to separate byte[] blob:
-            statsWriter.writeVInt(state.docFreq);
-            if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
-              assert state.totalTermFreq >= state.docFreq;
-              statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
-            }
+            statsWriter.add(state.docFreq, state.totalTermFreq);
 
             // Write term meta data
             postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute);
             absolute = false;
           }
+          statsWriter.finish();
         } else {
           // Block has at least one prefix term or a sub block:
           subIndices = new ArrayList<>();
+          StatsWriter statsWriter = new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS);
           for (int i=start;i<end;i++) {
             PendingEntry ent = pending.get(i);
             if (ent.isTerm) {
@@ -763,12 +795,8 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
               suffixLengthsWriter.writeVInt(suffix << 1);
               suffixWriter.append(term.termBytes, prefixLength, suffix);
 
               // Write term stats, to separate byte[] blob:
-              statsWriter.writeVInt(state.docFreq);
-              if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
-                assert state.totalTermFreq >= state.docFreq;
-                statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
-              }
+              statsWriter.add(state.docFreq, state.totalTermFreq);
 
               // TODO: now that terms dict "sees" these longs,
               // we can explore better column-stride encodings
@@ -803,6 +831,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
             subIndices.add(block.index);
           }
         }
+        statsWriter.finish();
 
         assert subIndices.size() != 0;
       }
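For reference, the byte stream that StatsWriter produces can be mirrored outside Lucene. The sketch below is not part of the patch: java.io.ByteArrayOutputStream and a hand-rolled vLong stand in for Lucene's DataOutput, and the class name is made up. The rules are the ones in the code above: singleton terms (docFreq == 1, and totalTermFreq == 1 when the field has freqs) accumulate into a pending run that is flushed as a single token ((runLength - 1) << 1) | 1, while any other term first flushes the run and then writes docFreq << 1, followed by totalTermFreq - docFreq when freqs are present. The low bit of a token therefore tells the reader whether it is a run length or a literal docFreq.

import java.io.ByteArrayOutputStream;

// Illustrative stand-in for the StatsWriter added by this patch.
class StatsRleSketch {
  private final ByteArrayOutputStream out = new ByteArrayOutputStream();
  private final boolean hasFreqs;
  private int singletonCount;

  StatsRleSketch(boolean hasFreqs) {
    this.hasFreqs = hasFreqs;
  }

  void add(int docFreq, long totalTermFreq) {
    if (docFreq == 1 && (hasFreqs == false || totalTermFreq == 1)) {
      singletonCount++; // buffer the singleton; nothing is written yet
    } else {
      finish(); // flush any pending run of singletons first
      writeVLong(docFreq << 1); // low bit clear: literal docFreq
      if (hasFreqs) {
        writeVLong(totalTermFreq - docFreq);
      }
    }
  }

  void finish() {
    if (singletonCount > 0) {
      // low bit set: this term and (token >>> 1) following terms are singletons
      writeVLong(((singletonCount - 1) << 1) | 1);
      singletonCount = 0;
    }
  }

  // Same variable-length format as Lucene's writeVInt/writeVLong.
  private void writeVLong(long v) {
    while ((v & ~0x7FL) != 0) {
      out.write((int) ((v & 0x7F) | 0x80));
      v >>>= 7;
    }
    out.write((int) v);
  }

  byte[] bytes() {
    return out.toByteArray();
  }
}

As a concrete check: three singletons followed by a term with docFreq == 5 and totalTermFreq == 9 encode as the tokens 5 (that is, (3 - 1) << 1 | 1), then 10 (5 << 1), then 4 (9 - 5).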
@@ -813,11 +842,16 @@
         // makes the terms dictionary large, and it also tends to be frequently the case for dense IDs like
         // auto-increment IDs, so not compressing in that case helps not hurt ID lookups by too much.
         if (suffixWriter.length() > 2L * numEntries) {
-          LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable);
-          if (spareWriter.size() < suffixWriter.length() - (suffixWriter.length() >>> 2)) {
-            // LZ4 saved more than 25%, go for it
-            compressionAlg = CompressionAlgorithm.LZ4;
-          } else {
+          // LZ4 inserts references whenever it sees duplicate strings of 4 chars or more, so only try it out if the
+          // average suffix length is greater than 6.
+          if (suffixWriter.length() > 6L * numEntries) {
+            LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable);
+            if (spareWriter.size() < suffixWriter.length() - (suffixWriter.length() >>> 2)) {
+              // LZ4 saved more than 25%, go for it
+              compressionAlg = CompressionAlgorithm.LZ4;
+            }
+          }
+          if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) {
             spareWriter.reset();
             if (spareBytes.length < suffixWriter.length()) {
               spareBytes = new byte[ArrayUtil.oversize(suffixWriter.length(), 1)];
@@ -851,25 +885,15 @@
           termsOut.writeVInt((numSuffixBytes << 1) | 1);
           termsOut.writeByte(spareBytes[0]);
         } else {
-          // Still give LZ4 a chance, there might be runs of terms with the same length
           termsOut.writeVInt(numSuffixBytes << 1);
-          LZ4.compress(spareBytes, 0, numSuffixBytes, termsOut, compressionHashTable);
+          termsOut.writeBytes(spareBytes, numSuffixBytes);
         }
 
         // Stats
         final int numStatsBytes = Math.toIntExact(statsWriter.size());
-        spareBytes = ArrayUtil.grow(spareBytes, numStatsBytes);
-        statsWriter.copyTo(new ByteArrayDataOutput(spareBytes));
+        termsOut.writeVInt(numStatsBytes);
+        statsWriter.copyTo(termsOut);
         statsWriter.reset();
-        if (allEqual(spareBytes, 0, numStatsBytes, (byte) 1)) {
-          // ID fields would typically have blocks full of ones
-          // LZ4 would optimize this as well but we keep explicit specialization because the decoding logic is a bit faster
-          termsOut.writeVInt((numStatsBytes << 1) | 1);
-        } else {
-          // Still give LZ4 a chance otherwise, there might be runs of ones even if not all values are ones
-          termsOut.writeVInt(numStatsBytes << 1);
-          LZ4.compress(spareBytes, 0, numStatsBytes, termsOut, compressionHashTable);
-        }
 
         // Write term meta data byte[] blob
         termsOut.writeVInt((int) metaWriter.size());
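The two thresholds above are worth spelling out. The helper below is hypothetical (the patch keeps the checks inline): LZ4 can only emit references for duplicates of at least 4 bytes, so a block whose suffixes average 6 bytes or fewer is unlikely to compress and is no longer attempted, and an attempted compression is kept only when it saves more than a quarter of the input, computed without division as length - (length >>> 2).

// Hypothetical helper mirroring the inlined gating logic above.
final class SuffixCompressionHeuristic {

  // LZ4 is only attempted when the average suffix is longer than 6 bytes.
  static boolean worthTryingLz4(long totalSuffixBytes, int numEntries) {
    return totalSuffixBytes > 6L * numEntries;
  }

  // An attempted compression is only kept when it saves more than 25%.
  static boolean savedEnough(long totalSuffixBytes, long compressedBytes) {
    return compressedBytes < totalSuffixBytes - (totalSuffixBytes >>> 2);
  }
}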
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java
index f4e08adb9a9..b4e5821bb00 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java
@@ -27,7 +27,6 @@ import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.Transition;
-import org.apache.lucene.util.compress.LZ4;
 import org.apache.lucene.util.fst.FST;
 
 // TODO: can we share this with the frame in STE?
@@ -55,6 +54,7 @@ final class IntersectTermsEnumFrame {
   final ByteArrayDataInput suffixLengthsReader;
 
   byte[] statBytes = new byte[64];
+  int statsSingletonRunLength = 0;
   final ByteArrayDataInput statsReader = new ByteArrayDataInput();
 
   byte[] floorData = new byte[32];
@@ -210,7 +210,7 @@ final class IntersectTermsEnumFrame {
       if (allEqual) {
         Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ite.in.readByte());
       } else {
-        LZ4.decompress(ite.in, numSuffixLengthBytes, suffixLengthBytes, 0);
+        ite.in.readBytes(suffixLengthBytes, 0, numSuffixLengthBytes);
       }
       suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes);
     } else {
@@ -226,24 +226,12 @@ final class IntersectTermsEnumFrame {
 
     // stats
     int numBytes = ite.in.readVInt();
-    if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
-      final boolean allOnes = (numBytes & 0x01) != 0;
-      numBytes >>>= 1;
-      if (statBytes.length < numBytes) {
-        statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
-      }
-      if (allOnes) {
-        Arrays.fill(statBytes, 0, numBytes, (byte) 1);
-      } else {
-        LZ4.decompress(ite.in, numBytes, statBytes, 0);
-      }
-    } else {
-      if (statBytes.length < numBytes) {
-        statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
-      }
-      ite.in.readBytes(statBytes, 0, numBytes);
+    if (statBytes.length < numBytes) {
+      statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
     }
+    ite.in.readBytes(statBytes, 0, numBytes);
     statsReader.reset(statBytes, 0, numBytes);
+    statsSingletonRunLength = 0;
     metaDataUpto = 0;
 
     termState.termBlockOrd = 0;
@@ -326,11 +314,35 @@ final class IntersectTermsEnumFrame {
       // just skipN here:
 
       // stats
-      termState.docFreq = statsReader.readVInt();
-      if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
-        termState.totalTermFreq = termState.docFreq; // all postings have freq=1
+      if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
+        if (statsSingletonRunLength > 0) {
+          termState.docFreq = 1;
+          termState.totalTermFreq = 1;
+          statsSingletonRunLength--;
+        } else {
+          int token = statsReader.readVInt();
+          if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES && (token & 1) == 1) {
+            termState.docFreq = 1;
+            termState.totalTermFreq = 1;
+            statsSingletonRunLength = token >>> 1;
+          } else {
+            termState.docFreq = token >>> 1;
+            if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
+              termState.totalTermFreq = termState.docFreq;
+            } else {
+              termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
+            }
+          }
+        }
       } else {
-        termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
+        termState.docFreq = statsReader.readVInt();
+        //if (DEBUG) System.out.println("    dF=" + state.docFreq);
+        if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
+          termState.totalTermFreq = termState.docFreq; // all postings have freq=1
+        } else {
+          termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
+          //if (DEBUG) System.out.println("    totTF=" + state.totalTermFreq);
+        }
       }
 
       // metadata
       ite.fr.parent.postingsReader.decodeTerm(bytesReader, ite.fr.fieldInfo, termState, absolute);
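Both frame classes decode this format with the same small state machine. The sketch below mirrors it outside Lucene and is illustrative only: a plain byte array and a hand-rolled vLong replace ByteArrayDataInput, a hasFreqs flag replaces the IndexOptions.DOCS check, and the version gate for segments older than VERSION_COMPRESSED_SUFFIXES is omitted.

// Illustrative stand-in for the decoding added to both frame classes.
class StatsRleDecoder {
  private final byte[] stats;
  private int pos;
  private int singletonRunLength;
  private final boolean hasFreqs;

  int docFreq;
  long totalTermFreq;

  StatsRleDecoder(byte[] stats, boolean hasFreqs) {
    this.stats = stats;
    this.hasFreqs = hasFreqs;
  }

  // Decodes the stats of the next term in the block.
  void next() {
    if (singletonRunLength > 0) {
      // still inside a run of singletons: no bytes are read at all
      docFreq = 1;
      totalTermFreq = 1;
      singletonRunLength--;
    } else {
      int token = (int) readVLong();
      if ((token & 1) == 1) {
        // low bit set: this term plus (token >>> 1) following terms are singletons
        docFreq = 1;
        totalTermFreq = 1;
        singletonRunLength = token >>> 1;
      } else {
        // low bit clear: literal docFreq, then ttf - df when freqs are indexed
        docFreq = token >>> 1;
        totalTermFreq = hasFreqs ? docFreq + readVLong() : docFreq;
      }
    }
  }

  // Same variable-length format as Lucene's readVInt/readVLong.
  private long readVLong() {
    long value = 0;
    int shift = 0;
    byte b;
    do {
      b = stats[pos++];
      value |= (long) (b & 0x7F) << shift;
      shift += 7;
    } while ((b & 0x80) != 0);
    return value;
  }
}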
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java
index 7f691d340a0..19c321c3e49 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java
@@ -27,7 +27,6 @@ import org.apache.lucene.index.TermsEnum.SeekStatus;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.compress.LZ4;
 import org.apache.lucene.util.fst.FST;
 
 final class SegmentTermsEnumFrame {
@@ -46,7 +45,7 @@ final class SegmentTermsEnumFrame {
   long fp;
   long fpOrig;
   long fpEnd;
-  long totalSuffixBytes, totalStatsBytes; // for stats
+  long totalSuffixBytes; // for stats
 
   byte[] suffixBytes = new byte[128];
   final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();
@@ -55,6 +54,7 @@ final class SegmentTermsEnumFrame {
   final ByteArrayDataInput suffixLengthsReader;
 
   byte[] statBytes = new byte[64];
+  int statsSingletonRunLength = 0;
   final ByteArrayDataInput statsReader = new ByteArrayDataInput();
 
   byte[] floorData = new byte[32];
@@ -202,7 +202,7 @@ final class SegmentTermsEnumFrame {
       if (allEqual) {
         Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ste.in.readByte());
       } else {
-        LZ4.decompress(ste.in, numSuffixLengthBytes, suffixLengthBytes, 0);
+        ste.in.readBytes(suffixLengthBytes, 0, numSuffixLengthBytes);
       }
       suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes);
     } else {
@@ -226,27 +226,13 @@ final class SegmentTermsEnumFrame {
     }*/
 
     // stats
-    final long startStatsFP = ste.in.getFilePointer();
     int numBytes = ste.in.readVInt();
-    if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
-      final boolean allOnes = (numBytes & 0x01) != 0;
-      numBytes >>>= 1;
-      if (statBytes.length < numBytes) {
-        statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
-      }
-      if (allOnes) {
-        Arrays.fill(statBytes, 0, numBytes, (byte) 1);
-      } else {
-        LZ4.decompress(ste.in, numBytes, statBytes, 0);
-      }
-    } else {
-      if (statBytes.length < numBytes) {
-        statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
-      }
-      ste.in.readBytes(statBytes, 0, numBytes);
+    if (statBytes.length < numBytes) {
+      statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
     }
-    totalStatsBytes = ste.in.getFilePointer() - startStatsFP;
+    ste.in.readBytes(statBytes, 0, numBytes);
     statsReader.reset(statBytes, 0, numBytes);
+    statsSingletonRunLength = 0;
     metaDataUpto = 0;
 
     state.termBlockOrd = 0;
@@ -473,15 +459,38 @@ final class SegmentTermsEnumFrame {
       // TODO: if docFreq were bulk decoded we could
      // just skipN here:
 
-      // stats
-      state.docFreq = statsReader.readVInt();
-      //if (DEBUG) System.out.println("    dF=" + state.docFreq);
-      if (ste.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
-        state.totalTermFreq = state.docFreq; // all postings have freq=1
+      if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
+        if (statsSingletonRunLength > 0) {
+          state.docFreq = 1;
+          state.totalTermFreq = 1;
+          statsSingletonRunLength--;
+        } else {
+          int token = statsReader.readVInt();
+          if ((token & 1) == 1) {
+            state.docFreq = 1;
+            state.totalTermFreq = 1;
+            statsSingletonRunLength = token >>> 1;
+          } else {
+            state.docFreq = token >>> 1;
+            if (ste.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
+              state.totalTermFreq = state.docFreq;
+            } else {
+              state.totalTermFreq = state.docFreq + statsReader.readVLong();
+            }
+          }
+        }
       } else {
-        state.totalTermFreq = state.docFreq + statsReader.readVLong();
-        //if (DEBUG) System.out.println("    totTF=" + state.totalTermFreq);
+        assert statsSingletonRunLength == 0;
+        state.docFreq = statsReader.readVInt();
+        //if (DEBUG) System.out.println("    dF=" + state.docFreq);
+        if (ste.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
+          state.totalTermFreq = state.docFreq; // all postings have freq=1
+        } else {
+          state.totalTermFreq = state.docFreq + statsReader.readVLong();
+          //if (DEBUG) System.out.println("    totTF=" + state.totalTermFreq);
+        }
       }
+
       // metadata
       ste.fr.parent.postingsReader.decodeTerm(bytesReader, ste.fr.fieldInfo, state, absolute);
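To see why this favors ID-like fields, take a hypothetical leaf block of 48 terms that each appear exactly once. The previous format wrote one stats byte per term (vInt(1), 48 times) and then relied on the allOnes specialization or LZ4 to shrink the block, paying a decoding step on every block load. The new format writes the whole run as one vInt token and reads it back with plain byte copies. A worked example with made-up numbers:

// Hypothetical numbers, illustrating the singleton run encoding above.
public class SingletonRunExample {
  public static void main(String[] args) {
    int termCount = 48;                       // every term in the block is a singleton
    int token = ((termCount - 1) << 1) | 1;   // 95: odd, so it marks a run
    System.out.println("token = " + token);   // fits in a single vInt byte
    System.out.println("terms covered = " + ((token >>> 1) + 1)); // 48
  }
}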
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java
index 0273f03c2c6..c6d12938bd9 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java
@@ -91,9 +91,6 @@ public class Stats {
    *  stores. */
   public long totalBlockStatsBytes;
 
-  /** Total number of bytes used to store stats. */
-  public long totalUncompressedBlockStatsBytes;
-
   /** Total bytes stored by the {@link PostingsReaderBase},
    *  plus the other few vInts stored in the frame. */
   public long totalBlockOtherBytes;
@@ -130,9 +127,8 @@ public class Stats {
     if (frame.suffixesReader != frame.suffixLengthsReader) {
       totalUncompressedBlockSuffixBytes += frame.suffixLengthsReader.length();
     }
-    totalBlockStatsBytes += frame.totalStatsBytes;
+    totalBlockStatsBytes += frame.statsReader.length();
     compressionAlgorithms[frame.compressionAlg.code]++;
-    totalUncompressedBlockStatsBytes += frame.statsReader.length();
   }
 
   void endBlock(SegmentTermsEnumFrame frame) {
@@ -149,7 +145,7 @@ public class Stats {
       throw new IllegalStateException();
     }
     endBlockCount++;
-    final long otherBytes = frame.fpEnd - frame.fp - frame.totalSuffixBytes - frame.totalStatsBytes;
+    final long otherBytes = frame.fpEnd - frame.fp - frame.totalSuffixBytes - frame.statsReader.length();
     assert otherBytes > 0 : "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd;
     totalBlockOtherBytes += otherBytes;
   }
@@ -202,8 +198,7 @@ public class Stats {
     }
     out.println("    " + totalBlockSuffixBytes + " compressed term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.2f", ((double) totalBlockSuffixBytes)/totalUncompressedBlockSuffixBytes) + " compression ratio - compression count by algorithm: " + compressionCounts : "") + ")");
-    out.println("    " + totalUncompressedBlockStatsBytes + " term stats bytes before compression" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalUncompressedBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : ""));
-    out.println("    " + totalBlockStatsBytes + " compressed term stats bytes (" + String.format(Locale.ROOT, "%.2f", (double)totalBlockStatsBytes / totalUncompressedBlockStatsBytes) + " compression ratio)");
+    out.println("    " + totalBlockStatsBytes + " term stats bytes " + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : ""));
     out.println("    " + totalBlockOtherBytes + " other bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockOtherBytes)/totalBlockCount) + " other-bytes/block)" : ""));
     if (totalBlockCount != 0) {
       out.println("      by prefix length:");