LUCENE-4702: Reduce terms dictionary compression overhead. (#1216)

Changes include:
 - Removed LZ4 compression of suffix lengths which didn't save much space
   anyway.
 - For stats, LZ4 was only really used for run-length compression of terms whose
   docFreq is 1. This has been replaced by explicit run-length compression.
 - Since we only use LZ4 for suffix bytes if the compression ratio is < 75%, we
   now only try LZ4 out if the average suffix length is greater than 6, in order
   to reduce index-time overhead.
This commit is contained in:
Adrien Grand 2020-01-28 18:38:30 +01:00 committed by GitHub
parent 4773574578
commit 6eb8834a57
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 126 additions and 86 deletions

View File

@ -509,6 +509,39 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
static final BytesRef EMPTY_BYTES_REF = new BytesRef(); static final BytesRef EMPTY_BYTES_REF = new BytesRef();
/**
 * Writes per-term stats (docFreq, totalTermFreq) to {@code out} with explicit
 * run-length compression of "singleton" terms (docFreq == 1 and, when freqs are
 * indexed, totalTermFreq == 1). Consecutive singletons are buffered and emitted
 * as a single token rather than one entry per term.
 *
 * Encoding: a token with its low bit set, {@code ((runLength - 1) << 1) | 1},
 * denotes a run of {@code runLength} singletons; a token with the low bit
 * clear, {@code docFreq << 1}, denotes an ordinary term, followed by
 * {@code totalTermFreq - docFreq} as a vLong when the field records freqs.
 * Callers must invoke {@link #finish()} after the last {@link #add} so that a
 * trailing singleton run is flushed.
 */
private static class StatsWriter {
// destination for the encoded stats blob
private final DataOutput out;
// whether the field indexes term frequencies (index options beyond DOCS);
// when false, totalTermFreq is implied and never written
private final boolean hasFreqs;
// length of the current buffered run of singleton terms, not yet written
private int singletonCount;
StatsWriter(DataOutput out, boolean hasFreqs) {
this.out = out;
this.hasFreqs = hasFreqs;
}
/**
 * Records stats for one term. Singletons are buffered into a run; any other
 * term first flushes the pending run, then writes its own token.
 *
 * @param df  document frequency of the term
 * @param ttf total term frequency (ignored for the singleton test when
 *            {@code hasFreqs} is false)
 * @throws IOException if writing to {@code out} fails
 */
void add(int df, long ttf) throws IOException {
// Singletons (DF==1, TTF==1) are run-length encoded
if (df == 1 && (hasFreqs == false || ttf == 1)) {
singletonCount++;
} else {
// flush any pending singleton run before this ordinary term
finish();
// low bit clear marks an ordinary docFreq token
out.writeVInt(df << 1);
if (hasFreqs) {
// store the delta; readers reconstruct ttf as df + delta
out.writeVLong(ttf - df);
}
}
}
/**
 * Flushes the pending singleton run, if any, as a single token with the low
 * bit set. Safe to call when no run is pending (no-op). Must be called after
 * the final {@link #add} to avoid losing a trailing run.
 *
 * @throws IOException if writing to {@code out} fails
 */
void finish() throws IOException {
if (singletonCount > 0) {
// (runLength - 1) so a run of 1 still fits; low bit set marks a run token
out.writeVInt(((singletonCount - 1) << 1) | 1);
singletonCount = 0;
}
}
}
class TermsWriter { class TermsWriter {
private final FieldInfo fieldInfo; private final FieldInfo fieldInfo;
private long numTerms; private long numTerms;
@ -700,6 +733,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
if (isLeafBlock) { if (isLeafBlock) {
// Block contains only ordinary terms: // Block contains only ordinary terms:
subIndices = null; subIndices = null;
StatsWriter statsWriter = new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS);
for (int i=start;i<end;i++) { for (int i=start;i<end;i++) {
PendingEntry ent = pending.get(i); PendingEntry ent = pending.get(i);
assert ent.isTerm: "i=" + i; assert ent.isTerm: "i=" + i;
@ -722,19 +756,17 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel; assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;
// Write term stats, to separate byte[] blob: // Write term stats, to separate byte[] blob:
statsWriter.writeVInt(state.docFreq); statsWriter.add(state.docFreq, state.totalTermFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
assert state.totalTermFreq >= state.docFreq: state.totalTermFreq + " vs " + state.docFreq;
statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
}
// Write term meta data // Write term meta data
postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute);
absolute = false; absolute = false;
} }
statsWriter.finish();
} else { } else {
// Block has at least one prefix term or a sub block: // Block has at least one prefix term or a sub block:
subIndices = new ArrayList<>(); subIndices = new ArrayList<>();
StatsWriter statsWriter = new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS);
for (int i=start;i<end;i++) { for (int i=start;i<end;i++) {
PendingEntry ent = pending.get(i); PendingEntry ent = pending.get(i);
if (ent.isTerm) { if (ent.isTerm) {
@ -759,11 +791,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
suffixWriter.append(term.termBytes, prefixLength, suffix); suffixWriter.append(term.termBytes, prefixLength, suffix);
// Write term stats, to separate byte[] blob: // Write term stats, to separate byte[] blob:
statsWriter.writeVInt(state.docFreq); statsWriter.add(state.docFreq, state.totalTermFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
assert state.totalTermFreq >= state.docFreq;
statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
}
// TODO: now that terms dict "sees" these longs, // TODO: now that terms dict "sees" these longs,
// we can explore better column-stride encodings // we can explore better column-stride encodings
@ -803,6 +831,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
subIndices.add(block.index); subIndices.add(block.index);
} }
} }
statsWriter.finish();
assert subIndices.size() != 0; assert subIndices.size() != 0;
} }
@ -813,11 +842,16 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// makes the terms dictionary large, and it also tends to be frequently the case for dense IDs like // makes the terms dictionary large, and it also tends to be frequently the case for dense IDs like
// auto-increment IDs, so not compressing in that case helps not hurt ID lookups by too much. // auto-increment IDs, so not compressing in that case helps not hurt ID lookups by too much.
if (suffixWriter.length() > 2L * numEntries) { if (suffixWriter.length() > 2L * numEntries) {
LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable); // LZ4 inserts references whenever it sees duplicate strings of 4 chars or more, so only try it out if the
if (spareWriter.size() < suffixWriter.length() - (suffixWriter.length() >>> 2)) { // average suffix length is greater than 6.
// LZ4 saved more than 25%, go for it if (suffixWriter.length() > 6L * numEntries) {
compressionAlg = CompressionAlgorithm.LZ4; LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable);
} else { if (spareWriter.size() < suffixWriter.length() - (suffixWriter.length() >>> 2)) {
// LZ4 saved more than 25%, go for it
compressionAlg = CompressionAlgorithm.LZ4;
}
}
if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) {
spareWriter.reset(); spareWriter.reset();
if (spareBytes.length < suffixWriter.length()) { if (spareBytes.length < suffixWriter.length()) {
spareBytes = new byte[ArrayUtil.oversize(suffixWriter.length(), 1)]; spareBytes = new byte[ArrayUtil.oversize(suffixWriter.length(), 1)];
@ -851,25 +885,15 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
termsOut.writeVInt((numSuffixBytes << 1) | 1); termsOut.writeVInt((numSuffixBytes << 1) | 1);
termsOut.writeByte(spareBytes[0]); termsOut.writeByte(spareBytes[0]);
} else { } else {
// Still give LZ4 a chance, there might be runs of terms with the same length
termsOut.writeVInt(numSuffixBytes << 1); termsOut.writeVInt(numSuffixBytes << 1);
LZ4.compress(spareBytes, 0, numSuffixBytes, termsOut, compressionHashTable); termsOut.writeBytes(spareBytes, numSuffixBytes);
} }
// Stats // Stats
final int numStatsBytes = Math.toIntExact(statsWriter.size()); final int numStatsBytes = Math.toIntExact(statsWriter.size());
spareBytes = ArrayUtil.grow(spareBytes, numStatsBytes); termsOut.writeVInt(numStatsBytes);
statsWriter.copyTo(new ByteArrayDataOutput(spareBytes)); statsWriter.copyTo(termsOut);
statsWriter.reset(); statsWriter.reset();
if (allEqual(spareBytes, 0, numStatsBytes, (byte) 1)) {
// ID fields would typically have blocks full of ones
// LZ4 would optimize this as well but we keep explicit specialization because the decoding logic is a bit faster
termsOut.writeVInt((numStatsBytes << 1) | 1);
} else {
// Still give LZ4 a chance otherwise, there might be runs of ones even if not all values are ones
termsOut.writeVInt(numStatsBytes << 1);
LZ4.compress(spareBytes, 0, numStatsBytes, termsOut, compressionHashTable);
}
// Write term meta data byte[] blob // Write term meta data byte[] blob
termsOut.writeVInt((int) metaWriter.size()); termsOut.writeVInt((int) metaWriter.size());

View File

@ -27,7 +27,6 @@ import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.compress.LZ4;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
// TODO: can we share this with the frame in STE? // TODO: can we share this with the frame in STE?
@ -55,6 +54,7 @@ final class IntersectTermsEnumFrame {
final ByteArrayDataInput suffixLengthsReader; final ByteArrayDataInput suffixLengthsReader;
byte[] statBytes = new byte[64]; byte[] statBytes = new byte[64];
int statsSingletonRunLength = 0;
final ByteArrayDataInput statsReader = new ByteArrayDataInput(); final ByteArrayDataInput statsReader = new ByteArrayDataInput();
byte[] floorData = new byte[32]; byte[] floorData = new byte[32];
@ -210,7 +210,7 @@ final class IntersectTermsEnumFrame {
if (allEqual) { if (allEqual) {
Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ite.in.readByte()); Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ite.in.readByte());
} else { } else {
LZ4.decompress(ite.in, numSuffixLengthBytes, suffixLengthBytes, 0); ite.in.readBytes(suffixLengthBytes, 0, numSuffixLengthBytes);
} }
suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes); suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes);
} else { } else {
@ -226,24 +226,12 @@ final class IntersectTermsEnumFrame {
// stats // stats
int numBytes = ite.in.readVInt(); int numBytes = ite.in.readVInt();
if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) { if (statBytes.length < numBytes) {
final boolean allOnes = (numBytes & 0x01) != 0; statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
numBytes >>>= 1;
if (statBytes.length < numBytes) {
statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
if (allOnes) {
Arrays.fill(statBytes, 0, numBytes, (byte) 1);
} else {
LZ4.decompress(ite.in, numBytes, statBytes, 0);
}
} else {
if (statBytes.length < numBytes) {
statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ite.in.readBytes(statBytes, 0, numBytes);
} }
ite.in.readBytes(statBytes, 0, numBytes);
statsReader.reset(statBytes, 0, numBytes); statsReader.reset(statBytes, 0, numBytes);
statsSingletonRunLength = 0;
metaDataUpto = 0; metaDataUpto = 0;
termState.termBlockOrd = 0; termState.termBlockOrd = 0;
@ -326,11 +314,35 @@ final class IntersectTermsEnumFrame {
// just skipN here: // just skipN here:
// stats // stats
termState.docFreq = statsReader.readVInt(); if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) { if (statsSingletonRunLength > 0) {
termState.totalTermFreq = termState.docFreq; // all postings have freq=1 termState.docFreq = 1;
termState.totalTermFreq = 1;
statsSingletonRunLength--;
} else {
int token = statsReader.readVInt();
if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES && (token & 1) == 1) {
termState.docFreq = 1;
termState.totalTermFreq = 1;
statsSingletonRunLength = token >>> 1;
} else {
termState.docFreq = token >>> 1;
if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
termState.totalTermFreq = termState.docFreq;
} else {
termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
}
}
}
} else { } else {
termState.totalTermFreq = termState.docFreq + statsReader.readVLong(); termState.docFreq = statsReader.readVInt();
//if (DEBUG) System.out.println(" dF=" + state.docFreq);
if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
termState.totalTermFreq = termState.docFreq; // all postings have freq=1
} else {
termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
//if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
}
} }
// metadata // metadata
ite.fr.parent.postingsReader.decodeTerm(bytesReader, ite.fr.fieldInfo, termState, absolute); ite.fr.parent.postingsReader.decodeTerm(bytesReader, ite.fr.fieldInfo, termState, absolute);

View File

@ -27,7 +27,6 @@ import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.compress.LZ4;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
final class SegmentTermsEnumFrame { final class SegmentTermsEnumFrame {
@ -46,7 +45,7 @@ final class SegmentTermsEnumFrame {
long fp; long fp;
long fpOrig; long fpOrig;
long fpEnd; long fpEnd;
long totalSuffixBytes, totalStatsBytes; // for stats long totalSuffixBytes; // for stats
byte[] suffixBytes = new byte[128]; byte[] suffixBytes = new byte[128];
final ByteArrayDataInput suffixesReader = new ByteArrayDataInput(); final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();
@ -55,6 +54,7 @@ final class SegmentTermsEnumFrame {
final ByteArrayDataInput suffixLengthsReader; final ByteArrayDataInput suffixLengthsReader;
byte[] statBytes = new byte[64]; byte[] statBytes = new byte[64];
int statsSingletonRunLength = 0;
final ByteArrayDataInput statsReader = new ByteArrayDataInput(); final ByteArrayDataInput statsReader = new ByteArrayDataInput();
byte[] floorData = new byte[32]; byte[] floorData = new byte[32];
@ -202,7 +202,7 @@ final class SegmentTermsEnumFrame {
if (allEqual) { if (allEqual) {
Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ste.in.readByte()); Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ste.in.readByte());
} else { } else {
LZ4.decompress(ste.in, numSuffixLengthBytes, suffixLengthBytes, 0); ste.in.readBytes(suffixLengthBytes, 0, numSuffixLengthBytes);
} }
suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes); suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes);
} else { } else {
@ -226,27 +226,13 @@ final class SegmentTermsEnumFrame {
}*/ }*/
// stats // stats
final long startStatsFP = ste.in.getFilePointer();
int numBytes = ste.in.readVInt(); int numBytes = ste.in.readVInt();
if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) { if (statBytes.length < numBytes) {
final boolean allOnes = (numBytes & 0x01) != 0; statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
numBytes >>>= 1;
if (statBytes.length < numBytes) {
statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
if (allOnes) {
Arrays.fill(statBytes, 0, numBytes, (byte) 1);
} else {
LZ4.decompress(ste.in, numBytes, statBytes, 0);
}
} else {
if (statBytes.length < numBytes) {
statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ste.in.readBytes(statBytes, 0, numBytes);
} }
totalStatsBytes = ste.in.getFilePointer() - startStatsFP; ste.in.readBytes(statBytes, 0, numBytes);
statsReader.reset(statBytes, 0, numBytes); statsReader.reset(statBytes, 0, numBytes);
statsSingletonRunLength = 0;
metaDataUpto = 0; metaDataUpto = 0;
state.termBlockOrd = 0; state.termBlockOrd = 0;
@ -473,15 +459,38 @@ final class SegmentTermsEnumFrame {
// TODO: if docFreq were bulk decoded we could // TODO: if docFreq were bulk decoded we could
// just skipN here: // just skipN here:
// stats if (version >= BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
state.docFreq = statsReader.readVInt(); if (statsSingletonRunLength > 0) {
//if (DEBUG) System.out.println(" dF=" + state.docFreq); state.docFreq = 1;
if (ste.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) { state.totalTermFreq = 1;
state.totalTermFreq = state.docFreq; // all postings have freq=1 statsSingletonRunLength--;
} else {
int token = statsReader.readVInt();
if ((token & 1) == 1) {
state.docFreq = 1;
state.totalTermFreq = 1;
statsSingletonRunLength = token >>> 1;
} else {
state.docFreq = token >>> 1;
if (ste.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
state.totalTermFreq = state.docFreq;
} else {
state.totalTermFreq = state.docFreq + statsReader.readVLong();
}
}
}
} else { } else {
state.totalTermFreq = state.docFreq + statsReader.readVLong(); assert statsSingletonRunLength == 0;
//if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); state.docFreq = statsReader.readVInt();
//if (DEBUG) System.out.println(" dF=" + state.docFreq);
if (ste.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
state.totalTermFreq = state.docFreq; // all postings have freq=1
} else {
state.totalTermFreq = state.docFreq + statsReader.readVLong();
//if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
}
} }
// metadata // metadata
ste.fr.parent.postingsReader.decodeTerm(bytesReader, ste.fr.fieldInfo, state, absolute); ste.fr.parent.postingsReader.decodeTerm(bytesReader, ste.fr.fieldInfo, state, absolute);

View File

@ -91,9 +91,6 @@ public class Stats {
* stores. */ * stores. */
public long totalBlockStatsBytes; public long totalBlockStatsBytes;
/** Total number of bytes used to store stats. */
public long totalUncompressedBlockStatsBytes;
/** Total bytes stored by the {@link PostingsReaderBase}, /** Total bytes stored by the {@link PostingsReaderBase},
* plus the other few vInts stored in the frame. */ * plus the other few vInts stored in the frame. */
public long totalBlockOtherBytes; public long totalBlockOtherBytes;
@ -130,9 +127,8 @@ public class Stats {
if (frame.suffixesReader != frame.suffixLengthsReader) { if (frame.suffixesReader != frame.suffixLengthsReader) {
totalUncompressedBlockSuffixBytes += frame.suffixLengthsReader.length(); totalUncompressedBlockSuffixBytes += frame.suffixLengthsReader.length();
} }
totalBlockStatsBytes += frame.totalStatsBytes; totalBlockStatsBytes += frame.statsReader.length();
compressionAlgorithms[frame.compressionAlg.code]++; compressionAlgorithms[frame.compressionAlg.code]++;
totalUncompressedBlockStatsBytes += frame.statsReader.length();
} }
void endBlock(SegmentTermsEnumFrame frame) { void endBlock(SegmentTermsEnumFrame frame) {
@ -149,7 +145,7 @@ public class Stats {
throw new IllegalStateException(); throw new IllegalStateException();
} }
endBlockCount++; endBlockCount++;
final long otherBytes = frame.fpEnd - frame.fp - frame.totalSuffixBytes - frame.totalStatsBytes; final long otherBytes = frame.fpEnd - frame.fp - frame.totalSuffixBytes - frame.statsReader.length();
assert otherBytes > 0 : "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd; assert otherBytes > 0 : "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd;
totalBlockOtherBytes += otherBytes; totalBlockOtherBytes += otherBytes;
} }
@ -202,8 +198,7 @@ public class Stats {
} }
out.println(" " + totalBlockSuffixBytes + " compressed term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.2f", ((double) totalBlockSuffixBytes)/totalUncompressedBlockSuffixBytes) + out.println(" " + totalBlockSuffixBytes + " compressed term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.2f", ((double) totalBlockSuffixBytes)/totalUncompressedBlockSuffixBytes) +
" compression ratio - compression count by algorithm: " + compressionCounts : "") + ")"); " compression ratio - compression count by algorithm: " + compressionCounts : "") + ")");
out.println(" " + totalUncompressedBlockStatsBytes + " term stats bytes before compression" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalUncompressedBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : "")); out.println(" " + totalBlockStatsBytes + " term stats bytes " + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : ""));
out.println(" " + totalBlockStatsBytes + " compressed term stats bytes (" + String.format(Locale.ROOT, "%.2f", (double)totalBlockStatsBytes / totalUncompressedBlockStatsBytes) + " compression ratio)");
out.println(" " + totalBlockOtherBytes + " other bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockOtherBytes)/totalBlockCount) + " other-bytes/block)" : "")); out.println(" " + totalBlockOtherBytes + " other bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockOtherBytes)/totalBlockCount) + " other-bytes/block)" : ""));
if (totalBlockCount != 0) { if (totalBlockCount != 0) {
out.println(" by prefix length:"); out.println(" by prefix length:");