Write MSB VLong for better outputs sharing in block tree index (#12631)

2023-10-10 01:00:21 -05:00 · 2023-10-10 01:00:21 -05:00 · 4f01de2a2d
parent e20e245f47
commit 4f01de2a2d
7 changed files with 107 additions and 9 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -178,6 +178,9 @@ Optimizations

 * GITHUB#12623: Use a MergeSorter taking advantage of extra storage for StableMSBRadixSorter. (Guo Feng)

+* GITHUB#12631: Write MSB VLong for better outputs sharing in block tree index, decreasing ~14% size
+  of .tip file. (Guo Feng)
+
 Changes in runtime behavior
 ---------------------

--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java
@ -16,12 +16,15 @@
 */
 package org.apache.lucene.codecs.lucene90.blocktree;

+import static org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT;
+
 import java.io.IOException;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
@ -82,7 +85,7 @@ public final class FieldReader extends Terms {
    // + rootCode + " divisor=" + indexDivisor);
    // }
    rootBlockFP =
-        (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
+        readVLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length))
            >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
    // Initialize FST always off-heap.
    final IndexInput clone = indexIn.clone();
@ -99,6 +102,32 @@ public final class FieldReader extends Terms {
    */
  }

+  long readVLongOutput(DataInput in) throws IOException { // picks the vLong decoder by version
+    if (parent.version >= VERSION_MSB_VLONG_OUTPUT) { // format version that uses MSB order
+      return readMSBVLong(in);
+    } else {
+      return in.readVLong(); // lower versions: fall back to DataInput's own vLong decoding
+    }
+  }
+
+  /**
+   * Decodes a variable length byte[] in MSB order back to long, as written by {@link
+   * Lucene90BlockTreeTermsWriter#writeMSBVLong}.
+   *
+   * <p>Bytes are consumed one at a time, most significant 7-bit group first: the low 7 bits of
+   * each byte are appended to the result, and a byte whose high bit (0x80) is clear ends the
+   * value.
+   *
+   * <p>NOTE(review): the loop has no cap on the number of bytes consumed, so input with a long run
+   * of continuation bytes is not rejected here; earlier bits are simply shifted out.
+   *
+   * <p>Package private for testing.
+   *
+   * @param in input positioned at the first byte of the encoded value
+   * @return the decoded value
+   * @throws IOException if {@code in.readByte()} throws
+   */
+  static long readMSBVLong(DataInput in) throws IOException {
+    long l = 0L;
+    while (true) {
+      byte b = in.readByte();
+      l = (l << 7) | (b & 0x7FL); // make room for, then append, this byte's 7 payload bits
+      if ((b & 0x80) == 0) { // continuation bit clear: this was the last byte
+        break;
+      }
+    }
+    return l;
+  }
+
  @Override
  public BytesRef getMin() throws IOException {
    if (minTerm == null) {
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java
@ -146,7 +146,7 @@ final class IntersectTermsEnumFrame {
      floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
      // Skip first long -- has redundant fp, hasTerms
      // flag, isFloor flag
-      final long code = floorDataReader.readVLong();
+      final long code = ite.fr.readVLongOutput(floorDataReader);
      if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
        // Floor frame
        numFollowFloorBlocks = floorDataReader.readVInt();
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java
@ -81,8 +81,13 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
  /** Initial terms format. */
  public static final int VERSION_START = 0;

+  /**
+   * Version that encodes output as MSB VLong for better outputs sharing in FST, see GITHUB#12620.
+   */
+  public static final int VERSION_MSB_VLONG_OUTPUT = 1;
+
  /** Current terms format. */
-  public static final int VERSION_CURRENT = VERSION_START;
+  public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;

  /** Extension of terms index file */
  static final String TERMS_INDEX_EXTENSION = "tip";
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java
@ -430,6 +430,25 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
    return brToString(new BytesRef(b));
  }

+  /**
+   * Encodes long value to variable length byte[], in MSB order. Use {@link
+   * FieldReader#readMSBVLong} to decode.
+   *
+   * <p>The value is split into 7-bit groups that are written most significant group first. Every
+   * byte except the last has its high bit (0x80) set; the last byte has it clear. Only as many
+   * groups as the value's significant bits require are written, and zero is written as a single
+   * 0x00 byte.
+   *
+   * <p>Package private for testing
+   *
+   * @param l the value to encode; asserted to be non-negative
+   * @param scratchBytes the output that receives the encoded bytes
+   * @throws IOException if writing to {@code scratchBytes} throws
+   */
+  static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException {
+    assert l >= 0;
+    // Keep zero bits on most significant byte to have more chance to get prefix bytes shared.
+    // e.g. we expect 0x7FFF stored as [0x81, 0xFF, 0x7F] but not [0xFF, 0xFF, 0x40]
+    final int bytesNeeded = (Long.SIZE - Long.numberOfLeadingZeros(l) - 1) / 7 + 1;
+    l <<= Long.SIZE - bytesNeeded * 7; // left-align so the first 7-bit group sits in bits 63..57
+    for (int i = 1; i < bytesNeeded; i++) { // all groups but the last get the 0x80 flag
+      scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL) | 0x80));
+      l = l << 7; // bring the next group up to bits 63..57
+    }
+    scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL))); // last group: high bit left clear
+  }
+
  private static final class PendingBlock extends PendingEntry {
    public final BytesRef prefix;
    public final long fp;
@ -472,10 +491,8 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {

      assert scratchBytes.size() == 0;

-      // TODO: try writing the leading vLong in MSB order
-      // (opposite of what Lucene does today), for better
-      // outputs sharing in the FST
-      scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
+      // write the leading vLong in MSB order for better outputs sharing in the FST
+      writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes);
      if (isFloor) {
        scratchBytes.writeVInt(blocks.size() - 1);
        for (int i = 1; i < blocks.size(); i++) {
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java
@ -236,7 +236,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
  SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
      throws IOException {
    scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
-    final long code = scratchReader.readVLong();
+    final long code = fr.readVLongOutput(scratchReader);
    final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
    final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
    f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
@ -980,7 +980,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
          } else if (isSeekFrame && !f.isFloor) {
            final ByteArrayDataInput reader =
                new ByteArrayDataInput(output.bytes, output.offset, output.length);
-            final long codeOrig = reader.readVLong();
+            final long codeOrig = fr.readVLongOutput(reader);
            final long code =
                (f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
                    | (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java
@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90.blocktree;
+
+import java.io.IOException;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.util.ArrayUtil;
+
+public class TestMSBVLong extends LuceneTestCase { // write/read round-trip checks for MSB vLong

+  public void testMSBVLong() throws IOException {
+    assertMSBVLong(Long.MAX_VALUE); // largest value accepted by the writer's non-negative assert
+    int iter = atLeast(10000);
+    for (long i = 0; i < iter; i++) { // every value in [0, iter), including zero
+      assertMSBVLong(i);
+    }
+  }

+  private static void assertMSBVLong(long l) throws IOException { // encode l, decode, compare
+    byte[] bytes = new byte[10]; // scratch buffer; a non-negative long encodes to at most 9 bytes
+    ByteArrayDataOutput output = new ByteArrayDataOutput(bytes);
+    Lucene90BlockTreeTermsWriter.writeMSBVLong(l, output);
+    ByteArrayDataInput in =
+        new ByteArrayDataInput(ArrayUtil.copyOfSubArray(bytes, 0, output.getPosition()));
+    long recovered = FieldReader.readMSBVLong(in);
+    assertEquals(l + " != " + recovered, l, recovered);
+  }
+}