Write MSB VLong for better outputs sharing in block tree index (#12631)

2023-10-10 01:00:21 -05:00 · 2023-10-10 01:00:21 -05:00 · 4f01de2a2d
parent e20e245f47
commit 4f01de2a2d
7 changed files with 107 additions and 9 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -178,6 +178,9 @@ Optimizations
 * GITHUB#12623: Use a MergeSorter taking advantage of extra storage for StableMSBRadixSorter. (Guo Feng)
 * GITHUB#12631: Write MSB VLong for better outputs sharing in block tree index, decreasing ~14% size
  of .tip file. (Guo Feng)
 Changes in runtime behavior
 ---------------------
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java
@ -16,12 +16,15 @@
 */
 package org.apache.lucene.codecs.lucene90.blocktree;
 import static org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT;
 import java.io.IOException;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
@ -82,7 +85,7 @@ public final class FieldReader extends Terms {
    // + rootCode + " divisor=" + indexDivisor);
    // }
    rootBlockFP =
-        (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
+        readVLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length))
            >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
    // Initialize FST always off-heap.
    final IndexInput clone = indexIn.clone();
@ -99,6 +102,32 @@ public final class FieldReader extends Terms {
    */
  }
  /**
   * Reads the leading variable-length long from {@code in}, picking the decoder from the version
   * recorded on the parent reader: MSB order when the version is at least {@code
   * VERSION_MSB_VLONG_OUTPUT}, otherwise the plain {@link DataInput#readVLong()} layout.
   *
   * @param in input positioned at the start of the encoded value
   * @return the decoded value
   * @throws IOException if reading from {@code in} fails
   */
  long readVLongOutput(DataInput in) throws IOException {
    return parent.version >= VERSION_MSB_VLONG_OUTPUT ? readMSBVLong(in) : in.readVLong();
  }
  /**
   * Reads a long that was serialized most-significant group first, 7 payload bits per byte, as
   * produced by {@link Lucene90BlockTreeTermsWriter#writeMSBVLong}. A set high bit on a byte means
   * another byte follows; the first byte with a clear high bit ends the value.
   *
   * <p>Package private for testing.
   *
   * @param in input positioned at the first encoded byte
   * @return the decoded value
   * @throws IOException if reading from {@code in} fails
   */
  static long readMSBVLong(DataInput in) throws IOException {
    long value = 0L;
    byte current;
    do {
      current = in.readByte();
      // Make room for the next 7-bit group and append it below what was read so far.
      value = (value << 7) | (current & 0x7FL);
    } while ((current & 0x80) != 0);
    return value;
  }
  @Override
  public BytesRef getMin() throws IOException {
    if (minTerm == null) {
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java
@ -146,7 +146,7 @@ final class IntersectTermsEnumFrame {
      floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
      // Skip first long -- has redundant fp, hasTerms
      // flag, isFloor flag
-      final long code = floorDataReader.readVLong();
+      final long code = ite.fr.readVLongOutput(floorDataReader);
      if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
        // Floor frame
        numFollowFloorBlocks = floorDataReader.readVInt();
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java
@ -81,8 +81,13 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
  /** Initial terms format. */
  public static final int VERSION_START = 0;
  /**
   * Version that encodes outputs as MSB VLong for better outputs sharing in FST, see GITHUB#12620.
   */
  public static final int VERSION_MSB_VLONG_OUTPUT = 1;
  /** Current terms format. */
-  public static final int VERSION_CURRENT = VERSION_START;
+  public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;
  /** Extension of terms index file */
  static final String TERMS_INDEX_EXTENSION = "tip";
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java
@ -430,6 +430,25 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
    return brToString(new BytesRef(b));
  }
  /**
   * Serializes a non-negative long as a variable-length byte sequence, most-significant 7-bit
   * group first. Every byte except the last has its high bit set. Use {@link
   * FieldReader#readMSBVLong} to decode.
   *
   * <p>Package private for testing
   *
   * @param l value to encode; asserted to be non-negative
   * @param scratchBytes destination the encoded bytes are appended to
   * @throws IOException if writing to {@code scratchBytes} fails
   */
  static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException {
    assert l >= 0;
    // Padding zero bits go into the most significant byte so that equal high-order prefixes
    // yield equal leading bytes, e.g. 0x7FFF becomes [0x81, 0xFF, 0x7F] rather than
    // [0xFF, 0xFF, 0x40].
    final int significantBits = Long.SIZE - Long.numberOfLeadingZeros(l);
    final int bytesNeeded = (significantBits - 1) / 7 + 1;
    // Left-align the payload so each 7-bit group can be taken from the top of the word.
    long aligned = l << (Long.SIZE - bytesNeeded * 7);
    int remaining = bytesNeeded;
    while (--remaining > 0) {
      // An unsigned shift by 57 leaves exactly 7 bits; the high bit flags a following byte.
      scratchBytes.writeByte((byte) ((aligned >>> 57) | 0x80));
      aligned <<= 7;
    }
    // Final group: high bit clear marks the end of the value.
    scratchBytes.writeByte((byte) (aligned >>> 57));
  }
  private static final class PendingBlock extends PendingEntry {
    public final BytesRef prefix;
    public final long fp;
@ -472,10 +491,8 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
      assert scratchBytes.size() == 0;
-      // TODO: try writing the leading vLong in MSB order
+      // write the leading vLong in MSB order for better outputs sharing in the FST
-      // (opposite of what Lucene does today), for better
+      writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes);
-      // outputs sharing in the FST
-      scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
      if (isFloor) {
        scratchBytes.writeVInt(blocks.size() - 1);
        for (int i = 1; i < blocks.size(); i++) {
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java
@ -236,7 +236,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
  SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
      throws IOException {
    scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
-    final long code = scratchReader.readVLong();
+    final long code = fr.readVLongOutput(scratchReader);
    final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
    final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
    f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
@ -980,7 +980,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
          } else if (isSeekFrame && !f.isFloor) {
            final ByteArrayDataInput reader =
                new ByteArrayDataInput(output.bytes, output.offset, output.length);
-            final long codeOrig = reader.readVLong();
+            final long codeOrig = fr.readVLongOutput(reader);
            final long code =
                (f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
                    | (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java
@ -0,0 +1,44 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.codecs.lucene90.blocktree;
 import java.io.IOException;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.util.ArrayUtil;
 /**
  * Round-trip tests for the MSB-first variable-length long encoding written by {@link
  * Lucene90BlockTreeTermsWriter#writeMSBVLong} and read by {@link FieldReader#readMSBVLong}.
  */
 public class TestMSBVLong extends LuceneTestCase {
  /** Round-trips {@code Long.MAX_VALUE} and a dense run of small values starting at zero. */
  public void testMSBVLong() throws IOException {
    assertMSBVLong(Long.MAX_VALUE);
    int iter = atLeast(10000);
    for (long i = 0; i < iter; i++) {
      assertMSBVLong(i);
    }
  }
  /**
   * Round-trips the values just below, at and just above every power of two, so that every
   * encoded length and every 7-bit group boundary is exercised, not only the short encodings
   * reached by the dense small-value loop.
   */
  public void testMSBVLongBoundaries() throws IOException {
    // 1L << 62 is the largest power of two whose successor is still non-negative.
    for (int bits = 0; bits < Long.SIZE - 1; bits++) {
      final long power = 1L << bits;
      assertMSBVLong(power - 1);
      assertMSBVLong(power);
      assertMSBVLong(power + 1);
    }
  }
  /**
   * Encodes {@code l}, decodes the written bytes, and checks both that the value is recovered
   * and that the decoder consumed exactly the bytes that were written.
   */
  private static void assertMSBVLong(long l) throws IOException {
    byte[] bytes = new byte[10];
    ByteArrayDataOutput output = new ByteArrayDataOutput(bytes);
    Lucene90BlockTreeTermsWriter.writeMSBVLong(l, output);
    ByteArrayDataInput in =
        new ByteArrayDataInput(ArrayUtil.copyOfSubArray(bytes, 0, output.getPosition()));
    long recovered = FieldReader.readMSBVLong(in);
    assertEquals(l + " != " + recovered, l, recovered);
    // A decoder that stops early or runs long would leave the input mis-positioned.
    assertTrue("decoder did not consume all bytes written for " + l, in.eof());
  }
 }