LUCENE-9850: Use PFOR encoding for doc IDs (instead of FOR) (#69)

Co-authored-by: Greg Miller <gmiller@amazon.com>
Co-authored-by: Adrien Grand <jpountz@gmail.com>
2025-02-28 13:29:26 +00:00 · 2021-04-14 05:36:20 -07:00 · 2021-04-14 05:36:20 -07:00 · fbbdc62913
commit fbbdc62913
parent 0b1d8ccba6
9 changed files with 400 additions and 531 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -208,6 +208,8 @@ Improvements
 * LUCENE-9898: Removes no longer used scorePayload method from BM25Similarity
  (Pieter van Boxtel)

+* LUCENE-9850: Switch to PFOR encoding for doc IDs (instead of FOR). (Greg Miller)
+
 Bug fixes


--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/ForDeltaUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/ForDeltaUtil.java
@ -1,86 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.codecs.lucene90;
-
-import java.io.IOException;
-import org.apache.lucene.store.DataInput;
-import org.apache.lucene.store.DataOutput;
-import org.apache.lucene.util.packed.PackedInts;
-
-/** Utility class to encode/decode increasing sequences of 128 integers. */
-public class ForDeltaUtil {
-
-  // IDENTITY_PLUS_ONE[i] == i+1
-  private static final long[] IDENTITY_PLUS_ONE = new long[ForUtil.BLOCK_SIZE];
-
-  static {
-    for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
-      IDENTITY_PLUS_ONE[i] = i + 1;
-    }
-  }
-
-  private static void prefixSumOfOnes(long[] arr, long base) {
-    System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE);
-    // This loop gets auto-vectorized
-    for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
-      arr[i] += base;
-    }
-  }
-
-  private final ForUtil forUtil;
-
-  ForDeltaUtil(ForUtil forUtil) {
-    this.forUtil = forUtil;
-  }
-
-  /**
-   * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code
-   * longs} are expected to be deltas between consecutive values.
-   */
-  void encodeDeltas(long[] longs, DataOutput out) throws IOException {
-    if (longs[0] == 1 && PForUtil.allEqual(longs)) { // happens with very dense postings
-      out.writeByte((byte) 0);
-    } else {
-      long or = 0;
-      for (long l : longs) {
-        or |= l;
-      }
-      assert or != 0;
-      final int bitsPerValue = PackedInts.bitsRequired(or);
-      out.writeByte((byte) bitsPerValue);
-      forUtil.encode(longs, bitsPerValue, out);
-    }
-  }
-
-  /** Decode deltas, compute the prefix sum and add {@code base} to all decoded longs. */
-  void decodeAndPrefixSum(DataInput in, long base, long[] longs) throws IOException {
-    final int bitsPerValue = Byte.toUnsignedInt(in.readByte());
-    if (bitsPerValue == 0) {
-      prefixSumOfOnes(longs, base);
-    } else {
-      forUtil.decodeAndPrefixSum(bitsPerValue, in, base, longs);
-    }
-  }
-
-  /** Skip a sequence of 128 longs. */
-  void skip(DataInput in) throws IOException {
-    final int bitsPerValue = Byte.toUnsignedInt(in.readByte());
-    if (bitsPerValue != 0) {
-      in.skipBytes(forUtil.numBytes(bitsPerValue));
-    }
-  }
-}
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/ForUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/ForUtil.java
@ -132,94 +132,6 @@ final class ForUtil {
    }
  }

-  private static void prefixSum8(long[] arr, long base) {
-    expand8To32(arr);
-    prefixSum32(arr, base);
-  }
-
-  private static void prefixSum16(long[] arr, long base) {
-    // We need to move to the next primitive size to avoid overflows
-    expand16To32(arr);
-    prefixSum32(arr, base);
-  }
-
-  private static void prefixSum32(long[] arr, long base) {
-    arr[0] += base << 32;
-    innerPrefixSum32(arr);
-    expand32(arr);
-    final long l = arr[BLOCK_SIZE / 2 - 1];
-    for (int i = BLOCK_SIZE / 2; i < BLOCK_SIZE; ++i) {
-      arr[i] += l;
-    }
-  }
-
-  // For some reason unrolling seems to help
-  private static void innerPrefixSum32(long[] arr) {
-    arr[1] += arr[0];
-    arr[2] += arr[1];
-    arr[3] += arr[2];
-    arr[4] += arr[3];
-    arr[5] += arr[4];
-    arr[6] += arr[5];
-    arr[7] += arr[6];
-    arr[8] += arr[7];
-    arr[9] += arr[8];
-    arr[10] += arr[9];
-    arr[11] += arr[10];
-    arr[12] += arr[11];
-    arr[13] += arr[12];
-    arr[14] += arr[13];
-    arr[15] += arr[14];
-    arr[16] += arr[15];
-    arr[17] += arr[16];
-    arr[18] += arr[17];
-    arr[19] += arr[18];
-    arr[20] += arr[19];
-    arr[21] += arr[20];
-    arr[22] += arr[21];
-    arr[23] += arr[22];
-    arr[24] += arr[23];
-    arr[25] += arr[24];
-    arr[26] += arr[25];
-    arr[27] += arr[26];
-    arr[28] += arr[27];
-    arr[29] += arr[28];
-    arr[30] += arr[29];
-    arr[31] += arr[30];
-    arr[32] += arr[31];
-    arr[33] += arr[32];
-    arr[34] += arr[33];
-    arr[35] += arr[34];
-    arr[36] += arr[35];
-    arr[37] += arr[36];
-    arr[38] += arr[37];
-    arr[39] += arr[38];
-    arr[40] += arr[39];
-    arr[41] += arr[40];
-    arr[42] += arr[41];
-    arr[43] += arr[42];
-    arr[44] += arr[43];
-    arr[45] += arr[44];
-    arr[46] += arr[45];
-    arr[47] += arr[46];
-    arr[48] += arr[47];
-    arr[49] += arr[48];
-    arr[50] += arr[49];
-    arr[51] += arr[50];
-    arr[52] += arr[51];
-    arr[53] += arr[52];
-    arr[54] += arr[53];
-    arr[55] += arr[54];
-    arr[56] += arr[55];
-    arr[57] += arr[56];
-    arr[58] += arr[57];
-    arr[59] += arr[58];
-    arr[60] += arr[59];
-    arr[61] += arr[60];
-    arr[62] += arr[61];
-    arr[63] += arr[62];
-  }
-
  private final long[] tmp = new long[BLOCK_SIZE / 2];

  /** Encode 128 integers from {@code longs} into {@code out}. */
@ -299,7 +211,7 @@ final class ForUtil {
  }

  /** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. */
-  int numBytes(int bitsPerValue) throws IOException {
+  int numBytes(int bitsPerValue) {
    return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
  }

@ -513,109 +425,104 @@ final class ForUtil {
    }
  }

-  /** Delta-decode 128 integers into {@code longs}. */
-  void decodeAndPrefixSum(int bitsPerValue, DataInput in, long base, long[] longs)
-      throws IOException {
+  /**
+   * Decodes 128 integers into 64 {@code longs} such that each long contains two values, each
+   * represented with 32 bits. Values [0..63] are encoded in the high-order bits of {@code longs}
+   * [0..63], and values [64..127] are encoded in the low-order bits of {@code longs} [0..63]. This
+   * representation may allow subsequent operations to be performed on two values at a time.
+   */
+  void decodeTo32(int bitsPerValue, DataInput in, long[] longs) throws IOException {
    switch (bitsPerValue) {
      case 1:
        decode1(in, tmp, longs);
-        prefixSum8(longs, base);
+        expand8To32(longs);
        break;
      case 2:
        decode2(in, tmp, longs);
-        prefixSum8(longs, base);
+        expand8To32(longs);
        break;
      case 3:
        decode3(in, tmp, longs);
-        prefixSum8(longs, base);
+        expand8To32(longs);
        break;
      case 4:
        decode4(in, tmp, longs);
-        prefixSum8(longs, base);
+        expand8To32(longs);
        break;
      case 5:
        decode5(in, tmp, longs);
-        prefixSum8(longs, base);
+        expand8To32(longs);
        break;
      case 6:
        decode6(in, tmp, longs);
-        prefixSum8(longs, base);
+        expand8To32(longs);
        break;
      case 7:
        decode7(in, tmp, longs);
-        prefixSum8(longs, base);
+        expand8To32(longs);
        break;
      case 8:
        decode8(in, tmp, longs);
-        prefixSum8(longs, base);
+        expand8To32(longs);
        break;
      case 9:
        decode9(in, tmp, longs);
-        prefixSum16(longs, base);
+        expand16To32(longs);
        break;
      case 10:
        decode10(in, tmp, longs);
-        prefixSum16(longs, base);
+        expand16To32(longs);
        break;
      case 11:
        decode11(in, tmp, longs);
-        prefixSum16(longs, base);
+        expand16To32(longs);
        break;
      case 12:
        decode12(in, tmp, longs);
-        prefixSum16(longs, base);
+        expand16To32(longs);
        break;
      case 13:
        decode13(in, tmp, longs);
-        prefixSum16(longs, base);
+        expand16To32(longs);
        break;
      case 14:
        decode14(in, tmp, longs);
-        prefixSum16(longs, base);
+        expand16To32(longs);
        break;
      case 15:
        decode15(in, tmp, longs);
-        prefixSum16(longs, base);
+        expand16To32(longs);
        break;
      case 16:
        decode16(in, tmp, longs);
-        prefixSum16(longs, base);
+        expand16To32(longs);
        break;
      case 17:
        decode17(in, tmp, longs);
-        prefixSum32(longs, base);
        break;
      case 18:
        decode18(in, tmp, longs);
-        prefixSum32(longs, base);
        break;
      case 19:
        decode19(in, tmp, longs);
-        prefixSum32(longs, base);
        break;
      case 20:
        decode20(in, tmp, longs);
-        prefixSum32(longs, base);
        break;
      case 21:
        decode21(in, tmp, longs);
-        prefixSum32(longs, base);
        break;
      case 22:
        decode22(in, tmp, longs);
-        prefixSum32(longs, base);
        break;
      case 23:
        decode23(in, tmp, longs);
-        prefixSum32(longs, base);
        break;
      case 24:
        decode24(in, tmp, longs);
-        prefixSum32(longs, base);
        break;
      default:
        decodeSlow(bitsPerValue, in, tmp, longs);
-        prefixSum32(longs, base);
        break;
    }
  }
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsReader.java
@ -310,9 +310,7 @@ public final class Lucene90PostingsReader extends PostingsReaderBase {

  final class BlockDocsEnum extends PostingsEnum {

-    final ForUtil forUtil = new ForUtil();
-    final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(forUtil);
-    final PForUtil pforUtil = new PForUtil(forUtil);
+    final PForUtil pforUtil = new PForUtil(new ForUtil());

    private final long[] docBuffer = new long[BLOCK_SIZE + 1];
    private final long[] freqBuffer = new long[BLOCK_SIZE];
@ -458,7 +456,7 @@ public final class Lucene90PostingsReader extends PostingsReaderBase {
      assert left >= 0;

      if (left >= BLOCK_SIZE) {
-        forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
+        pforUtil.decodeAndPrefixSum(docIn, accum, docBuffer);

        if (indexHasFreq) {
          if (needsFreq) {
@ -569,9 +567,7 @@ public final class Lucene90PostingsReader extends PostingsReaderBase {
  // Also handles payloads + offsets
  final class EverythingEnum extends PostingsEnum {

-    final ForUtil forUtil = new ForUtil();
-    final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(forUtil);
-    final PForUtil pforUtil = new PForUtil(forUtil);
+    final PForUtil pforUtil = new PForUtil(new ForUtil());

    private final long[] docBuffer = new long[BLOCK_SIZE + 1];
    private final long[] freqBuffer = new long[BLOCK_SIZE + 1];
@ -759,7 +755,7 @@ public final class Lucene90PostingsReader extends PostingsReaderBase {
      assert left >= 0;

      if (left >= BLOCK_SIZE) {
-        forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
+        pforUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
        pforUtil.decode(docIn, freqBuffer);
        blockUpto += BLOCK_SIZE;
      } else if (docFreq == 1) {
@ -1055,9 +1051,7 @@ public final class Lucene90PostingsReader extends PostingsReaderBase {

  final class BlockImpactsDocsEnum extends ImpactsEnum {

-    final ForUtil forUtil = new ForUtil();
-    final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(forUtil);
-    final PForUtil pforUtil = new PForUtil(forUtil);
+    final PForUtil pforUtil = new PForUtil(new ForUtil());

    private final long[] docBuffer = new long[BLOCK_SIZE + 1];
    private final long[] freqBuffer = new long[BLOCK_SIZE];
@ -1151,7 +1145,7 @@ public final class Lucene90PostingsReader extends PostingsReaderBase {
      assert left >= 0;

      if (left >= BLOCK_SIZE) {
-        forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
+        pforUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
        if (indexHasFreqs) {
          pforUtil.decode(docIn, freqBuffer);
        }
@ -1250,9 +1244,7 @@ public final class Lucene90PostingsReader extends PostingsReaderBase {

  final class BlockImpactsPostingsEnum extends ImpactsEnum {

-    final ForUtil forUtil = new ForUtil();
-    final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(forUtil);
-    final PForUtil pforUtil = new PForUtil(forUtil);
+    final PForUtil pforUtil = new PForUtil(new ForUtil());

    private final long[] docBuffer = new long[BLOCK_SIZE];
    private final long[] freqBuffer = new long[BLOCK_SIZE];
@ -1364,7 +1356,7 @@ public final class Lucene90PostingsReader extends PostingsReaderBase {
      assert left >= 0;

      if (left >= BLOCK_SIZE) {
-        forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
+        pforUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
        pforUtil.decode(docIn, freqBuffer);
      } else {
        readVIntBlock(docIn, docBuffer, freqBuffer, left, true);
@ -1544,9 +1536,7 @@ public final class Lucene90PostingsReader extends PostingsReaderBase {

  final class BlockImpactsEverythingEnum extends ImpactsEnum {

-    final ForUtil forUtil = new ForUtil();
-    final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(forUtil);
-    final PForUtil pforUtil = new PForUtil(forUtil);
+    final PForUtil pforUtil = new PForUtil(new ForUtil());

    private final long[] docBuffer = new long[BLOCK_SIZE];
    private final long[] freqBuffer = new long[BLOCK_SIZE];
@ -1755,7 +1745,7 @@ public final class Lucene90PostingsReader extends PostingsReaderBase {
      assert left >= 0;

      if (left >= BLOCK_SIZE) {
-        forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
+        pforUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
        if (indexHasFreq) {
          isFreqsRead =
              false; // freq block will be loaded lazily when necessary, we don't load it here
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsWriter.java
@ -91,7 +91,6 @@ public final class Lucene90PostingsWriter extends PushPostingsWriterBase {
  private int docCount;

  private final PForUtil pforUtil;
-  private final ForDeltaUtil forDeltaUtil;
  private final Lucene90SkipWriter skipWriter;

  private boolean fieldHasNorms;
@ -120,9 +119,7 @@ public final class Lucene90PostingsWriter extends PushPostingsWriterBase {
      } else {
        throw new Error();
      }
-      final ForUtil forUtil = new ForUtil();
-      forDeltaUtil = new ForDeltaUtil(forUtil);
-      pforUtil = new PForUtil(forUtil);
+      pforUtil = new PForUtil(new ForUtil());
      if (state.fieldInfos.hasProx()) {
        posDeltaBuffer = new long[BLOCK_SIZE];
        String posFileName =
@ -252,7 +249,7 @@ public final class Lucene90PostingsWriter extends PushPostingsWriterBase {
    docCount++;

    if (docBufferUpto == BLOCK_SIZE) {
-      forDeltaUtil.encodeDeltas(docDeltaBuffer, docOut);
+      pforUtil.encode(docDeltaBuffer, docOut);
      if (writeFreqs) {
        pforUtil.encode(freqBuffer, docOut);
      }
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/PForUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/PForUtil.java
@ -25,6 +25,18 @@ import org.apache.lucene.util.packed.PackedInts;
 /** Utility class to encode sequences of 128 small positive integers. */
 final class PForUtil {

+  private static final int MAX_EXCEPTIONS = 7; // decoded from the top 3 bits of the token byte
+  private static final int HALF_BLOCK_SIZE = ForUtil.BLOCK_SIZE / 2;
+
+  // IDENTITY_PLUS_ONE[i] == i + 1; copied out as the prefix sum of a block of all-one deltas
+  private static final long[] IDENTITY_PLUS_ONE = new long[ForUtil.BLOCK_SIZE];
+
+  static {
+    for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
+      IDENTITY_PLUS_ONE[i] = i + 1;
+    }
+  }
+
  static boolean allEqual(long[] l) {
    for (int i = 1; i < ForUtil.BLOCK_SIZE; ++i) {
      if (l[i] != l[0]) {
@ -35,6 +47,9 @@ final class PForUtil {
  }

  private final ForUtil forUtil;
+  // buffer for reading exception data; each exception uses two bytes (pos + high-order bits of the
+  // exception)
+  private final byte[] exceptionBuff = new byte[MAX_EXCEPTIONS * 2];

  PForUtil(ForUtil forUtil) {
    assert ForUtil.BLOCK_SIZE <= 256 : "blocksize must fit in one byte. got " + ForUtil.BLOCK_SIZE;
@ -43,24 +58,25 @@ final class PForUtil {

  /** Encode 128 integers from {@code longs} into {@code out}. */
  void encode(long[] longs, DataOutput out) throws IOException {
-    // At most 7 exceptions
-    final long[] top8 = new long[8];
-    Arrays.fill(top8, -1L);
+    // Determine the top MAX_EXCEPTIONS + 1 values
+    final long[] top = new long[MAX_EXCEPTIONS + 1];
+    Arrays.fill(top, -1L);
    for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
-      if (longs[i] > top8[0]) {
-        top8[0] = longs[i];
+      if (longs[i] > top[0]) {
+        top[0] = longs[i];
        Arrays.sort(
-            top8); // For only 8 entries we just sort on every iteration instead of maintaining a PQ
+            top); // For only a small number of entries we just sort on every iteration instead of
+        // maintaining a PQ
      }
    }

-    final int maxBitsRequired = PackedInts.bitsRequired(top8[7]);
+    final int maxBitsRequired = PackedInts.bitsRequired(top[MAX_EXCEPTIONS]);
    // We store the patch on a byte, so we can't decrease the number of bits required by more than 8
-    final int patchedBitsRequired = Math.max(PackedInts.bitsRequired(top8[0]), maxBitsRequired - 8);
+    final int patchedBitsRequired = Math.max(PackedInts.bitsRequired(top[0]), maxBitsRequired - 8);
    int numExceptions = 0;
    final long maxUnpatchedValue = (1L << patchedBitsRequired) - 1;
    for (int i = 1; i < 8; ++i) {
-      if (top8[i] > maxUnpatchedValue) {
+      if (top[i] > maxUnpatchedValue) {
        numExceptions++;
      }
    }
@ -109,6 +125,40 @@ final class PForUtil {
    }
  }

+  /** Reads 128 deltas from {@code in}; fills {@code longs} with {@code base} + prefix sums. */
+  void decodeAndPrefixSum(DataInput in, long base, long[] longs) throws IOException {
+    final int token = Byte.toUnsignedInt(in.readByte());
+    final int bitsPerValue = token & 0x1f;
+    final int numExceptions = token >>> 5;
+    if (numExceptions == 0) {
+      // no exceptions to patch, so an all-equal block can be filled in directly
+      if (bitsPerValue == 0) {
+        // a bpv of zero means all 128 deltas are equal; the shared delta follows as a vlong
+        long val = in.readVLong();
+        if (val == 1) {
+          // all-one deltas (consecutive doc IDs) are expected to be common, so they get a
+          // dedicated path that copies a precomputed table
+          prefixSumOfOnes(longs, base);
+        } else {
+          prefixSumOf(longs, base, val);
+        }
+      } else {
+        // decode into the packed two-values-per-long layout, then prefix-sum it
+        forUtil.decodeTo32(bitsPerValue, in, longs);
+        prefixSum32(longs, base);
+      }
+    } else {
+      // exceptions are patched into the packed two-values-per-long layout before prefix summing
+      if (bitsPerValue == 0) {
+        fillSameValue32(longs, in.readVLong());
+      } else {
+        forUtil.decodeTo32(bitsPerValue, in, longs);
+      }
+      applyExceptions32(bitsPerValue, numExceptions, in, longs);
+      prefixSum32(longs, base);
+    }
+  }
+
  /** Skip 128 integers. */
  void skip(DataInput in) throws IOException {
    final int token = Byte.toUnsignedInt(in.readByte());
@ -121,4 +171,146 @@ final class PForUtil {
      in.skipBytes(forUtil.numBytes(bitsPerValue) + (numExceptions << 1));
    }
  }
+
+  /**
+   * Fill {@code longs} with the final values for the case of all deltas being 1. Note this assumes
+   * there are no exceptions to apply.
+   */
+  private static void prefixSumOfOnes(long[] longs, long base) {
+    System.arraycopy(IDENTITY_PLUS_ONE, 0, longs, 0, ForUtil.BLOCK_SIZE);
+    // This loop gets auto-vectorized
+    for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
+      longs[i] += base;
+    }
+  }
+
+  /**
+   * Writes {@code base + val}, {@code base + 2 * val}, ... into {@code longs}: the prefix sum of a
+   * block whose deltas all equal {@code val}. Assumes there are no exceptions to apply.
+   */
+  private static void prefixSumOf(long[] longs, long base, long val) {
+    for (int k = 1; k <= ForUtil.BLOCK_SIZE; ++k) {
+      longs[k - 1] = base + k * val;
+    }
+  }
+
+  /**
+   * Fills the {@code longs} with the provided {@code val}, packed two values per long (using 32
+   * bits per value).
+   */
+  private static void fillSameValue32(long[] longs, long val) {
+    final long token = val << 32 | val;
+    Arrays.fill(longs, 0, HALF_BLOCK_SIZE, token);
+  }
+
+  /** Apply the exceptions where the values are packed two-per-long in {@code longs}. */
+  private void applyExceptions32(int bitsPerValue, int numExceptions, DataInput in, long[] longs)
+      throws IOException {
+    in.readBytes(exceptionBuff, 0, numExceptions * 2);
+    for (int i = 0; i < numExceptions; ++i) {
+      final int exceptionPos = Byte.toUnsignedInt(exceptionBuff[i * 2]);
+      final long exception = Byte.toUnsignedLong(exceptionBuff[i * 2 + 1]);
+      // note that we pack two values per long, so the index is [0..63] for 128 values
+      final int idx = exceptionPos & 0x3f; // mod 64
+      // we need to shift by 1) the bpv, and 2) 32 for positions [0..63] (and no 32 shift for
+      // [64..127])
+      final int shift = bitsPerValue + ((1 ^ (exceptionPos >>> 6)) << 5);
+      longs[idx] |= exception << shift;
+    }
+  }
+
+  /** Apply prefix sum logic where the values are packed two-per-long in {@code longs}. */
+  private static void prefixSum32(long[] longs, long base) {
+    longs[0] += base << 32;
+    innerPrefixSum32(longs);
+    expand32(longs);
+    final long l = longs[HALF_BLOCK_SIZE - 1];
+    for (int i = HALF_BLOCK_SIZE; i < ForUtil.BLOCK_SIZE; ++i) {
+      longs[i] += l;
+    }
+  }
+
+  /**
+   * Expand the values packed two-per-long in {@code longs} into individual long values stored back
+   * into {@code longs}: high 32 bits stay at index i, low 32 bits go to i + HALF_BLOCK_SIZE.
+   */
+  private static void expand32(long[] longs) {
+    for (int i = 0; i < HALF_BLOCK_SIZE; ++i) {
+      final long l = longs[i];
+      longs[i] = l >>> 32;
+      longs[HALF_BLOCK_SIZE + i] = l & 0xFFFFFFFFL;
+    }
+  }
+
+  /**
+   * Unrolled "inner" prefix sum logic where the values are packed two-per-long in {@code longs}.
+   * After this method, the final values will be correct for all high-order bits (values [0..63])
+   * but a final prefix loop will still need to run to "correct" the values of [64..127] in the
+   * low-order bits, which need the 64th value added to all of them.
+   */
+  private static void innerPrefixSum32(long[] longs) {
+    longs[1] += longs[0];
+    longs[2] += longs[1];
+    longs[3] += longs[2];
+    longs[4] += longs[3];
+    longs[5] += longs[4];
+    longs[6] += longs[5];
+    longs[7] += longs[6];
+    longs[8] += longs[7];
+    longs[9] += longs[8];
+    longs[10] += longs[9];
+    longs[11] += longs[10];
+    longs[12] += longs[11];
+    longs[13] += longs[12];
+    longs[14] += longs[13];
+    longs[15] += longs[14];
+    longs[16] += longs[15];
+    longs[17] += longs[16];
+    longs[18] += longs[17];
+    longs[19] += longs[18];
+    longs[20] += longs[19];
+    longs[21] += longs[20];
+    longs[22] += longs[21];
+    longs[23] += longs[22];
+    longs[24] += longs[23];
+    longs[25] += longs[24];
+    longs[26] += longs[25];
+    longs[27] += longs[26];
+    longs[28] += longs[27];
+    longs[29] += longs[28];
+    longs[30] += longs[29];
+    longs[31] += longs[30];
+    longs[32] += longs[31];
+    longs[33] += longs[32];
+    longs[34] += longs[33];
+    longs[35] += longs[34];
+    longs[36] += longs[35];
+    longs[37] += longs[36];
+    longs[38] += longs[37];
+    longs[39] += longs[38];
+    longs[40] += longs[39];
+    longs[41] += longs[40];
+    longs[42] += longs[41];
+    longs[43] += longs[42];
+    longs[44] += longs[43];
+    longs[45] += longs[44];
+    longs[46] += longs[45];
+    longs[47] += longs[46];
+    longs[48] += longs[47];
+    longs[49] += longs[48];
+    longs[50] += longs[49];
+    longs[51] += longs[50];
+    longs[52] += longs[51];
+    longs[53] += longs[52];
+    longs[54] += longs[53];
+    longs[55] += longs[54];
+    longs[56] += longs[55];
+    longs[57] += longs[56];
+    longs[58] += longs[57];
+    longs[59] += longs[58];
+    longs[60] += longs[59];
+    longs[61] += longs[60];
+    longs[62] += longs[61];
+    longs[63] += longs[62];
+  }
 }
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/gen_ForUtil.py
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/gen_ForUtil.py
@ -40,10 +40,9 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-package org.apache.lucene.codecs.lucene84;
+package org.apache.lucene.codecs.lucene90;

 import java.io.IOException;
-
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;

@ -85,13 +84,13 @@ final class ForUtil {
    for (int i = 0; i < 16; ++i) {
      long l = arr[i];
      arr[i] = (l >>> 56) & 0xFFL;
-      arr[16+i] = (l >>> 48) & 0xFFL;
-      arr[32+i] = (l >>> 40) & 0xFFL;
-      arr[48+i] = (l >>> 32) & 0xFFL;
-      arr[64+i] = (l >>> 24) & 0xFFL;
-      arr[80+i] = (l >>> 16) & 0xFFL;
-      arr[96+i] = (l >>> 8) & 0xFFL;
-      arr[112+i] = l & 0xFFL;
+      arr[16 + i] = (l >>> 48) & 0xFFL;
+      arr[32 + i] = (l >>> 40) & 0xFFL;
+      arr[48 + i] = (l >>> 32) & 0xFFL;
+      arr[64 + i] = (l >>> 24) & 0xFFL;
+      arr[80 + i] = (l >>> 16) & 0xFFL;
+      arr[96 + i] = (l >>> 8) & 0xFFL;
+      arr[112 + i] = l & 0xFFL;
    }
  }

@ -99,15 +98,23 @@ final class ForUtil {
    for (int i = 0; i < 16; ++i) {
      long l = arr[i];
      arr[i] = (l >>> 24) & 0x000000FF000000FFL;
-      arr[16+i] = (l >>> 16) & 0x000000FF000000FFL;
-      arr[32+i] = (l >>> 8) & 0x000000FF000000FFL;
-      arr[48+i] = l & 0x000000FF000000FFL;
+      arr[16 + i] = (l >>> 16) & 0x000000FF000000FFL;
+      arr[32 + i] = (l >>> 8) & 0x000000FF000000FFL;
+      arr[48 + i] = l & 0x000000FF000000FFL;
    }
  }

  private static void collapse8(long[] arr) {
    for (int i = 0; i < 16; ++i) {
-      arr[i] = (arr[i] << 56) | (arr[16+i] << 48) | (arr[32+i] << 40) | (arr[48+i] << 32) | (arr[64+i] << 24) | (arr[80+i] << 16) | (arr[96+i] << 8) | arr[112+i];
+      arr[i] =
+          (arr[i] << 56)
+              | (arr[16 + i] << 48)
+              | (arr[32 + i] << 40)
+              | (arr[48 + i] << 32)
+              | (arr[64 + i] << 24)
+              | (arr[80 + i] << 16)
+              | (arr[96 + i] << 8)
+              | arr[112 + i];
    }
  }

@ -115,9 +122,9 @@ final class ForUtil {
    for (int i = 0; i < 32; ++i) {
      long l = arr[i];
      arr[i] = (l >>> 48) & 0xFFFFL;
-      arr[32+i] = (l >>> 32) & 0xFFFFL;
-      arr[64+i] = (l >>> 16) & 0xFFFFL;
-      arr[96+i] = l & 0xFFFFL;
+      arr[32 + i] = (l >>> 32) & 0xFFFFL;
+      arr[64 + i] = (l >>> 16) & 0xFFFFL;
+      arr[96 + i] = l & 0xFFFFL;
    }
  }

@ -125,13 +132,13 @@ final class ForUtil {
    for (int i = 0; i < 32; ++i) {
      long l = arr[i];
      arr[i] = (l >>> 16) & 0x0000FFFF0000FFFFL;
-      arr[32+i] = l & 0x0000FFFF0000FFFFL;
+      arr[32 + i] = l & 0x0000FFFF0000FFFFL;
    }
  }

  private static void collapse16(long[] arr) {
    for (int i = 0; i < 32; ++i) {
-      arr[i] = (arr[i] << 48) | (arr[32+i] << 32) | (arr[64+i] << 16) | arr[96+i];
+      arr[i] = (arr[i] << 48) | (arr[32 + i] << 32) | (arr[64 + i] << 16) | arr[96 + i];
    }
  }

@ -145,103 +152,13 @@ final class ForUtil {

  private static void collapse32(long[] arr) {
    for (int i = 0; i < 64; ++i) {
-      arr[i] = (arr[i] << 32) | arr[64+i];
+      arr[i] = (arr[i] << 32) | arr[64 + i];
    }
  }

-  private static void prefixSum8(long[] arr, long base) {
-    expand8To32(arr);
-    prefixSum32(arr, base);
-  }
+  private final long[] tmp = new long[BLOCK_SIZE / 2];

-  private static void prefixSum16(long[] arr, long base) {
-    // We need to move to the next primitive size to avoid overflows
-    expand16To32(arr);
-    prefixSum32(arr, base);
-  }
-
-  private static void prefixSum32(long[] arr, long base) {
-    arr[0] += base << 32;
-    innerPrefixSum32(arr);
-    expand32(arr);
-    final long l = arr[BLOCK_SIZE/2-1];
-    for (int i = BLOCK_SIZE/2; i < BLOCK_SIZE; ++i) {
-      arr[i] += l;
-    }
-  }
-
-  // For some reason unrolling seems to help
-  private static void innerPrefixSum32(long[] arr) {
-    arr[1] += arr[0];
-    arr[2] += arr[1];
-    arr[3] += arr[2];
-    arr[4] += arr[3];
-    arr[5] += arr[4];
-    arr[6] += arr[5];
-    arr[7] += arr[6];
-    arr[8] += arr[7];
-    arr[9] += arr[8];
-    arr[10] += arr[9];
-    arr[11] += arr[10];
-    arr[12] += arr[11];
-    arr[13] += arr[12];
-    arr[14] += arr[13];
-    arr[15] += arr[14];
-    arr[16] += arr[15];
-    arr[17] += arr[16];
-    arr[18] += arr[17];
-    arr[19] += arr[18];
-    arr[20] += arr[19];
-    arr[21] += arr[20];
-    arr[22] += arr[21];
-    arr[23] += arr[22];
-    arr[24] += arr[23];
-    arr[25] += arr[24];
-    arr[26] += arr[25];
-    arr[27] += arr[26];
-    arr[28] += arr[27];
-    arr[29] += arr[28];
-    arr[30] += arr[29];
-    arr[31] += arr[30];
-    arr[32] += arr[31];
-    arr[33] += arr[32];
-    arr[34] += arr[33];
-    arr[35] += arr[34];
-    arr[36] += arr[35];
-    arr[37] += arr[36];
-    arr[38] += arr[37];
-    arr[39] += arr[38];
-    arr[40] += arr[39];
-    arr[41] += arr[40];
-    arr[42] += arr[41];
-    arr[43] += arr[42];
-    arr[44] += arr[43];
-    arr[45] += arr[44];
-    arr[46] += arr[45];
-    arr[47] += arr[46];
-    arr[48] += arr[47];
-    arr[49] += arr[48];
-    arr[50] += arr[49];
-    arr[51] += arr[50];
-    arr[52] += arr[51];
-    arr[53] += arr[52];
-    arr[54] += arr[53];
-    arr[55] += arr[54];
-    arr[56] += arr[55];
-    arr[57] += arr[56];
-    arr[58] += arr[57];
-    arr[59] += arr[58];
-    arr[60] += arr[59];
-    arr[61] += arr[60];
-    arr[62] += arr[61];
-    arr[63] += arr[62];
-  }
-
-  private final long[] tmp = new long[BLOCK_SIZE/2];
-
-  /**
-   * Encode 128 integers from {@code longs} into {@code out}.
-   */
+  /** Encode 128 integers from {@code longs} into {@code out}. */
  void encode(long[] longs, int bitsPerValue, DataOutput out) throws IOException {
    final int nextPrimitive;
    final int numLongs;
@ -310,20 +227,20 @@ final class ForUtil {
    }

    for (int i = 0; i < numLongsPerShift; ++i) {
-      // Java longs are big endian and we want to read little endian longs, so we need to reverse bytes
+      // Java longs are big endian and we want to read little endian longs, so we need to reverse
+      // bytes
      long l = Long.reverseBytes(tmp[i]);
      out.writeLong(l);
    }
  }

-  /**
-   * Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value.
-   */
-  int numBytes(int bitsPerValue) throws IOException {
+  /** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. */
+  int numBytes(int bitsPerValue) {
    return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
  }

-  private static void decodeSlow(int bitsPerValue, DataInput in, long[] tmp, long[] longs) throws IOException {
+  private static void decodeSlow(int bitsPerValue, DataInput in, long[] tmp, long[] longs)
+      throws IOException {
    final int numLongs = bitsPerValue << 1;
    in.readLELongs(tmp, 0, numLongs);
    final long mask = MASKS32[bitsPerValue];
@ -345,7 +262,7 @@ final class ForUtil {
        l |= (tmp[tmpIdx++] & mask32RemainingBitsPerLong) << b;
      }
      if (b > 0) {
-        l |= (tmp[tmpIdx] >>> (remainingBitsPerLong-b)) & MASKS32[b];
+        l |= (tmp[tmpIdx] >>> (remainingBitsPerLong - b)) & MASKS32[b];
        remainingBits = remainingBitsPerLong - b;
      } else {
        remainingBits = remainingBitsPerLong;
@ -355,13 +272,12 @@ final class ForUtil {
  }

  /**
-   * The pattern that this shiftLongs method applies is recognized by the C2
-   * compiler, which generates SIMD instructions for it in order to shift
-   * multiple longs at once.
+   * The pattern that this shiftLongs method applies is recognized by the C2 compiler, which
+   * generates SIMD instructions for it in order to shift multiple longs at once.
   */
  private static void shiftLongs(long[] a, int count, long[] b, int bi, int shift, long mask) {
    for (int i = 0; i < count; ++i) {
-      b[bi+i] = (a[i] >>> shift) & mask;
+      b[bi + i] = (a[i] >>> shift) & mask;
    }
  }

@ -375,19 +291,18 @@ def writeRemainderWithSIMDOptimize(bpv, next_primitive, remaining_bits_per_long,
    num_values /= 2
    iteration *= 2

-
  f.write('    shiftLongs(tmp, %d, tmp, 0, 0, MASK%d_%d);\n' % (iteration * num_longs, next_primitive, remaining_bits_per_long))
  f.write('    for (int iter = 0, tmpIdx = 0, longsIdx = %d; iter < %d; ++iter, tmpIdx += %d, longsIdx += %d) {\n' %(o, iteration, num_longs, num_values))
  tmp_idx = 0
  b = bpv
  b -= remaining_bits_per_long
-  f.write('      long l0 = tmp[tmpIdx+%d] << %d;\n' %(tmp_idx, b))
+  f.write('      long l0 = tmp[tmpIdx + %d] << %d;\n' %(tmp_idx, b))
  tmp_idx += 1
  while b >= remaining_bits_per_long:
    b -= remaining_bits_per_long
-    f.write('      l0 |= tmp[tmpIdx+%d] << %d;\n' %(tmp_idx, b))
+    f.write('      l0 |= tmp[tmpIdx + %d] << %d;\n' %(tmp_idx, b))
    tmp_idx += 1
-  f.write('      longs[longsIdx+0] = l0;\n')
+  f.write('      longs[longsIdx + 0] = l0;\n')
  f.write('    }\n')
  

@ -406,22 +321,21 @@ def writeRemainder(bpv, next_primitive, remaining_bits_per_long, o, num_values,
    b = bpv
    if remaining_bits == 0:
      b -= remaining_bits_per_long
-      f.write('      long l%d = (tmp[tmpIdx+%d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits_per_long, b))
+      f.write('      long l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits_per_long, b))
    else:
      b -= remaining_bits
-      f.write('      long l%d = (tmp[tmpIdx+%d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b))
+      f.write('      long l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b))
    tmp_idx += 1
    while b >= remaining_bits_per_long:
      b -= remaining_bits_per_long
-      f.write('      l%d |= (tmp[tmpIdx+%d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits_per_long, b))
+      f.write('      l%d |= (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits_per_long, b))
      tmp_idx += 1
    if b > 0:
-      f.write('      l%d |= (tmp[tmpIdx+%d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_long-b, next_primitive, b))
+      f.write('      l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_long-b, next_primitive, b))
      remaining_bits = remaining_bits_per_long-b
-    f.write('      longs[longsIdx+%d] = l%d;\n' %(i, i))
+    f.write('      longs[longsIdx + %d] = l%d;\n' %(i, i))
  f.write('    }\n')
  
-  

 def writeDecode(bpv, f):
  next_primitive = 32
@ -447,30 +361,31 @@ def writeDecode(bpv, f):
      else:
        writeRemainder(bpv, next_primitive, shift + bpv, o, 128/num_values_per_long - o, f)
  f.write('  }\n')
-  f.write('\n')
+

 if __name__ == '__main__':
  f = open(OUTPUT_FILE, 'w')
  f.write(HEADER)
  for primitive_size in PRIMITIVE_SIZE:
    f.write('  private static final long[] MASKS%d = new long[%d];\n' %(primitive_size, primitive_size))
+  f.write('\n')
  f.write('  static {\n')
  for primitive_size in PRIMITIVE_SIZE:
    f.write('    for (int i = 0; i < %d; ++i) {\n' %primitive_size)
    f.write('      MASKS%d[i] = mask%d(i);\n' %(primitive_size, primitive_size))
    f.write('    }\n')
-  f.write('  }\n')
-  f.write('  //mark values in array as final longs to avoid the cost of reading array, arrays should only be used when the idx is a variable\n')
+  f.write('  }')
+  f.write("""
+  // mark values in array as final longs to avoid the cost of reading array, arrays should only be
+  // used when the idx is a variable
+""")
  for primitive_size in PRIMITIVE_SIZE:
    for bpv in range(1, min(MAX_SPECIALIZED_BITS_PER_VALUE + 1, primitive_size)):
      if bpv * 2 != primitive_size or primitive_size == 8:
        f.write('  private static final long MASK%d_%d = MASKS%d[%d];\n' %(primitive_size, bpv, primitive_size, bpv))
-  f.write('\n')

  f.write("""
-  /**
-   * Decode 128 integers into {@code longs}.
-   */
+  /** Decode 128 integers into {@code longs}. */
  void decode(int bitsPerValue, DataInput in, long[] longs) throws IOException {
    switch (bitsPerValue) {
 """)
@ -480,43 +395,48 @@ if __name__ == '__main__':
      next_primitive = 8
    elif bpv <= 16:
      next_primitive = 16
-    f.write('    case %d:\n' %bpv)
-    f.write('      decode%d(in, tmp, longs);\n' %bpv)
-    f.write('      expand%d(longs);\n' %next_primitive)
-    f.write('      break;\n')
-  f.write('    default:\n')
-  f.write('      decodeSlow(bitsPerValue, in, tmp, longs);\n')
-  f.write('      expand32(longs);\n')
-  f.write('      break;\n')
+    f.write('      case %d:\n' %bpv)
+    f.write('        decode%d(in, tmp, longs);\n' %bpv)
+    f.write('        expand%d(longs);\n' %next_primitive)
+    f.write('        break;\n')
+  f.write('      default:\n')
+  f.write('        decodeSlow(bitsPerValue, in, tmp, longs);\n')
+  f.write('        expand32(longs);\n')
+  f.write('        break;\n')
  f.write('    }\n')
  f.write('  }\n')

  f.write("""
  /**
-   * Delta-decode 128 integers into {@code longs}.
+   * Decodes 128 integers into 64 {@code longs} such that each long contains two values, each
+   * represented with 32 bits. Values [0..63] are encoded in the high-order bits of {@code longs}
+   * [0..63], and values [64..127] are encoded in the low-order bits of {@code longs} [0..63]. This
+   * representation may allow subsequent operations to be performed on two values at a time.
   */
-  void decodeAndPrefixSum(int bitsPerValue, DataInput in, long base, long[] longs) throws IOException {
+  void decodeTo32(int bitsPerValue, DataInput in, long[] longs) throws IOException {
    switch (bitsPerValue) {
 """)
  for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
-    next_primitive = 32 
+    next_primitive = 32
    if bpv <= 8:
-      next_primitive = 8 
+      next_primitive = 8
    elif bpv <= 16:
      next_primitive = 16
-    f.write('    case %d:\n' %bpv)
-    f.write('      decode%d(in, tmp, longs);\n' %bpv)
-    f.write('      prefixSum%d(longs, base);\n' %next_primitive)
-    f.write('      break;\n')
-  f.write('    default:\n')
-  f.write('      decodeSlow(bitsPerValue, in, tmp, longs);\n')
-  f.write('      prefixSum32(longs, base);\n')
-  f.write('      break;\n')
+    f.write('      case %d:\n' %bpv)
+    f.write('        decode%d(in, tmp, longs);\n' %bpv)
+    if next_primitive <= 16:
+      f.write('        expand%dTo32(longs);\n' %next_primitive)
+    f.write('        break;\n')
+  f.write('      default:\n')
+  f.write('        decodeSlow(bitsPerValue, in, tmp, longs);\n')
+  f.write('        break;\n')
  f.write('    }\n')
  f.write('  }\n')

  f.write('\n')
  for i in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
    writeDecode(i, f)
+    if i < MAX_SPECIALIZED_BITS_PER_VALUE:
+      f.write('\n')

  f.write('}\n')
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestForDeltaUtil.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestForDeltaUtil.java
@ -1,93 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.codecs.lucene90;
-
-import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
-import java.io.IOException;
-import java.util.Arrays;
-import org.apache.lucene.store.ByteBuffersDirectory;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IOContext;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.packed.PackedInts;
-
-public class TestForDeltaUtil extends LuceneTestCase {
-
-  public void testEncodeDecode() throws IOException {
-    final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000);
-    final int[] values = new int[iterations * ForUtil.BLOCK_SIZE];
-
-    for (int i = 0; i < iterations; ++i) {
-      final int bpv = TestUtil.nextInt(random(), 1, 31 - 7);
-      for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
-        values[i * ForUtil.BLOCK_SIZE + j] =
-            RandomNumbers.randomIntBetween(random(), 1, (int) PackedInts.maxValue(bpv));
-      }
-    }
-
-    final Directory d = new ByteBuffersDirectory();
-    final long endPointer;
-
-    {
-      // encode
-      IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
-      final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(new ForUtil());
-
-      for (int i = 0; i < iterations; ++i) {
-        long[] source = new long[ForUtil.BLOCK_SIZE];
-        for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
-          source[j] = values[i * ForUtil.BLOCK_SIZE + j];
-        }
-        forDeltaUtil.encodeDeltas(source, out);
-      }
-      endPointer = out.getFilePointer();
-      out.close();
-    }
-
-    {
-      // decode
-      IndexInput in = d.openInput("test.bin", IOContext.READONCE);
-      final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(new ForUtil());
-      for (int i = 0; i < iterations; ++i) {
-        if (random().nextInt(5) == 0) {
-          forDeltaUtil.skip(in);
-          continue;
-        }
-        long base = 0;
-        final long[] restored = new long[ForUtil.BLOCK_SIZE];
-        forDeltaUtil.decodeAndPrefixSum(in, base, restored);
-        final long[] expected = new long[ForUtil.BLOCK_SIZE];
-        for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
-          expected[j] = values[i * ForUtil.BLOCK_SIZE + j];
-          if (j > 0) {
-            expected[j] += expected[j - 1];
-          } else {
-            expected[j] += base;
-          }
-        }
-        assertArrayEquals(Arrays.toString(restored), expected, restored);
-      }
-      assertEquals(endPointer, in.getFilePointer());
-      in.close();
-    }
-
-    d.close();
-  }
-}
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestPForUtil.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestPForUtil.java
@ -33,68 +33,108 @@ public class TestPForUtil extends LuceneTestCase {

  public void testEncodeDecode() throws IOException {
    final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000);
+    final int[] values = createTestData(iterations, 31);
+
+    final Directory d = new ByteBuffersDirectory();
+    final long endPointer = encodeTestData(iterations, values, d);
+
+    IndexInput in = d.openInput("test.bin", IOContext.READONCE);
+    final PForUtil pforUtil = new PForUtil(new ForUtil());
+    for (int i = 0; i < iterations; ++i) {
+      if (random().nextInt(5) == 0) {
+        pforUtil.skip(in);
+        continue;
+      }
+      final long[] restored = new long[ForUtil.BLOCK_SIZE];
+      pforUtil.decode(in, restored);
+      int[] ints = new int[ForUtil.BLOCK_SIZE];
+      for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
+        ints[j] = Math.toIntExact(restored[j]);
+      }
+      assertArrayEquals(
+          Arrays.toString(ints),
+          ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE),
+          ints);
+    }
+    assertEquals(endPointer, in.getFilePointer());
+    in.close();
+
+    d.close();
+  }
+
+  public void testDeltaEncodeDecode() throws IOException {
+    final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000);
+    // Cap at 31 - 7 = 24 bpv so the prefix sum cannot overflow: summing 128 (2^7) deltas of at
+    // most 24 bits each yields a final value that still fits in 31 bits.
+    final int[] values = createTestData(iterations, 31 - 7);
+
+    final Directory d = new ByteBuffersDirectory();
+    final long endPointer = encodeTestData(iterations, values, d);
+
+    IndexInput in = d.openInput("test.bin", IOContext.READONCE);
+    final PForUtil pForUtil = new PForUtil(new ForUtil());
+    for (int i = 0; i < iterations; ++i) {
+      if (random().nextInt(5) == 0) {
+        pForUtil.skip(in);
+        continue;
+      }
+      long base = 0;
+      final long[] restored = new long[ForUtil.BLOCK_SIZE];
+      pForUtil.decodeAndPrefixSum(in, base, restored);
+      final long[] expected = new long[ForUtil.BLOCK_SIZE];
+      for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
+        expected[j] = values[i * ForUtil.BLOCK_SIZE + j];
+        if (j > 0) {
+          expected[j] += expected[j - 1];
+        } else {
+          expected[j] += base;
+        }
+      }
+      assertArrayEquals(Arrays.toString(restored), expected, restored);
+    }
+    assertEquals(endPointer, in.getFilePointer());
+    in.close();
+
+    d.close();
+  }
+
+  private int[] createTestData(int iterations, int maxBpv) {
    final int[] values = new int[iterations * ForUtil.BLOCK_SIZE];

    for (int i = 0; i < iterations; ++i) {
-      final int bpv = TestUtil.nextInt(random(), 0, 31);
+      final int bpv = TestUtil.nextInt(random(), 0, maxBpv);
      for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
        values[i * ForUtil.BLOCK_SIZE + j] =
            RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv));
        if (random().nextInt(100) == 0) {
          final int exceptionBpv;
          if (random().nextInt(10) == 0) {
-            exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 9, 16), 31);
+            exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 9, 16), maxBpv);
          } else {
-            exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 1, 8), 31);
+            exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 1, 8), maxBpv);
          }
          values[i * ForUtil.BLOCK_SIZE + j] |= random().nextInt(1 << (exceptionBpv - bpv)) << bpv;
        }
      }
    }

-    final Directory d = new ByteBuffersDirectory();
-    final long endPointer;
+    return values;
+  }

-    {
-      // encode
-      IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
-      final PForUtil pforUtil = new PForUtil(new ForUtil());
+  private long encodeTestData(int iterations, int[] values, Directory d) throws IOException {
+    IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
+    final PForUtil pforUtil = new PForUtil(new ForUtil());

-      for (int i = 0; i < iterations; ++i) {
-        long[] source = new long[ForUtil.BLOCK_SIZE];
-        for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
-          source[j] = values[i * ForUtil.BLOCK_SIZE + j];
-        }
-        pforUtil.encode(source, out);
+    for (int i = 0; i < iterations; ++i) {
+      long[] source = new long[ForUtil.BLOCK_SIZE];
+      for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
+        source[j] = values[i * ForUtil.BLOCK_SIZE + j];
      }
-      endPointer = out.getFilePointer();
-      out.close();
+      pforUtil.encode(source, out);
    }
+    final long endPointer = out.getFilePointer();
+    out.close();

-    {
-      // decode
-      IndexInput in = d.openInput("test.bin", IOContext.READONCE);
-      final PForUtil pforUtil = new PForUtil(new ForUtil());
-      for (int i = 0; i < iterations; ++i) {
-        if (random().nextInt(5) == 0) {
-          pforUtil.skip(in);
-          continue;
-        }
-        final long[] restored = new long[ForUtil.BLOCK_SIZE];
-        pforUtil.decode(in, restored);
-        int[] ints = new int[ForUtil.BLOCK_SIZE];
-        for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
-          ints[j] = Math.toIntExact(restored[j]);
-        }
-        assertArrayEquals(
-            Arrays.toString(ints),
-            ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE),
-            ints);
-      }
-      assertEquals(endPointer, in.getFilePointer());
-      in.close();
-    }
-
-    d.close();
+    return endPointer;
  }
 }