Improve checksum calculations (#13989)

Take advantage of the existing buffer in BufferedChecksum to speed up
checksum updates when reading longs, ints, shorts and long arrays: whole
values are written into the buffer instead of being fed byte by byte.
This commit is contained in:
Jean-François BOEUF 2024-11-25 15:59:15 +01:00 committed by Adrien Grand
parent d9c3bc875b
commit 71715b59e8
6 changed files with 210 additions and 8 deletions

View File

@ -79,6 +79,8 @@ Optimizations
* GITHUB#13999: CombinedFieldQuery now returns non-infinite maximum scores, * GITHUB#13999: CombinedFieldQuery now returns non-infinite maximum scores,
making it eligible to dynamic pruning. (Adrien Grand) making it eligible to dynamic pruning. (Adrien Grand)
* GITHUB#13989: Faster checksum computation. (Jean-François Boeuf)
Bug Fixes Bug Fixes
--------------------- ---------------------
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended * GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended

View File

@ -196,9 +196,7 @@ public class FuzzySet implements Accountable {
int bloomSize = in.readInt(); int bloomSize = in.readInt();
int numLongs = in.readInt(); int numLongs = in.readInt();
long[] longs = new long[numLongs]; long[] longs = new long[numLongs];
for (int i = 0; i < numLongs; i++) { in.readLongs(longs, 0, numLongs);
longs[i] = in.readLong();
}
FixedBitSet bits = new FixedBitSet(longs, bloomSize + 1); FixedBitSet bits = new FixedBitSet(longs, bloomSize + 1);
return new FuzzySet(bits, bloomSize, hashCount); return new FuzzySet(bits, bloomSize, hashCount);
} }

View File

@ -101,9 +101,7 @@ public final class Lucene90LiveDocsFormat extends LiveDocsFormat {
private FixedBitSet readFixedBitSet(IndexInput input, int length) throws IOException { private FixedBitSet readFixedBitSet(IndexInput input, int length) throws IOException {
long[] data = new long[FixedBitSet.bits2words(length)]; long[] data = new long[FixedBitSet.bits2words(length)];
for (int i = 0; i < data.length; i++) { input.readLongs(data, 0, data.length);
data[i] = input.readLong();
}
return new FixedBitSet(data, length); return new FixedBitSet(data, length);
} }

View File

@ -16,7 +16,11 @@
*/ */
package org.apache.lucene.store; package org.apache.lucene.store;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.LongBuffer;
import java.util.zip.Checksum; import java.util.zip.Checksum;
import org.apache.lucene.util.BitUtil;
/** Wraps another {@link Checksum} with an internal buffer to speed up checksum calculations. */ /** Wraps another {@link Checksum} with an internal buffer to speed up checksum calculations. */
public class BufferedChecksum implements Checksum { public class BufferedChecksum implements Checksum {
@ -60,6 +64,45 @@ public class BufferedChecksum implements Checksum {
} }
} }
/**
 * Adds the bytes of {@code val} to the pending checksum input by storing the value in the
 * internal buffer through {@code BitUtil.VH_LE_SHORT}, instead of one byte at a time.
 *
 * @param val the value to append to the buffered checksum input
 */
void updateShort(short val) {
  // Fewer than Short.BYTES free bytes remain: call flush() before writing.
  // NOTE(review): flush() is defined outside this view; it is assumed to make room in the buffer.
  if (buffer.length - upto < Short.BYTES) {
    flush();
  }
  BitUtil.VH_LE_SHORT.set(buffer, upto, val);
  upto += Short.BYTES;
}
/**
 * Adds the bytes of {@code val} to the pending checksum input by storing the value in the
 * internal buffer through {@code BitUtil.VH_LE_INT}, instead of one byte at a time.
 *
 * @param val the value to append to the buffered checksum input
 */
void updateInt(int val) {
  // Fewer than Integer.BYTES free bytes remain: call flush() before writing.
  // NOTE(review): flush() is defined outside this view; it is assumed to make room in the buffer.
  if (buffer.length - upto < Integer.BYTES) {
    flush();
  }
  BitUtil.VH_LE_INT.set(buffer, upto, val);
  upto += Integer.BYTES;
}
/**
 * Adds the bytes of {@code val} to the pending checksum input by storing the value in the
 * internal buffer through {@code BitUtil.VH_LE_LONG}, instead of one byte at a time.
 *
 * @param val the value to append to the buffered checksum input
 */
void updateLong(long val) {
  // Fewer than Long.BYTES free bytes remain: call flush() before writing.
  // NOTE(review): flush() is defined outside this view; it is assumed to make room in the buffer.
  if (buffer.length - upto < Long.BYTES) {
    flush();
  }
  BitUtil.VH_LE_LONG.set(buffer, upto, val);
  upto += Long.BYTES;
}
/**
 * Adds {@code len} longs from {@code vals}, starting at {@code offset}, to the pending checksum
 * input.
 *
 * <p>Works in two phases. If the buffer already holds bytes, as many whole longs as still fit
 * are appended one by one with {@link #updateLong(long)}. Whatever is left is then copied in
 * buffer-sized chunks through a little-endian {@link LongBuffer} view of the buffer, calling
 * {@code flush()} before each chunk.
 *
 * @param vals source array
 * @param offset index of the first long to add
 * @param len number of longs to add
 */
void updateLongs(long[] vals, int offset, int len) {
  if (upto > 0) {
    // Top up the partially filled buffer; only whole longs that fit are written here, so
    // updateLong never needs to flush during this phase.
    int fitting = Math.min((buffer.length - upto) / Long.BYTES, len);
    while (fitting-- > 0) {
      updateLong(vals[offset++]);
      len--;
    }
    if (len == 0) {
      return;
    }
  }

  final int longsPerBuffer = buffer.length / Long.BYTES;
  final LongBuffer view = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN).asLongBuffer();
  while (len > 0) {
    // Each chunk is written from index 0 of the view, then upto is advanced by the chunk size.
    // NOTE(review): this relies on flush() (not shown here) leaving upto at 0 - confirm.
    flush();
    final int chunk = Math.min(longsPerBuffer, len);
    view.put(0, vals, offset, chunk);
    upto += chunk * Long.BYTES;
    offset += chunk;
    len -= chunk;
  }
}
@Override @Override
public long getValue() { public long getValue() {
flush(); flush();

View File

@ -18,14 +18,13 @@ package org.apache.lucene.store;
import java.io.IOException; import java.io.IOException;
import java.util.zip.CRC32; import java.util.zip.CRC32;
import java.util.zip.Checksum;
/** /**
* Simple implementation of {@link ChecksumIndexInput} that wraps another input and delegates calls. * Simple implementation of {@link ChecksumIndexInput} that wraps another input and delegates calls.
*/ */
public class BufferedChecksumIndexInput extends ChecksumIndexInput { public class BufferedChecksumIndexInput extends ChecksumIndexInput {
final IndexInput main; final IndexInput main;
final Checksum digest; final BufferedChecksum digest;
/** Creates a new BufferedChecksumIndexInput */ /** Creates a new BufferedChecksumIndexInput */
public BufferedChecksumIndexInput(IndexInput main) { public BufferedChecksumIndexInput(IndexInput main) {
@ -47,6 +46,33 @@ public class BufferedChecksumIndexInput extends ChecksumIndexInput {
digest.update(b, offset, len); digest.update(b, offset, len);
} }
/**
 * Reads a short from {@code main} and passes it to {@code digest.updateShort} before returning
 * it, so the value reaches the checksum as a single unit.
 */
@Override
public short readShort() throws IOException {
  final short value = main.readShort();
  digest.updateShort(value);
  return value;
}
/**
 * Reads an int from {@code main} and passes it to {@code digest.updateInt} before returning it,
 * so the value reaches the checksum as a single unit.
 */
@Override
public int readInt() throws IOException {
  final int value = main.readInt();
  digest.updateInt(value);
  return value;
}
/**
 * Reads a long from {@code main} and passes it to {@code digest.updateLong} before returning
 * it, so the value reaches the checksum as a single unit.
 */
@Override
public long readLong() throws IOException {
  final long value = main.readLong();
  digest.updateLong(value);
  return value;
}
/**
 * Bulk-reads {@code length} longs from {@code main} into {@code dst} starting at {@code
 * offset}, then hands that same range to {@code digest.updateLongs}.
 */
@Override
public void readLongs(long[] dst, int offset, int length) throws IOException {
  // Fill the destination first; the digest is updated from the values that were just read.
  main.readLongs(dst, offset, length);
  digest.updateLongs(dst, offset, length);
}
@Override @Override
public long getChecksum() { public long getChecksum() {
return digest.getValue(); return digest.getValue();

View File

@ -16,9 +16,13 @@
*/ */
package org.apache.lucene.store; package org.apache.lucene.store;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.LongBuffer;
import java.util.zip.CRC32; import java.util.zip.CRC32;
import java.util.zip.Checksum; import java.util.zip.Checksum;
import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.BitUtil;
public class TestBufferedChecksum extends LuceneTestCase { public class TestBufferedChecksum extends LuceneTestCase {
@ -63,4 +67,135 @@ public class TestBufferedChecksum extends LuceneTestCase {
} }
assertEquals(c1.getValue(), c2.getValue()); assertEquals(c1.getValue(), c2.getValue());
} }
/**
 * For many random 4096-byte inputs, computes the CRC32 directly and checks that a {@link
 * BufferedChecksum} wrapping a CRC32 yields the same value when the bytes are fed as shorts,
 * ints, longs, byte chunks and long chunks.
 */
public void testDifferentInputTypes() {
  final Checksum reference = new CRC32();
  final BufferedChecksum buffered = new BufferedChecksum(new CRC32());
  final int rounds = atLeast(1000);
  for (int round = 0; round < rounds; round++) {
    final byte[] input = new byte[4096];
    random().nextBytes(input);

    // Expected value for this input, taken from the plain CRC32.
    reference.update(input);
    final long expected = reference.getValue();
    reference.reset();

    updateByShorts(expected, buffered, input);
    updateByInts(expected, buffered, input);
    updateByLongs(expected, buffered, input);
    updateByChunkOfBytes(expected, buffered, input);
    updateByChunkOfLongs(expected, buffered, input);
  }
}
/**
 * Feeds {@code input} to {@code checksum} byte by byte, as one array, and as random splits that
 * mix array ranges with single bytes, asserting the expected value after each variant.
 */
private void updateByChunkOfBytes(long expected, BufferedChecksum checksum, byte[] input) {
  // Variant 1: one byte at a time.
  for (byte b : input) {
    checksum.update(b);
  }
  checkChecksumValueAndReset(expected, checksum);

  // Variant 2: the whole array in one call.
  checksum.update(input);
  checkChecksumValueAndReset(expected, checksum);

  final int rounds = atLeast(10);
  for (int round = 0; round < rounds; round++) {
    // Variant 3: two array ranges split at a random point.
    final int len0 = random().nextInt(input.length / 2);
    checksum.update(input, 0, len0);
    checksum.update(input, len0, input.length - len0);
    checkChecksumValueAndReset(expected, checksum);

    // Variant 4: array range, then a run of single bytes, then the remaining range.
    checksum.update(input, 0, len0);
    final int len1 = random().nextInt(input.length / 4);
    final int singleBytesEnd = len0 + len1;
    for (int pos = len0; pos < singleBytesEnd; pos++) {
      checksum.update(input[pos]);
    }
    checksum.update(input, singleBytesEnd, input.length - len1 - len0);
    checkChecksumValueAndReset(expected, checksum);
  }
}
/**
 * Feeds a random-length byte prefix, then as many shorts as fit (decoded from {@code input} with
 * {@code BitUtil.VH_LE_SHORT}), then the leftover bytes, and asserts the expected checksum.
 */
private void updateByShorts(long expected, BufferedChecksum checksum, byte[] input) {
  final int lastStart = input.length - Short.BYTES;
  int pos = shiftArray(checksum, input);
  for (; pos <= lastStart; pos += Short.BYTES) {
    final short value = (short) BitUtil.VH_LE_SHORT.get(input, pos);
    checksum.updateShort(value);
  }
  // Bytes that did not make up a whole short.
  checksum.update(input, pos, input.length - pos);
  checkChecksumValueAndReset(expected, checksum);
}
/**
 * Feeds a random-length byte prefix, then as many ints as fit (decoded from {@code input} with
 * {@code BitUtil.VH_LE_INT}), then the leftover bytes, and asserts the expected checksum.
 */
private void updateByInts(long expected, BufferedChecksum checksum, byte[] input) {
  final int lastStart = input.length - Integer.BYTES;
  int pos = shiftArray(checksum, input);
  for (; pos <= lastStart; pos += Integer.BYTES) {
    final int value = (int) BitUtil.VH_LE_INT.get(input, pos);
    checksum.updateInt(value);
  }
  // Bytes that did not make up a whole int.
  checksum.update(input, pos, input.length - pos);
  checkChecksumValueAndReset(expected, checksum);
}
/**
 * Feeds a random-length byte prefix, then as many longs as fit (decoded from {@code input} with
 * {@code BitUtil.VH_LE_LONG}), then the leftover bytes, and asserts the expected checksum.
 */
private void updateByLongs(long expected, BufferedChecksum checksum, byte[] input) {
  final int lastStart = input.length - Long.BYTES;
  int pos = shiftArray(checksum, input);
  for (; pos <= lastStart; pos += Long.BYTES) {
    final long value = (long) BitUtil.VH_LE_LONG.get(input, pos);
    checksum.updateLong(value);
  }
  // Bytes that did not make up a whole long.
  checksum.update(input, pos, input.length - pos);
  checkChecksumValueAndReset(expected, checksum);
}
/**
 * Feeds a random-length prefix of {@code input} (shorter than a quarter of the array) to {@code
 * checksum} as plain bytes, so that the multi-byte updates that follow start at an arbitrary
 * buffer position.
 *
 * @return the number of bytes consumed, i.e. the index where the caller should continue
 */
private static int shiftArray(BufferedChecksum checksum, byte[] input) {
  final int prefixLength = random().nextInt(input.length / 4);
  checksum.update(input, 0, prefixLength);
  return prefixLength;
}
/**
 * Exercises {@code updateLongs} against {@code updateLong} and plain byte updates.
 *
 * <p>The input is split into a random byte prefix, a run of little-endian longs decoded from the
 * bytes after the prefix, and a byte tail. Each variant feeds those three parts in a different
 * way and asserts the expected checksum.
 */
private void updateByChunkOfLongs(long expected, BufferedChecksum checksum, byte[] input) {
  final int prefix = random().nextInt(input.length / 4);
  // Bytes left over after the longs. This equals (input.length - prefix) % Long.BYTES only when
  // input.length is a multiple of Long.BYTES, as with the 4096-byte arrays the caller builds.
  final int tail = (Long.BYTES - prefix) & 7;
  final int tailStart = input.length - tail;

  final long[] longInput = new long[(input.length - prefix) / Long.BYTES];
  ByteBuffer.wrap(input)
      .position(prefix)
      .order(ByteOrder.LITTLE_ENDIAN)
      .asLongBuffer()
      .get(longInput);

  // Variant 1: longs fed one at a time.
  checksum.update(input, 0, prefix);
  for (long value : longInput) {
    checksum.updateLong(value);
  }
  checksum.update(input, tailStart, tail);
  checkChecksumValueAndReset(expected, checksum);

  // Variant 2: all longs in a single bulk call.
  checksum.update(input, 0, prefix);
  checksum.updateLongs(longInput, 0, longInput.length);
  checksum.update(input, tailStart, tail);
  checkChecksumValueAndReset(expected, checksum);

  final int rounds = atLeast(10);
  for (int round = 0; round < rounds; round++) {
    // Variant 3: two bulk calls split at a random index.
    final int len0 = random().nextInt(longInput.length / 2);
    checksum.update(input, 0, prefix);
    checksum.updateLongs(longInput, 0, len0);
    checksum.updateLongs(longInput, len0, longInput.length - len0);
    checksum.update(input, tailStart, tail);
    checkChecksumValueAndReset(expected, checksum);

    // Variant 4: bulk call, a run of single longs, then another bulk call.
    checksum.update(input, 0, prefix);
    checksum.updateLongs(longInput, 0, len0);
    final int len1 = random().nextInt(longInput.length / 4);
    final int singleLongsEnd = len0 + len1;
    for (int i = len0; i < singleLongsEnd; i++) {
      checksum.updateLong(longInput[i]);
    }
    checksum.updateLongs(longInput, singleLongsEnd, longInput.length - len1 - len0);
    checksum.update(input, tailStart, tail);
    checkChecksumValueAndReset(expected, checksum);

    // Variant 5: bulk call followed by all remaining data as raw bytes.
    checksum.update(input, 0, prefix);
    checksum.updateLongs(longInput, 0, len0);
    checksum.update(
        input, prefix + len0 * Long.BYTES, input.length - len0 * Long.BYTES - prefix);
    checkChecksumValueAndReset(expected, checksum);
  }
}
/**
 * Asserts that {@code checksum} currently reports {@code expected}, then resets it so the next
 * variant starts from a clean state.
 */
private void checkChecksumValueAndReset(long expected, Checksum checksum) {
  final long actual = checksum.getValue();
  assertEquals(expected, actual);
  checksum.reset();
}
} }