mirror of https://github.com/apache/lucene.git
Add IndexInput#prefetch. (#13337)
This adds `IndexInput#prefetch`, which is an optional operation that instructs the `IndexInput` to start fetching bytes from storage in the background. These bytes will be picked up by follow-up calls to the `IndexInput#readXXX` methods. In the future, this will help Lucene move from a maximum of one I/O operation per search thread to one I/O operation per search thread per `IndexInput`. Typically, when running a query on two terms, the I/O into the terms dictionary is sequential today. In the future, we would ideally do these I/Os in parallel using this new API. Note that this will require API changes to some classes including `TermsEnum`. I settled on this API because it's simple and wouldn't require making all Lucene APIs asynchronous to take advantage of extra I/O concurrency, which I worry would make the query evaluation logic too complicated. This change will require follow-ups to start using this new API when working with terms dictionaries, postings, etc. Relates #13179 Co-authored-by: Uwe Schindler <uschindler@apache.org>
This commit is contained in:
parent
5ac88c750e
commit
3003731aa9
|
@ -102,6 +102,10 @@ API Changes
|
|||
Additionally, deprecated methods have been removed from ByteBuffersIndexInput, BooleanQuery and others. Please refer
|
||||
to MIGRATE.md for further details. (Sanjay Dutt)
|
||||
|
||||
* GITHUB#13337: Introduce new `IndexInput#prefetch(long)` API to give a hint to
|
||||
the directory about bytes that are about to be read. (Adrien Grand, Uwe
|
||||
Schindler)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -191,4 +191,15 @@ public abstract class IndexInput extends DataInput implements Closeable {
|
|||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Optional method: Give a hint to this input that some bytes will be read in the near future.
|
||||
* IndexInput implementations may take advantage of this hint to start fetching pages of data
|
||||
* immediately from storage.
|
||||
*
|
||||
* <p>The default implementation is a no-op.
|
||||
*
|
||||
* @param length the number of bytes to prefetch
|
||||
*/
|
||||
public void prefetch(long length) throws IOException {}
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.lang.foreign.ValueLayout;
|
|||
import java.nio.ByteOrder;
|
||||
import java.util.Arrays;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.GroupVIntUtil;
|
||||
|
||||
|
@ -44,6 +45,7 @@ abstract class MemorySegmentIndexInput extends IndexInput implements RandomAcces
|
|||
ValueLayout.JAVA_LONG_UNALIGNED.withOrder(ByteOrder.LITTLE_ENDIAN);
|
||||
static final ValueLayout.OfFloat LAYOUT_LE_FLOAT =
|
||||
ValueLayout.JAVA_FLOAT_UNALIGNED.withOrder(ByteOrder.LITTLE_ENDIAN);
|
||||
private static final Optional<NativeAccess> NATIVE_ACCESS = NativeAccess.getImplementation();
|
||||
|
||||
final long length;
|
||||
final long chunkSizeMask;
|
||||
|
@ -310,6 +312,49 @@ abstract class MemorySegmentIndexInput extends IndexInput implements RandomAcces
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void prefetch(long length) throws IOException {
|
||||
ensureOpen();
|
||||
|
||||
Objects.checkFromIndexSize(getFilePointer(), length, length());
|
||||
|
||||
if (NATIVE_ACCESS.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
final NativeAccess nativeAccess = NATIVE_ACCESS.get();
|
||||
|
||||
// If at the boundary between two chunks, move to the next one.
|
||||
seek(getFilePointer());
|
||||
try {
|
||||
// Compute the intersection of the current segment and the region that should be prefetched.
|
||||
long offset = curPosition;
|
||||
if (offset + length > curSegment.byteSize()) {
|
||||
// Only prefetch bytes that are stored in the current segment. There may be bytes on the
|
||||
// next segment but this case is rare enough that we don't try to optimize it and keep
|
||||
// things simple instead.
|
||||
length = curSegment.byteSize() - curPosition;
|
||||
}
|
||||
// Now align offset with the page size, this is required for madvise.
|
||||
// Compute the offset of the current position in the OS's page.
|
||||
final long offsetInPage = (curSegment.address() + offset) % nativeAccess.getPageSize();
|
||||
offset -= offsetInPage;
|
||||
length += offsetInPage;
|
||||
if (offset < 0) {
|
||||
// The start of the page is outside of this segment, ignore.
|
||||
return;
|
||||
}
|
||||
|
||||
final MemorySegment prefetchSlice = curSegment.asSlice(offset, length);
|
||||
nativeAccess.madviseWillNeed(prefetchSlice);
|
||||
} catch (
|
||||
@SuppressWarnings("unused")
|
||||
IndexOutOfBoundsException e) {
|
||||
throw new EOFException("Read past EOF: " + this);
|
||||
} catch (NullPointerException | IllegalStateException e) {
|
||||
throw alreadyClosed(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte readByte(long pos) throws IOException {
|
||||
try {
|
||||
|
|
|
@ -111,10 +111,11 @@ final class MemorySegmentIndexInputProvider implements MMapDirectory.MMapIndexIn
|
|||
throw convertMapFailedIOException(ioe, resourceDescription, segSize);
|
||||
}
|
||||
// if preload apply it without madvise.
|
||||
// if chunk size is too small (2 MiB), disable madvise support (incorrect alignment)
|
||||
// skip madvise if the address of our segment is not page-aligned (small segments due to
|
||||
// internal FileChannel logic)
|
||||
if (preload) {
|
||||
segment.load();
|
||||
} else if (nativeAccess.isPresent() && chunkSizePower >= 21) {
|
||||
} else if (nativeAccess.filter(na -> segment.address() % na.getPageSize() == 0).isPresent()) {
|
||||
nativeAccess.get().madvise(segment, readAdvice);
|
||||
}
|
||||
segments[segNr] = segment;
|
||||
|
|
|
@ -27,6 +27,14 @@ abstract class NativeAccess {
|
|||
/** Invoke the {@code madvise} call for the given {@link MemorySegment}. */
|
||||
public abstract void madvise(MemorySegment segment, ReadAdvice readAdvice) throws IOException;
|
||||
|
||||
/**
|
||||
* Invoke the {@code madvise} call for the given {@link MemorySegment} with {@code MADV_WILLNEED}.
|
||||
*/
|
||||
public abstract void madviseWillNeed(MemorySegment segment) throws IOException;
|
||||
|
||||
/** Returns native page size. */
|
||||
public abstract int getPageSize();
|
||||
|
||||
/**
|
||||
* Return the NativeAccess instance for this platform. At moment we only support Linux and MacOS
|
||||
*/
|
||||
|
|
|
@ -50,6 +50,7 @@ final class PosixNativeAccess extends NativeAccess {
|
|||
public static final int POSIX_MADV_DONTNEED = 4;
|
||||
|
||||
private static final MethodHandle MH$posix_madvise;
|
||||
private static final int PAGE_SIZE;
|
||||
|
||||
private static final Optional<NativeAccess> INSTANCE;
|
||||
|
||||
|
@ -60,10 +61,14 @@ final class PosixNativeAccess extends NativeAccess {
|
|||
}
|
||||
|
||||
static {
|
||||
final Linker linker = Linker.nativeLinker();
|
||||
final SymbolLookup stdlib = linker.defaultLookup();
|
||||
MethodHandle adviseHandle = null;
|
||||
int pagesize = -1;
|
||||
PosixNativeAccess instance = null;
|
||||
try {
|
||||
adviseHandle = lookupMadvise();
|
||||
adviseHandle = lookupMadvise(linker, stdlib);
|
||||
pagesize = (int) lookupGetPageSize(linker, stdlib).invokeExact();
|
||||
instance = new PosixNativeAccess();
|
||||
} catch (UnsupportedOperationException uoe) {
|
||||
LOG.warning(uoe.getMessage());
|
||||
|
@ -77,14 +82,17 @@ final class PosixNativeAccess extends NativeAccess {
|
|||
+ "pass the following on command line: --enable-native-access=%s",
|
||||
Optional.ofNullable(PosixNativeAccess.class.getModule().getName())
|
||||
.orElse("ALL-UNNAMED")));
|
||||
} catch (RuntimeException | Error e) {
|
||||
throw e;
|
||||
} catch (Throwable e) {
|
||||
throw new AssertionError(e);
|
||||
}
|
||||
MH$posix_madvise = adviseHandle;
|
||||
PAGE_SIZE = pagesize;
|
||||
INSTANCE = Optional.ofNullable(instance);
|
||||
}
|
||||
|
||||
private static MethodHandle lookupMadvise() {
|
||||
final Linker linker = Linker.nativeLinker();
|
||||
final SymbolLookup stdlib = linker.defaultLookup();
|
||||
private static MethodHandle lookupMadvise(Linker linker, SymbolLookup stdlib) {
|
||||
return findFunction(
|
||||
linker,
|
||||
stdlib,
|
||||
|
@ -96,6 +104,10 @@ final class PosixNativeAccess extends NativeAccess {
|
|||
ValueLayout.JAVA_INT));
|
||||
}
|
||||
|
||||
private static MethodHandle lookupGetPageSize(Linker linker, SymbolLookup stdlib) {
|
||||
return findFunction(linker, stdlib, "getpagesize", FunctionDescriptor.of(ValueLayout.JAVA_INT));
|
||||
}
|
||||
|
||||
private static MethodHandle findFunction(
|
||||
Linker linker, SymbolLookup lookup, String name, FunctionDescriptor desc) {
|
||||
final MemorySegment symbol =
|
||||
|
@ -110,17 +122,26 @@ final class PosixNativeAccess extends NativeAccess {
|
|||
|
||||
@Override
|
||||
public void madvise(MemorySegment segment, ReadAdvice readAdvice) throws IOException {
|
||||
// Note: madvise is bypassed if the segment should be preloaded via MemorySegment#load.
|
||||
if (segment.byteSize() == 0L) {
|
||||
return; // empty segments should be excluded, because they may have no address at all
|
||||
}
|
||||
final Integer advice = mapReadAdvice(readAdvice);
|
||||
if (advice == null) {
|
||||
return; // do nothing
|
||||
}
|
||||
madvise(segment, advice);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void madviseWillNeed(MemorySegment segment) throws IOException {
|
||||
madvise(segment, POSIX_MADV_WILLNEED);
|
||||
}
|
||||
|
||||
private void madvise(MemorySegment segment, int advice) throws IOException {
|
||||
// Note: madvise is bypassed if the segment should be preloaded via MemorySegment#load.
|
||||
if (segment.byteSize() == 0L) {
|
||||
return; // empty segments should be excluded, because they may have no address at all
|
||||
}
|
||||
final int ret;
|
||||
try {
|
||||
ret = (int) MH$posix_madvise.invokeExact(segment, segment.byteSize(), advice.intValue());
|
||||
ret = (int) MH$posix_madvise.invokeExact(segment, segment.byteSize(), advice);
|
||||
} catch (Throwable th) {
|
||||
throw new AssertionError(th);
|
||||
}
|
||||
|
@ -143,4 +164,9 @@ final class PosixNativeAccess extends NativeAccess {
|
|||
case RANDOM_PRELOAD -> null;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getPageSize() {
|
||||
return PAGE_SIZE;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,6 +58,7 @@ import org.apache.lucene.tests.mockfile.ExtrasFS;
|
|||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.tests.util.TestUtil;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BitUtil;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
import org.junit.Assert;
|
||||
|
@ -1512,4 +1513,65 @@ public abstract class BaseDirectoryTestCase extends LuceneTestCase {
|
|||
dir.deleteFile("group-varint");
|
||||
dir.deleteFile("vint");
|
||||
}
|
||||
|
||||
public void testPrefetch() throws IOException {
|
||||
doTestPrefetch(0);
|
||||
}
|
||||
|
||||
public void testPrefetchOnSlice() throws IOException {
|
||||
doTestPrefetch(TestUtil.nextInt(random(), 1, 1024));
|
||||
}
|
||||
|
||||
private void doTestPrefetch(int startOffset) throws IOException {
|
||||
try (Directory dir = getDirectory(createTempDir())) {
|
||||
final int totalLength = startOffset + TestUtil.nextInt(random(), 16384, 65536);
|
||||
byte[] arr = new byte[totalLength];
|
||||
random().nextBytes(arr);
|
||||
try (IndexOutput out = dir.createOutput("temp.bin", IOContext.DEFAULT)) {
|
||||
out.writeBytes(arr, arr.length);
|
||||
}
|
||||
byte[] temp = new byte[2048];
|
||||
|
||||
try (IndexInput orig = dir.openInput("temp.bin", IOContext.DEFAULT)) {
|
||||
IndexInput in;
|
||||
if (startOffset == 0) {
|
||||
in = orig.clone();
|
||||
} else {
|
||||
in = orig.slice("slice", startOffset, totalLength - startOffset);
|
||||
}
|
||||
for (int i = 0; i < 10_000; ++i) {
|
||||
final int startPointer = (int) in.getFilePointer();
|
||||
assertTrue(startPointer < in.length());
|
||||
if (random().nextBoolean()) {
|
||||
final long prefetchLength = TestUtil.nextLong(random(), 1, in.length() - startPointer);
|
||||
in.prefetch(prefetchLength);
|
||||
}
|
||||
assertEquals(startPointer, in.getFilePointer());
|
||||
switch (random().nextInt(100)) {
|
||||
case 0:
|
||||
assertEquals(arr[startOffset + startPointer], in.readByte());
|
||||
break;
|
||||
case 1:
|
||||
if (in.length() - startPointer >= Long.BYTES) {
|
||||
assertEquals(
|
||||
(long) BitUtil.VH_LE_LONG.get(arr, startOffset + startPointer), in.readLong());
|
||||
}
|
||||
break;
|
||||
default:
|
||||
final int readLength =
|
||||
TestUtil.nextInt(
|
||||
random(), 1, (int) Math.min(temp.length, in.length() - startPointer));
|
||||
in.readBytes(temp, 0, readLength);
|
||||
assertArrayEquals(
|
||||
ArrayUtil.copyOfSubArray(
|
||||
arr, startOffset + startPointer, startOffset + startPointer + readLength),
|
||||
ArrayUtil.copyOfSubArray(temp, 0, readLength));
|
||||
}
|
||||
if (in.getFilePointer() == in.length() || random().nextBoolean()) {
|
||||
in.seek(TestUtil.nextInt(random(), 0, (int) in.length() - 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -130,6 +130,12 @@ public class MockIndexInputWrapper extends FilterIndexInput {
|
|||
in.seek(pos);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void prefetch(long length) throws IOException {
|
||||
ensureOpen();
|
||||
in.prefetch(length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long length() {
|
||||
ensureOpen();
|
||||
|
|
Loading…
Reference in New Issue