mirror of https://github.com/apache/lucene.git
LUCENE-9480: Make DataInput.skipBytes(long) abstract
skipBytes() is a "relative" version of seek(), but DataInput previously implemented it via read() calls, because DataInput's API does not include absolute positioning methods (seek, getFilePointer). This resulted in inefficiencies: calls to skipBytes() would cause buffers to be allocated, bytes copied, etc. Instead, make the subclass implement skipBytes() explicitly. The old DataInput implementation is marked deprecated and renamed to skipBytesSlowly(). Some subclasses still implement skipBytes() via skipBytesSlowly(), to be fixed in future improvements.
This commit is contained in:
parent
2f0d191452
commit
c51fee9c1a
|
@ -89,6 +89,9 @@ API Changes
|
|||
|
||||
* LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor (Patrick Marty via Bruno Roustant)
|
||||
|
||||
* LUCENE-9480: Make DataInput's skipBytes(long) abstract as the implementation was not performant.
|
||||
IndexInput's api is unaffected: skipBytes() is implemented via seek(). (Greg Miller)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-9687: Hunspell support improvements: add API for spell-checking and suggestions, support compound words,
|
||||
|
|
|
@ -651,6 +651,11 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
|
|||
bytes.offset += len;
|
||||
bytes.length -= len;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void skipBytes(long numBytes) throws IOException {
|
||||
skipBytesSlowly(numBytes);
|
||||
}
|
||||
};
|
||||
} else {
|
||||
fieldsStream.seek(startPointer);
|
||||
|
|
|
@ -138,4 +138,9 @@ final class ByteSliceReader extends DataInput {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void skipBytes(long numBytes) throws IOException {
|
||||
skipBytesSlowly(numBytes);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
package org.apache.lucene.store;
|
||||
|
||||
import java.io.EOFException;
|
||||
import java.io.IOException;
|
||||
import java.nio.BufferUnderflowException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
|
@ -258,6 +259,11 @@ public final class ByteBuffersDataInput extends DataInput
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void skipBytes(long numBytes) throws IOException {
|
||||
skipBytesSlowly(numBytes);
|
||||
}
|
||||
|
||||
public ByteBuffersDataInput slice(long offset, long length) {
|
||||
if (offset < 0 || length < 0 || offset + length > this.size) {
|
||||
throw new IllegalArgumentException(
|
||||
|
|
|
@ -50,6 +50,10 @@ public abstract class ChecksumIndexInput extends IndexInput {
|
|||
throw new IllegalStateException(
|
||||
getClass() + " cannot seek backwards (pos=" + pos + " getFilePointer()=" + curFP + ")");
|
||||
}
|
||||
skipBytes(skip);
|
||||
// we must skip slowly to ensure skipped bytes are still read and used
|
||||
// to update checksums
|
||||
// TODO: this "slow skip" logic should be moved into this class once
|
||||
// no longer needed as default logic in DataInput
|
||||
skipBytesSlowly(skip);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -348,8 +348,12 @@ public abstract class DataInput implements Cloneable {
|
|||
* Skip over <code>numBytes</code> bytes. The contract on this method is that it should have the
|
||||
* same behavior as reading the same number of bytes into a buffer and discarding its content.
|
||||
* Negative values of <code>numBytes</code> are not supported.
|
||||
*
|
||||
* @deprecated Implementing subclasses should override #skipBytes with a more performant solution
|
||||
* where possible.
|
||||
*/
|
||||
public void skipBytes(final long numBytes) throws IOException {
|
||||
@Deprecated
|
||||
protected void skipBytesSlowly(final long numBytes) throws IOException {
|
||||
if (numBytes < 0) {
|
||||
throw new IllegalArgumentException("numBytes must be >= 0, got " + numBytes);
|
||||
}
|
||||
|
@ -363,4 +367,11 @@ public abstract class DataInput implements Cloneable {
|
|||
skipped += step;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Skip over <code>numBytes</code> bytes. This method may skip bytes in whatever way is most
|
||||
* optimal, and may not have the same behavior as reading the skipped bytes. In general, negative
|
||||
* <code>numBytes</code> are not supported.
|
||||
*/
|
||||
public abstract void skipBytes(final long numBytes) throws IOException;
|
||||
}
|
||||
|
|
|
@ -72,6 +72,23 @@ public abstract class IndexInput extends DataInput implements Cloneable, Closeab
|
|||
*/
|
||||
public abstract void seek(long pos) throws IOException;
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* <p>Behavior is functionally equivalent to seeking to <code>getFilePointer() + numBytes</code>.
|
||||
*
|
||||
* @see #getFilePointer()
|
||||
* @see #seek(long)
|
||||
*/
|
||||
@Override
|
||||
public void skipBytes(long numBytes) throws IOException {
|
||||
if (numBytes < 0) {
|
||||
throw new IllegalArgumentException("numBytes must be >= 0, got " + numBytes);
|
||||
}
|
||||
final long skipTo = getFilePointer() + numBytes;
|
||||
seek(skipTo);
|
||||
}
|
||||
|
||||
/** The number of bytes in the file. */
|
||||
public abstract long length();
|
||||
|
||||
|
|
|
@ -50,4 +50,9 @@ public class InputStreamDataInput extends DataInput implements Closeable {
|
|||
public void close() throws IOException {
|
||||
is.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void skipBytes(long numBytes) throws IOException {
|
||||
skipBytesSlowly(numBytes);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -344,6 +344,11 @@ public final class PagedBytes implements Accountable {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void skipBytes(long numBytes) throws IOException {
|
||||
skipBytesSlowly(numBytes);
|
||||
}
|
||||
|
||||
private void nextBlock() {
|
||||
currentBlockIndex++;
|
||||
currentBlockUpto = 0;
|
||||
|
|
|
@ -263,6 +263,41 @@ public class TestIndexInput extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
private void checkSeeksAndSkips(IndexInput is, Random random) throws IOException {
|
||||
long len = is.length();
|
||||
|
||||
int iterations = LuceneTestCase.TEST_NIGHTLY ? 1_000 : 10;
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
is.seek(0); // make sure we're at the start
|
||||
|
||||
for (long curr = 0; curr < len; ) {
|
||||
long maxSkipTo = len - 1;
|
||||
// if we're close to the end, just skip all the way
|
||||
long skipTo = (len - curr < 10) ? maxSkipTo : TestUtil.nextLong(random, curr, maxSkipTo);
|
||||
long skipDelta = skipTo - curr;
|
||||
|
||||
// first reposition using seek
|
||||
byte startByte1 = is.readByte();
|
||||
is.seek(skipTo);
|
||||
byte endByte1 = is.readByte();
|
||||
|
||||
// do the same thing but with skipBytes
|
||||
is.seek(curr);
|
||||
byte startByte2 = is.readByte();
|
||||
is.seek(curr);
|
||||
is.skipBytes(skipDelta);
|
||||
byte endByte2 = is.readByte();
|
||||
|
||||
assertEquals(startByte1, startByte2);
|
||||
assertEquals(endByte1, endByte2);
|
||||
// +1 since we read the byte we seek/skip to
|
||||
assertEquals(curr + skipDelta + 1, is.getFilePointer());
|
||||
|
||||
curr = is.getFilePointer();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// this test checks the IndexInput methods of any impl
|
||||
public void testRawIndexInputRead() throws IOException {
|
||||
for (int i = 0; i < 10; i++) {
|
||||
|
@ -273,6 +308,7 @@ public class TestIndexInput extends LuceneTestCase {
|
|||
os.close();
|
||||
IndexInput is = dir.openInput("foo", newIOContext(random));
|
||||
checkReads(is, IOException.class);
|
||||
checkSeeksAndSkips(is, random);
|
||||
is.close();
|
||||
|
||||
os = dir.createOutput("bar", newIOContext(random));
|
||||
|
@ -280,6 +316,7 @@ public class TestIndexInput extends LuceneTestCase {
|
|||
os.close();
|
||||
is = dir.openInput("bar", newIOContext(random));
|
||||
checkRandomReads(is);
|
||||
checkSeeksAndSkips(is, random);
|
||||
is.close();
|
||||
dir.close();
|
||||
}
|
||||
|
@ -291,4 +328,66 @@ public class TestIndexInput extends LuceneTestCase {
|
|||
is = new ByteArrayDataInput(RANDOM_TEST_BYTES);
|
||||
checkRandomReads(is);
|
||||
}
|
||||
|
||||
public void testNoReadOnSkipBytes() throws IOException {
|
||||
long len = LuceneTestCase.TEST_NIGHTLY ? Long.MAX_VALUE : 1_000_000;
|
||||
long maxSeekPos = len - 1;
|
||||
InterceptingIndexInput is = new InterceptingIndexInput("foo", len);
|
||||
|
||||
while (is.pos < maxSeekPos) {
|
||||
long seekPos = TestUtil.nextLong(random(), is.pos, maxSeekPos);
|
||||
long skipDelta = seekPos - is.pos;
|
||||
is.skipBytes(skipDelta);
|
||||
assertEquals(seekPos, is.pos);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Mock IndexInput that just tracks a position (which responds to seek/skip) and throws if
|
||||
* #readByte or #readBytes are called, ensuring seek/skip doesn't invoke reads.
|
||||
*/
|
||||
private static final class InterceptingIndexInput extends IndexInput {
|
||||
long pos = 0;
|
||||
final long len;
|
||||
|
||||
protected InterceptingIndexInput(String resourceDescription, long len) {
|
||||
super(resourceDescription);
|
||||
this.len = len;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void seek(long pos) {
|
||||
this.pos = pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getFilePointer() {
|
||||
return pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long length() {
|
||||
return len;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte readByte() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readBytes(byte[] b, int offset, int len) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
// no-op
|
||||
}
|
||||
|
||||
@Override
|
||||
public IndexInput slice(String sliceDescription, long offset, long length) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.store;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
public class TestChecksumIndexInput extends LuceneTestCase {
|
||||
|
||||
public void testSkipBytes() throws IOException {
|
||||
int numTestBytes = TestUtil.nextInt(random(), 100, 1000);
|
||||
byte[] testBytes = new byte[numTestBytes];
|
||||
final Directory dir = newDirectory();
|
||||
IndexOutput os = dir.createOutput("foo", newIOContext(random()));
|
||||
os.writeBytes(testBytes, numTestBytes);
|
||||
os.close();
|
||||
|
||||
IndexInput is = dir.openInput("foo", newIOContext(random()));
|
||||
final InterceptingChecksumIndexInput checksumIndexInput =
|
||||
new InterceptingChecksumIndexInput(is, numTestBytes);
|
||||
|
||||
// skip random chunks of bytes until everything has been skipped
|
||||
for (int skipped = 0; skipped < numTestBytes; ) {
|
||||
final int remaining = numTestBytes - skipped;
|
||||
// when remaining gets small enough, just skip the rest
|
||||
final int step = remaining < 10 ? remaining : random().nextInt(remaining);
|
||||
checksumIndexInput.skipBytes(step);
|
||||
skipped += step;
|
||||
}
|
||||
|
||||
// ensure all skipped bytes are still "read" in so the checksum can be updated properly
|
||||
assertArrayEquals(testBytes, checksumIndexInput.readBytes);
|
||||
|
||||
is.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Captures read bytes into a separate buffer for confirming that all #skipByte invocations
|
||||
* delegate to #readBytes.
|
||||
*/
|
||||
private static final class InterceptingChecksumIndexInput extends BufferedChecksumIndexInput {
|
||||
final byte[] readBytes;
|
||||
private int off = 0;
|
||||
|
||||
public InterceptingChecksumIndexInput(IndexInput main, int len) {
|
||||
super(main);
|
||||
readBytes = new byte[len];
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readBytes(final byte[] b, final int offset, final int len) throws IOException {
|
||||
super.readBytes(b, offset, len);
|
||||
System.arraycopy(b, offset, readBytes, off, len);
|
||||
off += len;
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue