From c51fee9c1a59030bda61b600cca8923410f1e090 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sat, 20 Feb 2021 12:11:32 -0500 Subject: [PATCH] LUCENE-9480: Make DataInput.skipBytes(long) abstract skipBytes() is a "relative" version of seek(), but DataInput previously implemented it via read() calls, because DataInput's API does not include absolute positioning methods (seek, getFilePointer). This resulted in inefficiencies: calls to skipBytes() would cause buffers to be allocated, bytes copied, etc. Instead, make the subclass implement skipBytes() explicitly. The old DataInput implementation is marked deprecated and renamed to skipBytesSlowly(). Some subclasses still implement skipBytes() via skipBytesSlowly(), to be fixed in future improvements. --- lucene/CHANGES.txt | 3 + .../CompressingStoredFieldsReader.java | 5 + .../apache/lucene/index/ByteSliceReader.java | 5 + .../lucene/store/ByteBuffersDataInput.java | 6 ++ .../lucene/store/ChecksumIndexInput.java | 6 +- .../org/apache/lucene/store/DataInput.java | 13 ++- .../org/apache/lucene/store/IndexInput.java | 17 ++++ .../lucene/store/InputStreamDataInput.java | 5 + .../org/apache/lucene/util/PagedBytes.java | 5 + .../apache/lucene/index/TestIndexInput.java | 99 +++++++++++++++++++ .../lucene/store/TestChecksumIndexInput.java | 73 ++++++++++++++ 11 files changed, 235 insertions(+), 2 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/store/TestChecksumIndexInput.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9607b36986a..f5a41ac4f5e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -89,6 +89,9 @@ API Changes * LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor (Patrick Marty via Bruno Roustant) +* LUCENE-9480: Make DataInput's skipBytes(long) abstract as the implementation was not performant. + IndexInput's api is unaffected: skipBytes() is implemented via seek(). (Greg Miller) + Improvements * LUCENE-9687: Hunspell support improvements: add API for spell-checking and suggestions, support compound words, diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java index 850a023db40..f169e527542 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java @@ -651,6 +651,11 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader { bytes.offset += len; bytes.length -= len; } + + @Override + public void skipBytes(long numBytes) throws IOException { + skipBytesSlowly(numBytes); + } }; } else { fieldsStream.seek(startPointer); diff --git a/lucene/core/src/java/org/apache/lucene/index/ByteSliceReader.java b/lucene/core/src/java/org/apache/lucene/index/ByteSliceReader.java index 9a7224be7fb..951d2c66308 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ByteSliceReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ByteSliceReader.java @@ -138,4 +138,9 @@ final class ByteSliceReader extends DataInput { } } } + + @Override + public void skipBytes(long numBytes) throws IOException { + skipBytesSlowly(numBytes); + } } diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java index 5bf6b90f5a0..57cfd5b560f 100644 --- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java @@ -17,6 +17,7 @@ package org.apache.lucene.store; import java.io.EOFException; +import java.io.IOException; import java.nio.BufferUnderflowException; import java.nio.ByteBuffer; import java.nio.ByteOrder; @@ -258,6 +259,11 @@ public final class ByteBuffersDataInput extends DataInput } } + @Override + public void skipBytes(long numBytes) throws IOException { + skipBytesSlowly(numBytes); + } + public ByteBuffersDataInput slice(long offset, long length) { if (offset < 0 || length < 0 || offset + length > this.size) { throw new IllegalArgumentException( diff --git a/lucene/core/src/java/org/apache/lucene/store/ChecksumIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/ChecksumIndexInput.java index 158280d4b01..04a21f07a06 100644 --- a/lucene/core/src/java/org/apache/lucene/store/ChecksumIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/ChecksumIndexInput.java @@ -50,6 +50,10 @@ public abstract class ChecksumIndexInput extends IndexInput { throw new IllegalStateException( getClass() + " cannot seek backwards (pos=" + pos + " getFilePointer()=" + curFP + ")"); } - skipBytes(skip); + // we must skip slowly to ensure skipped bytes are still read and used + // to update checksums + // TODO: this "slow skip" logic should be moved into this class once + // no longer needed as default logic in DataInput + skipBytesSlowly(skip); } } diff --git a/lucene/core/src/java/org/apache/lucene/store/DataInput.java b/lucene/core/src/java/org/apache/lucene/store/DataInput.java index dc3cf25c0eb..11776195728 100644 --- a/lucene/core/src/java/org/apache/lucene/store/DataInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/DataInput.java @@ -348,8 +348,12 @@ public abstract class DataInput implements Cloneable { * Skip over numBytes bytes. The contract on this method is that it should have the * same behavior as reading the same number of bytes into a buffer and discarding its content. * Negative values of numBytes are not supported. + * + * @deprecated Implementing subclasses should override #skipBytes with a more performant solution + * where possible. */ - public void skipBytes(final long numBytes) throws IOException { + @Deprecated + protected void skipBytesSlowly(final long numBytes) throws IOException { if (numBytes < 0) { throw new IllegalArgumentException("numBytes must be >= 0, got " + numBytes); } @@ -363,4 +367,11 @@ public abstract class DataInput implements Cloneable { skipped += step; } } + + /** + * Skip over numBytes bytes. This method may skip bytes in whatever way is most + * optimal, and may not have the same behavior as reading the skipped bytes. In general, negative + * numBytes are not supported. + */ + public abstract void skipBytes(final long numBytes) throws IOException; } diff --git a/lucene/core/src/java/org/apache/lucene/store/IndexInput.java b/lucene/core/src/java/org/apache/lucene/store/IndexInput.java index 649f932dfc8..10f4a375f2c 100644 --- a/lucene/core/src/java/org/apache/lucene/store/IndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/IndexInput.java @@ -72,6 +72,23 @@ public abstract class IndexInput extends DataInput implements Cloneable, Closeab */ public abstract void seek(long pos) throws IOException; + /** + * {@inheritDoc} + * + *

Behavior is functionally equivalent to seeking to getFilePointer() + numBytes. + * + * @see #getFilePointer() + * @see #seek(long) + */ + @Override + public void skipBytes(long numBytes) throws IOException { + if (numBytes < 0) { + throw new IllegalArgumentException("numBytes must be >= 0, got " + numBytes); + } + final long skipTo = getFilePointer() + numBytes; + seek(skipTo); + } + /** The number of bytes in the file. */ public abstract long length(); diff --git a/lucene/core/src/java/org/apache/lucene/store/InputStreamDataInput.java b/lucene/core/src/java/org/apache/lucene/store/InputStreamDataInput.java index 2133c502426..f86185ae79b 100644 --- a/lucene/core/src/java/org/apache/lucene/store/InputStreamDataInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/InputStreamDataInput.java @@ -50,4 +50,9 @@ public class InputStreamDataInput extends DataInput implements Closeable { public void close() throws IOException { is.close(); } + + @Override + public void skipBytes(long numBytes) throws IOException { + skipBytesSlowly(numBytes); + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java b/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java index 4c95324a129..6916ef14834 100644 --- a/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java +++ b/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java @@ -344,6 +344,11 @@ public final class PagedBytes implements Accountable { } } + @Override + public void skipBytes(long numBytes) throws IOException { + skipBytesSlowly(numBytes); + } + private void nextBlock() { currentBlockIndex++; currentBlockUpto = 0; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexInput.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexInput.java index 304de6d64b3..67031a0625d 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexInput.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexInput.java @@ -263,6 +263,41 @@ public class TestIndexInput extends LuceneTestCase { } } + private void checkSeeksAndSkips(IndexInput is, Random random) throws IOException { + long len = is.length(); + + int iterations = LuceneTestCase.TEST_NIGHTLY ? 1_000 : 10; + for (int i = 0; i < iterations; i++) { + is.seek(0); // make sure we're at the start + + for (long curr = 0; curr < len; ) { + long maxSkipTo = len - 1; + // if we're close to the end, just skip all the way + long skipTo = (len - curr < 10) ? maxSkipTo : TestUtil.nextLong(random, curr, maxSkipTo); + long skipDelta = skipTo - curr; + + // first reposition using seek + byte startByte1 = is.readByte(); + is.seek(skipTo); + byte endByte1 = is.readByte(); + + // do the same thing but with skipBytes + is.seek(curr); + byte startByte2 = is.readByte(); + is.seek(curr); + is.skipBytes(skipDelta); + byte endByte2 = is.readByte(); + + assertEquals(startByte1, startByte2); + assertEquals(endByte1, endByte2); + // +1 since we read the byte we seek/skip to + assertEquals(curr + skipDelta + 1, is.getFilePointer()); + + curr = is.getFilePointer(); + } + } + } + // this test checks the IndexInput methods of any impl public void testRawIndexInputRead() throws IOException { for (int i = 0; i < 10; i++) { @@ -273,6 +308,7 @@ public class TestIndexInput extends LuceneTestCase { os.close(); IndexInput is = dir.openInput("foo", newIOContext(random)); checkReads(is, IOException.class); + checkSeeksAndSkips(is, random); is.close(); os = dir.createOutput("bar", newIOContext(random)); @@ -280,6 +316,7 @@ public class TestIndexInput extends LuceneTestCase { os.close(); is = dir.openInput("bar", newIOContext(random)); checkRandomReads(is); + checkSeeksAndSkips(is, random); is.close(); dir.close(); } @@ -291,4 +328,66 @@ public class TestIndexInput extends LuceneTestCase { is = new ByteArrayDataInput(RANDOM_TEST_BYTES); checkRandomReads(is); } + + public void testNoReadOnSkipBytes() throws IOException { + long len = LuceneTestCase.TEST_NIGHTLY ? Long.MAX_VALUE : 1_000_000; + long maxSeekPos = len - 1; + InterceptingIndexInput is = new InterceptingIndexInput("foo", len); + + while (is.pos < maxSeekPos) { + long seekPos = TestUtil.nextLong(random(), is.pos, maxSeekPos); + long skipDelta = seekPos - is.pos; + is.skipBytes(skipDelta); + assertEquals(seekPos, is.pos); + } + } + + /** + * Mock IndexInput that just tracks a position (which responds to seek/skip) and throws if + * #readByte or #readBytes are called, ensuring seek/skip doesn't invoke reads. + */ + private static final class InterceptingIndexInput extends IndexInput { + long pos = 0; + final long len; + + protected InterceptingIndexInput(String resourceDescription, long len) { + super(resourceDescription); + this.len = len; + } + + @Override + public void seek(long pos) { + this.pos = pos; + } + + @Override + public long getFilePointer() { + return pos; + } + + @Override + public long length() { + return len; + } + + @Override + public byte readByte() { + throw new UnsupportedOperationException(); + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + throw new UnsupportedOperationException(); + } + + @Override + public void close() { + // no-op + } + + @Override + public IndexInput slice(String sliceDescription, long offset, long length) { + throw new UnsupportedOperationException(); + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/store/TestChecksumIndexInput.java b/lucene/core/src/test/org/apache/lucene/store/TestChecksumIndexInput.java new file mode 100644 index 00000000000..05abce0f9a6 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/store/TestChecksumIndexInput.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.store; + +import java.io.IOException; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +public class TestChecksumIndexInput extends LuceneTestCase { + + public void testSkipBytes() throws IOException { + int numTestBytes = TestUtil.nextInt(random(), 100, 1000); + byte[] testBytes = new byte[numTestBytes]; + final Directory dir = newDirectory(); + IndexOutput os = dir.createOutput("foo", newIOContext(random())); + os.writeBytes(testBytes, numTestBytes); + os.close(); + + IndexInput is = dir.openInput("foo", newIOContext(random())); + final InterceptingChecksumIndexInput checksumIndexInput = + new InterceptingChecksumIndexInput(is, numTestBytes); + + // skip random chunks of bytes until everything has been skipped + for (int skipped = 0; skipped < numTestBytes; ) { + final int remaining = numTestBytes - skipped; + // when remaining gets small enough, just skip the rest + final int step = remaining < 10 ? remaining : random().nextInt(remaining); + checksumIndexInput.skipBytes(step); + skipped += step; + } + + // ensure all skipped bytes are still "read" in so the checksum can be updated properly + assertArrayEquals(testBytes, checksumIndexInput.readBytes); + + is.close(); + dir.close(); + } + + /** + * Captures read bytes into a separate buffer for confirming that all #skipByte invocations + * delegate to #readBytes. + */ + private static final class InterceptingChecksumIndexInput extends BufferedChecksumIndexInput { + final byte[] readBytes; + private int off = 0; + + public InterceptingChecksumIndexInput(IndexInput main, int len) { + super(main); + readBytes = new byte[len]; + } + + @Override + public void readBytes(final byte[] b, final int offset, final int len) throws IOException { + super.readBytes(b, offset, len); + System.arraycopy(b, offset, readBytes, off, len); + off += len; + } + } +}