diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b123ebfc93b..4f5326ba318 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -49,6 +49,10 @@ Bug Fixes trying to highlight a query containing a degenerate case of a MultiPhraseQuery with one term. (Thomas Kappler via David Smiley) +* LUCENE-7440: Document id skipping (PostingsEnum.advance) could throw an + ArrayIndexOutOfBoundsException exception on large index segments (>1.8B docs) + with large skips. (yonik) + Improvements Optimizations diff --git a/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java b/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java index 72ffe9f1a9d..c937886c0ae 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java @@ -63,7 +63,9 @@ public abstract class MultiLevelSkipListReader implements Closeable { /** skipInterval of each level. */ private int skipInterval[]; - /** Number of docs skipped per level. */ + /** Number of docs skipped per level. + * It's possible for some values to overflow a signed int, but this has been accounted for. + */ private int[] numSkipped; /** Doc id of current skip entry per level. */ @@ -150,8 +152,9 @@ public abstract class MultiLevelSkipListReader implements Closeable { setLastSkipData(level); numSkipped[level] += skipInterval[level]; - - if (numSkipped[level] > docCount) { + + // numSkipped may overflow a signed int, so compare as unsigned. + if (Integer.compareUnsigned(numSkipped[level], docCount) > 0) { // this skip list is exhausted skipDoc[level] = Integer.MAX_VALUE; if (numberOfSkipLevels > level) numberOfSkipLevels = level; diff --git a/lucene/core/src/test/org/apache/lucene/index/Test2BDocs.java b/lucene/core/src/test/org/apache/lucene/index/Test2BDocs.java new file mode 100644 index 00000000000..4fab45a1a47 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/index/Test2BDocs.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + + +import java.util.Random; +import java.util.concurrent.TimeUnit; + +import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.store.BaseDirectoryWrapper; +import org.apache.lucene.store.MockDirectoryWrapper; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.LuceneTestCase.Monster; +import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; +import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks; +import org.apache.lucene.util.TestUtil; +import org.apache.lucene.util.TimeUnits; + +@SuppressCodecs({"SimpleText", "Memory", "Direct"}) +@TimeoutSuite(millis = 80 * TimeUnits.HOUR) // effectively no limit +@Monster("Takes ~30min") +@SuppressSysoutChecks(bugUrl = "Stuff gets printed") +public class Test2BDocs extends LuceneTestCase { + + // indexes Integer.MAX_VALUE docs with indexed field(s) + public void test2BDocs() throws Exception { + BaseDirectoryWrapper dir = newFSDirectory(createTempDir("2BDocs")); + if (dir instanceof MockDirectoryWrapper) { + ((MockDirectoryWrapper)dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER); + } + + IndexWriter w = new IndexWriter(dir, + new IndexWriterConfig(new MockAnalyzer(random())) + .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) + .setRAMBufferSizeMB(256.0) + .setMergeScheduler(new ConcurrentMergeScheduler()) + .setMergePolicy(newLogMergePolicy(false, 10)) + .setOpenMode(IndexWriterConfig.OpenMode.CREATE) + .setCodec(TestUtil.getDefaultCodec())); + + Document doc = new Document(); + Field field = new Field("f1", "a", StringField.TYPE_NOT_STORED); + doc.add(field); + + for (int i = 0; i < IndexWriter.MAX_DOCS; i++) { + w.addDocument(doc); + if (i % (10*1000*1000) == 0) { + System.out.println("indexed: " + i); + System.out.flush(); + } + } + + w.forceMerge(1); + w.close(); + + System.out.println("verifying..."); + System.out.flush(); + + DirectoryReader r = DirectoryReader.open(dir); + + BytesRef term = new BytesRef(1); + term.bytes[0] = (byte)'a'; + term.length = 1; + + long skips = 0; + + Random rnd = random(); + + long start = System.nanoTime(); + + for (LeafReaderContext context : r.leaves()) { + LeafReader reader = context.reader(); + int lim = context.reader().maxDoc(); + + Terms terms = reader.fields().terms("f1"); + for (int i=0; i<10000; i++) { + TermsEnum te = terms.iterator(); + assertTrue( te.seekExact(term) ); + PostingsEnum docs = te.postings(null); + + // skip randomly through the term + for (int target = -1;;) + { + int maxSkipSize = lim - target + 1; + // do a smaller skip half of the time + if (rnd.nextBoolean()) { + maxSkipSize = Math.min(256, maxSkipSize); + } + int newTarget = target + rnd.nextInt(maxSkipSize) + 1; + if (newTarget >= lim) { + if (target+1 >= lim) break; // we already skipped to end, so break. + newTarget = lim-1; // skip to end + } + target = newTarget; + + int res = docs.advance(target); + if (res == PostingsEnum.NO_MORE_DOCS) break; + + assertTrue( res >= target ); + + skips++; + target = res; + } + } + } + + r.close(); + dir.close(); + + long end = System.nanoTime(); + + System.out.println("Skip count=" + skips + " seconds=" + TimeUnit.NANOSECONDS.toSeconds(end-start)); + assert skips > 0; + } + +}