mirror of https://github.com/apache/lucene.git
LUCENE-7440: fix MultiLevelSkipListReader overflow
This commit is contained in:
parent
d59715f14b
commit
cf72eebf75
|
@ -18,6 +18,10 @@ Bug Fixes
|
||||||
trying to highlight a query containing a degenerate case of a MultiPhraseQuery with one
|
trying to highlight a query containing a degenerate case of a MultiPhraseQuery with one
|
||||||
term. (Thomas Kappler via David Smiley)
|
term. (Thomas Kappler via David Smiley)
|
||||||
|
|
||||||
|
* LUCENE-7440: Document id skipping (PostingsEnum.advance) could throw an
|
||||||
|
ArrayIndexOutOfBoundsException exception on large index segments (>1.8B docs)
|
||||||
|
with large skips. (yonik)
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
|
@ -63,7 +63,9 @@ public abstract class MultiLevelSkipListReader implements Closeable {
|
||||||
/** skipInterval of each level. */
|
/** skipInterval of each level. */
|
||||||
private int skipInterval[];
|
private int skipInterval[];
|
||||||
|
|
||||||
/** Number of docs skipped per level. */
|
/** Number of docs skipped per level.
|
||||||
|
* It's possible for some values to overflow a signed int, but this has been accounted for.
|
||||||
|
*/
|
||||||
private int[] numSkipped;
|
private int[] numSkipped;
|
||||||
|
|
||||||
/** Doc id of current skip entry per level. */
|
/** Doc id of current skip entry per level. */
|
||||||
|
@ -151,7 +153,8 @@ public abstract class MultiLevelSkipListReader implements Closeable {
|
||||||
|
|
||||||
numSkipped[level] += skipInterval[level];
|
numSkipped[level] += skipInterval[level];
|
||||||
|
|
||||||
if (numSkipped[level] > docCount) {
|
// numSkipped may overflow a signed int, so compare as unsigned.
|
||||||
|
if (Integer.compareUnsigned(numSkipped[level], docCount) > 0) {
|
||||||
// this skip list is exhausted
|
// this skip list is exhausted
|
||||||
skipDoc[level] = Integer.MAX_VALUE;
|
skipDoc[level] = Integer.MAX_VALUE;
|
||||||
if (numberOfSkipLevels > level) numberOfSkipLevels = level;
|
if (numberOfSkipLevels > level) numberOfSkipLevels = level;
|
||||||
|
|
|
@ -0,0 +1,135 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.index;
|
||||||
|
|
||||||
|
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.StringField;
|
||||||
|
import org.apache.lucene.store.BaseDirectoryWrapper;
|
||||||
|
import org.apache.lucene.store.MockDirectoryWrapper;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase.Monster;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
|
||||||
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
import org.apache.lucene.util.TimeUnits;
|
||||||
|
|
||||||
|
@SuppressCodecs({"SimpleText", "Memory", "Direct"})
|
||||||
|
@TimeoutSuite(millis = 80 * TimeUnits.HOUR) // effectively no limit
|
||||||
|
@Monster("Takes ~30min")
|
||||||
|
@SuppressSysoutChecks(bugUrl = "Stuff gets printed")
|
||||||
|
public class Test2BDocs extends LuceneTestCase {
|
||||||
|
|
||||||
|
// indexes Integer.MAX_VALUE docs with indexed field(s)
|
||||||
|
public void test2BDocs() throws Exception {
|
||||||
|
BaseDirectoryWrapper dir = newFSDirectory(createTempDir("2BDocs"));
|
||||||
|
if (dir instanceof MockDirectoryWrapper) {
|
||||||
|
((MockDirectoryWrapper)dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER);
|
||||||
|
}
|
||||||
|
|
||||||
|
IndexWriter w = new IndexWriter(dir,
|
||||||
|
new IndexWriterConfig(new MockAnalyzer(random()))
|
||||||
|
.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
|
||||||
|
.setRAMBufferSizeMB(256.0)
|
||||||
|
.setMergeScheduler(new ConcurrentMergeScheduler())
|
||||||
|
.setMergePolicy(newLogMergePolicy(false, 10))
|
||||||
|
.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
|
||||||
|
.setCodec(TestUtil.getDefaultCodec()));
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
Field field = new Field("f1", "a", StringField.TYPE_NOT_STORED);
|
||||||
|
doc.add(field);
|
||||||
|
|
||||||
|
for (int i = 0; i < IndexWriter.MAX_DOCS; i++) {
|
||||||
|
w.addDocument(doc);
|
||||||
|
if (i % (10*1000*1000) == 0) {
|
||||||
|
System.out.println("indexed: " + i);
|
||||||
|
System.out.flush();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
w.forceMerge(1);
|
||||||
|
w.close();
|
||||||
|
|
||||||
|
System.out.println("verifying...");
|
||||||
|
System.out.flush();
|
||||||
|
|
||||||
|
DirectoryReader r = DirectoryReader.open(dir);
|
||||||
|
|
||||||
|
BytesRef term = new BytesRef(1);
|
||||||
|
term.bytes[0] = (byte)'a';
|
||||||
|
term.length = 1;
|
||||||
|
|
||||||
|
long skips = 0;
|
||||||
|
|
||||||
|
Random rnd = random();
|
||||||
|
|
||||||
|
long start = System.nanoTime();
|
||||||
|
|
||||||
|
for (LeafReaderContext context : r.leaves()) {
|
||||||
|
LeafReader reader = context.reader();
|
||||||
|
int lim = context.reader().maxDoc();
|
||||||
|
|
||||||
|
Terms terms = reader.fields().terms("f1");
|
||||||
|
for (int i=0; i<10000; i++) {
|
||||||
|
TermsEnum te = terms.iterator();
|
||||||
|
assertTrue( te.seekExact(term) );
|
||||||
|
PostingsEnum docs = te.postings(null);
|
||||||
|
|
||||||
|
// skip randomly through the term
|
||||||
|
for (int target = -1;;)
|
||||||
|
{
|
||||||
|
int maxSkipSize = lim - target + 1;
|
||||||
|
// do a smaller skip half of the time
|
||||||
|
if (rnd.nextBoolean()) {
|
||||||
|
maxSkipSize = Math.min(256, maxSkipSize);
|
||||||
|
}
|
||||||
|
int newTarget = target + rnd.nextInt(maxSkipSize) + 1;
|
||||||
|
if (newTarget >= lim) {
|
||||||
|
if (target+1 >= lim) break; // we already skipped to end, so break.
|
||||||
|
newTarget = lim-1; // skip to end
|
||||||
|
}
|
||||||
|
target = newTarget;
|
||||||
|
|
||||||
|
int res = docs.advance(target);
|
||||||
|
if (res == PostingsEnum.NO_MORE_DOCS) break;
|
||||||
|
|
||||||
|
assertTrue( res >= target );
|
||||||
|
|
||||||
|
skips++;
|
||||||
|
target = res;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
r.close();
|
||||||
|
dir.close();
|
||||||
|
|
||||||
|
long end = System.nanoTime();
|
||||||
|
|
||||||
|
System.out.println("Skip count=" + skips + " seconds=" + TimeUnit.NANOSECONDS.toSeconds(end-start));
|
||||||
|
assert skips > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue