mirror of https://github.com/apache/lucene.git
LUCENE-7440: fix MultiLevelSkipListReader overflow
This commit is contained in:
parent
d59715f14b
commit
cf72eebf75
|
@ -18,6 +18,10 @@ Bug Fixes
|
|||
trying to highlight a query containing a degenerate case of a MultiPhraseQuery with one
|
||||
term. (Thomas Kappler via David Smiley)
|
||||
|
||||
* LUCENE-7440: Document id skipping (PostingsEnum.advance) could throw an
|
||||
ArrayIndexOutOfBoundsException on large index segments (>1.8B docs)
|
||||
with large skips. (yonik)
|
||||
|
||||
Improvements
|
||||
|
||||
Optimizations
|
||||
|
|
|
@ -63,7 +63,9 @@ public abstract class MultiLevelSkipListReader implements Closeable {
|
|||
/** skipInterval of each level. */
|
||||
private int skipInterval[];
|
||||
|
||||
/** Number of docs skipped per level. */
|
||||
/** Number of docs skipped per level.
|
||||
* It's possible for some values to overflow a signed int, but this has been accounted for.
|
||||
*/
|
||||
private int[] numSkipped;
|
||||
|
||||
/** Doc id of current skip entry per level. */
|
||||
|
@ -150,8 +152,9 @@ public abstract class MultiLevelSkipListReader implements Closeable {
|
|||
setLastSkipData(level);
|
||||
|
||||
numSkipped[level] += skipInterval[level];
|
||||
|
||||
if (numSkipped[level] > docCount) {
|
||||
|
||||
// numSkipped may overflow a signed int, so compare as unsigned.
|
||||
if (Integer.compareUnsigned(numSkipped[level], docCount) > 0) {
|
||||
// this skip list is exhausted
|
||||
skipDoc[level] = Integer.MAX_VALUE;
|
||||
if (numberOfSkipLevels > level) numberOfSkipLevels = level;
|
||||
|
|
|
@ -0,0 +1,135 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.index;
|
||||
|
||||
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.store.BaseDirectoryWrapper;
|
||||
import org.apache.lucene.store.MockDirectoryWrapper;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.LuceneTestCase.Monster;
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.TimeUnits;
|
||||
|
||||
@SuppressCodecs({"SimpleText", "Memory", "Direct"})
|
||||
@TimeoutSuite(millis = 80 * TimeUnits.HOUR) // effectively no limit
|
||||
@Monster("Takes ~30min")
|
||||
@SuppressSysoutChecks(bugUrl = "Stuff gets printed")
|
||||
public class Test2BDocs extends LuceneTestCase {
|
||||
|
||||
// indexes Integer.MAX_VALUE docs with indexed field(s)
|
||||
public void test2BDocs() throws Exception {
|
||||
BaseDirectoryWrapper dir = newFSDirectory(createTempDir("2BDocs"));
|
||||
if (dir instanceof MockDirectoryWrapper) {
|
||||
((MockDirectoryWrapper)dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER);
|
||||
}
|
||||
|
||||
IndexWriter w = new IndexWriter(dir,
|
||||
new IndexWriterConfig(new MockAnalyzer(random()))
|
||||
.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
|
||||
.setRAMBufferSizeMB(256.0)
|
||||
.setMergeScheduler(new ConcurrentMergeScheduler())
|
||||
.setMergePolicy(newLogMergePolicy(false, 10))
|
||||
.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
|
||||
.setCodec(TestUtil.getDefaultCodec()));
|
||||
|
||||
Document doc = new Document();
|
||||
Field field = new Field("f1", "a", StringField.TYPE_NOT_STORED);
|
||||
doc.add(field);
|
||||
|
||||
for (int i = 0; i < IndexWriter.MAX_DOCS; i++) {
|
||||
w.addDocument(doc);
|
||||
if (i % (10*1000*1000) == 0) {
|
||||
System.out.println("indexed: " + i);
|
||||
System.out.flush();
|
||||
}
|
||||
}
|
||||
|
||||
w.forceMerge(1);
|
||||
w.close();
|
||||
|
||||
System.out.println("verifying...");
|
||||
System.out.flush();
|
||||
|
||||
DirectoryReader r = DirectoryReader.open(dir);
|
||||
|
||||
BytesRef term = new BytesRef(1);
|
||||
term.bytes[0] = (byte)'a';
|
||||
term.length = 1;
|
||||
|
||||
long skips = 0;
|
||||
|
||||
Random rnd = random();
|
||||
|
||||
long start = System.nanoTime();
|
||||
|
||||
for (LeafReaderContext context : r.leaves()) {
|
||||
LeafReader reader = context.reader();
|
||||
int lim = context.reader().maxDoc();
|
||||
|
||||
Terms terms = reader.fields().terms("f1");
|
||||
for (int i=0; i<10000; i++) {
|
||||
TermsEnum te = terms.iterator();
|
||||
assertTrue( te.seekExact(term) );
|
||||
PostingsEnum docs = te.postings(null);
|
||||
|
||||
// skip randomly through the term
|
||||
for (int target = -1;;)
|
||||
{
|
||||
int maxSkipSize = lim - target + 1;
|
||||
// do a smaller skip half of the time
|
||||
if (rnd.nextBoolean()) {
|
||||
maxSkipSize = Math.min(256, maxSkipSize);
|
||||
}
|
||||
int newTarget = target + rnd.nextInt(maxSkipSize) + 1;
|
||||
if (newTarget >= lim) {
|
||||
if (target+1 >= lim) break; // we already skipped to end, so break.
|
||||
newTarget = lim-1; // skip to end
|
||||
}
|
||||
target = newTarget;
|
||||
|
||||
int res = docs.advance(target);
|
||||
if (res == PostingsEnum.NO_MORE_DOCS) break;
|
||||
|
||||
assertTrue( res >= target );
|
||||
|
||||
skips++;
|
||||
target = res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
r.close();
|
||||
dir.close();
|
||||
|
||||
long end = System.nanoTime();
|
||||
|
||||
System.out.println("Skip count=" + skips + " seconds=" + TimeUnit.NANOSECONDS.toSeconds(end-start));
|
||||
assert skips > 0;
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue