LUCENE-7440: fix MultiLevelSkipListReader overflow

yonik 2016-09-10 15:58:24 -04:00
parent 5aedab619a
commit c929d0595c
3 changed files with 145 additions and 3 deletions


@@ -49,6 +49,10 @@ Bug Fixes
   trying to highlight a query containing a degenerate case of a MultiPhraseQuery with one
   term. (Thomas Kappler via David Smiley)
 
+* LUCENE-7440: Document id skipping (PostingsEnum.advance) could throw an
+  ArrayIndexOutOfBoundsException on large index segments (>1.8B docs)
+  with large skips. (yonik)
+
 Improvements
 
 Optimizations


@@ -63,7 +63,9 @@ public abstract class MultiLevelSkipListReader implements Closeable {
   /** skipInterval of each level. */
   private int skipInterval[];
 
-  /** Number of docs skipped per level. */
+  /** Number of docs skipped per level.
+   * It's possible for some values to overflow a signed int, but this has been accounted for.
+   */
   private int[] numSkipped;
 
   /** Doc id of current skip entry per level. */
@@ -151,7 +153,8 @@ public abstract class MultiLevelSkipListReader implements Closeable {
       numSkipped[level] += skipInterval[level];
-      if (numSkipped[level] > docCount) {
+      // numSkipped may overflow a signed int, so compare as unsigned.
+      if (Integer.compareUnsigned(numSkipped[level], docCount) > 0) {
        // this skip list is exhausted
        skipDoc[level] = Integer.MAX_VALUE;
        if (numberOfSkipLevels > level) numberOfSkipLevels = level;
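For illustration only (not part of the commit), here is a minimal standalone sketch of why the signed comparison fails on huge segments: once the accumulated per-level skip count passes Integer.MAX_VALUE it wraps negative, so the old numSkipped[level] > docCount test never detects exhaustion, while the unsigned comparison still does. The constants below are hypothetical values chosen only to trigger the wraparound.

public class SkipCountOverflowDemo {
  public static void main(String[] args) {
    int docCount = 2_100_000_000;        // segment with ~2.1B documents
    int numSkipped = 2_080_000_000;      // docs already skipped at a high skip level
    int skipInterval = 134_217_728;      // that level's skip interval (hypothetical)

    numSkipped += skipInterval;          // exceeds Integer.MAX_VALUE and wraps negative

    // old check: false, because the wrapped value is negative
    System.out.println(numSkipped > docCount);
    // fixed check: true, because the bits are compared as an unsigned value
    System.out.println(Integer.compareUnsigned(numSkipped, docCount) > 0);
  }
}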


@@ -0,0 +1,135 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.Monster;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.TimeUnits;
@SuppressCodecs({"SimpleText", "Memory", "Direct"})
@TimeoutSuite(millis = 80 * TimeUnits.HOUR) // effectively no limit
@Monster("Takes ~30min")
@SuppressSysoutChecks(bugUrl = "Stuff gets printed")
public class Test2BDocs extends LuceneTestCase {
  // indexes Integer.MAX_VALUE docs with indexed field(s)
  public void test2BDocs() throws Exception {
    BaseDirectoryWrapper dir = newFSDirectory(createTempDir("2BDocs"));
    if (dir instanceof MockDirectoryWrapper) {
      ((MockDirectoryWrapper)dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER);
    }

    IndexWriter w = new IndexWriter(dir,
        new IndexWriterConfig(new MockAnalyzer(random()))
            .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
            .setRAMBufferSizeMB(256.0)
            .setMergeScheduler(new ConcurrentMergeScheduler())
            .setMergePolicy(newLogMergePolicy(false, 10))
            .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            .setCodec(TestUtil.getDefaultCodec()));

    Document doc = new Document();
    Field field = new Field("f1", "a", StringField.TYPE_NOT_STORED);
    doc.add(field);

    for (int i = 0; i < IndexWriter.MAX_DOCS; i++) {
      w.addDocument(doc);
      if (i % (10*1000*1000) == 0) {
        System.out.println("indexed: " + i);
        System.out.flush();
      }
    }

    w.forceMerge(1);
    w.close();

    System.out.println("verifying...");
    System.out.flush();

    DirectoryReader r = DirectoryReader.open(dir);

    BytesRef term = new BytesRef(1);
    term.bytes[0] = (byte)'a';
    term.length = 1;

    long skips = 0;
    Random rnd = random();
    long start = System.nanoTime();

    for (LeafReaderContext context : r.leaves()) {
      LeafReader reader = context.reader();
      int lim = context.reader().maxDoc();

      Terms terms = reader.fields().terms("f1");
      for (int i=0; i<10000; i++) {
        TermsEnum te = terms.iterator();
        assertTrue( te.seekExact(term) );
        PostingsEnum docs = te.postings(null);

        // skip randomly through the term
        for (int target = -1;;)
        {
          int maxSkipSize = lim - target + 1;
          // do a smaller skip half of the time
          if (rnd.nextBoolean()) {
            maxSkipSize = Math.min(256, maxSkipSize);
          }
          int newTarget = target + rnd.nextInt(maxSkipSize) + 1;
          if (newTarget >= lim) {
            if (target+1 >= lim) break; // we already skipped to end, so break.
            newTarget = lim-1;  // skip to end
          }
          target = newTarget;

          int res = docs.advance(target);
          if (res == PostingsEnum.NO_MORE_DOCS) break;

          assertTrue( res >= target );
          skips++;
          target = res;
        }
      }
    }

    r.close();
    dir.close();

    long end = System.nanoTime();
    System.out.println("Skip count=" + skips + " seconds=" + TimeUnit.NANOSECONDS.toSeconds(end-start));
    assert skips > 0;
  }
}