LUCENE-7440: fix MultiLevelSkipListReader overflow

2016-09-10 15:58:24 -04:00 · 2016-09-10 15:58:24 -04:00 · c929d0595c
parent 5aedab619a
commit c929d0595c
3 changed files with 145 additions and 3 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -49,6 +49,10 @@ Bug Fixes
  trying to highlight a query containing a degenerate case of a MultiPhraseQuery with one
  term.  (Thomas Kappler via David Smiley)

+* LUCENE-7440: Document id skipping (PostingsEnum.advance) could throw an
+  ArrayIndexOutOfBoundsException exception on large index segments (>1.8B docs)
+  with large skips. (yonik)
+
 Improvements

 Optimizations
--- a/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java
@ -63,7 +63,9 @@ public abstract class MultiLevelSkipListReader implements Closeable {
  /**  skipInterval of each level. */
  private int skipInterval[];

-  /** Number of docs skipped per level. */
+  /** Number of docs skipped per level.
+   * It's possible for some values to overflow a signed int, but this has been accounted for.
+   */
  private int[] numSkipped;

  /** Doc id of current skip entry per level. */
@ -151,7 +153,8 @@ public abstract class MultiLevelSkipListReader implements Closeable {
      
    numSkipped[level] += skipInterval[level];

-    if (numSkipped[level] > docCount) {
+    // numSkipped may overflow a signed int, so compare as unsigned.
+    if (Integer.compareUnsigned(numSkipped[level], docCount) > 0) {
      // this skip list is exhausted
      skipDoc[level] = Integer.MAX_VALUE;
      if (numberOfSkipLevels > level) numberOfSkipLevels = level; 
--- a/lucene/core/src/test/org/apache/lucene/index/Test2BDocs.java
+++ b/lucene/core/src/test/org/apache/lucene/index/Test2BDocs.java
@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.store.BaseDirectoryWrapper;
+import org.apache.lucene.store.MockDirectoryWrapper;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LuceneTestCase.Monster;
+import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
+import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.TimeUnits;
+
+@SuppressCodecs({"SimpleText", "Memory", "Direct"})
+@TimeoutSuite(millis = 80 * TimeUnits.HOUR) // effectively no limit
+@Monster("Takes ~30min")
+@SuppressSysoutChecks(bugUrl = "Stuff gets printed")
+public class Test2BDocs extends LuceneTestCase {
+  
+  // indexes Integer.MAX_VALUE docs with indexed field(s)
+  public void test2BDocs() throws Exception {
+    BaseDirectoryWrapper dir = newFSDirectory(createTempDir("2BDocs"));
+    if (dir instanceof MockDirectoryWrapper) {
+      ((MockDirectoryWrapper)dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER);
+    }
+    
+    IndexWriter w = new IndexWriter(dir,
+        new IndexWriterConfig(new MockAnalyzer(random()))
+        .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
+        .setRAMBufferSizeMB(256.0)
+        .setMergeScheduler(new ConcurrentMergeScheduler())
+        .setMergePolicy(newLogMergePolicy(false, 10))
+        .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
+        .setCodec(TestUtil.getDefaultCodec()));
+
+    Document doc = new Document();
+    Field field = new Field("f1", "a", StringField.TYPE_NOT_STORED);
+    doc.add(field);
+    
+    for (int i = 0; i < IndexWriter.MAX_DOCS; i++) {
+      w.addDocument(doc);
+      if (i % (10*1000*1000) == 0) {
+        System.out.println("indexed: " + i);
+        System.out.flush();
+      }
+    }
+    
+    w.forceMerge(1);
+    w.close();
+    
+    System.out.println("verifying...");
+    System.out.flush();
+    
+    DirectoryReader r = DirectoryReader.open(dir);
+
+    BytesRef term = new BytesRef(1);
+    term.bytes[0] = (byte)'a';
+    term.length = 1;
+
+    long skips = 0;
+
+    Random rnd = random();
+
+    long start = System.nanoTime();
+
+    for (LeafReaderContext context : r.leaves()) {
+      LeafReader reader = context.reader();
+      int lim = context.reader().maxDoc();
+
+      Terms terms = reader.fields().terms("f1");
+      for (int i=0; i<10000; i++) {
+        TermsEnum te = terms.iterator();
+        assertTrue( te.seekExact(term) );
+        PostingsEnum docs = te.postings(null);
+
+        // skip randomly through the term
+        for (int target = -1;;)
+        {
+          int maxSkipSize = lim - target + 1;
+          // do a smaller skip half of the time
+          if (rnd.nextBoolean()) {
+            maxSkipSize = Math.min(256, maxSkipSize);
+          }
+          int newTarget = target + rnd.nextInt(maxSkipSize) + 1;
+          if (newTarget >= lim) {
+            if (target+1 >= lim) break; // we already skipped to end, so break.
+            newTarget = lim-1;  // skip to end
+          }
+          target = newTarget;
+
+          int res = docs.advance(target);
+          if (res == PostingsEnum.NO_MORE_DOCS) break;
+
+          assertTrue( res >= target );
+
+          skips++;
+          target = res;
+        }
+      }
+    }
+    
+    r.close();
+    dir.close();
+
+    long end = System.nanoTime();
+
+    System.out.println("Skip count=" + skips + " seconds=" + TimeUnit.NANOSECONDS.toSeconds(end-start));
+    assert skips > 0;
+  }
+  
+}