LUCENE-5879: fix ob1 that caused OOME in test when min and max auto-prefix terms was 2; attempt to simplify empty string case

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1673075 13f79535-47bb-0310-9956-ffa450edef68
2015-04-12 22:43:15 +00:00 · 2015-04-12 22:43:15 +00:00 · 8332668279
parent b84749fe51
commit 8332668279
1 changed files with 59 additions and 36 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java
@ -211,12 +211,17 @@ class AutoPrefixTermsWriter {
      }
    }
    // Even though we visited terms in already-sorted order, the prefixes
    // can be slightly unsorted, e.g. aaaaa will be before aaa, so we
    // must sort here so our caller can do merge sort into actual terms
    // when writing.  Probably we should use CollectionUtil.timSort here?
    Collections.sort(prefixes);
  }
  /** Pushes the new term to the top of the stack, and writes new blocks. */
  private void pushTerm(BytesRef text) throws IOException {
    int limit = Math.min(lastTerm.length(), text.length);
    //if (DEBUG) System.out.println("\nterm: " + text.utf8ToString());
    // Find common prefix between last term and current term:
    int pos = 0;
@ -234,10 +239,10 @@ class AutoPrefixTermsWriter {
      int prefixTopSize = pending.size() - prefixStarts[i];
      while (prefixTopSize >= minItemsInPrefix) {       
-        //if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix);
+        //if (DEBUG) System.out.println("  pop: i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix);
        savePrefixes(i+1, prefixTopSize);
        //prefixStarts[i] -= prefixTopSize;
-        //System.out.println("    after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]);
+        //if (DEBUG) System.out.println("    after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]);
        // For large floor blocks, it's possible we should now re-run on the new prefix terms we just created:
        prefixTopSize = pending.size() - prefixStarts[i];
@ -267,27 +272,52 @@ class AutoPrefixTermsWriter {
    assert count > 0;
-    //if (DEBUG2) {
+    /*
-    //  BytesRef br = new BytesRef(lastTerm.bytes());
+    if (DEBUG2) {
-    //  br.length = prefixLength;
+      BytesRef br = new BytesRef(lastTerm.bytes());
-    //  System.out.println("  savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
+      br.length = prefixLength;
-    //}
+      //System.out.println("  savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
      System.out.println("  savePrefixes: " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
    }
    */
    int lastSuffixLeadLabel = -2;
    int start = pending.size()-count;
    assert start >=0;
    // Special case empty-string suffix case: we are being asked to build prefix terms for all aaa* terms, but 
    // the exact term aaa is here, and we must skip it (it is handled "higher", under the aa* terms):
    Object o = pending.get(start);
    boolean skippedEmptyStringSuffix = false;
    if (o instanceof byte[]) {
      if (((byte[]) o).length == prefixLength) {
        start++;
        count--;
        //if (DEBUG) System.out.println("  skip empty-string term suffix");
        skippedEmptyStringSuffix = true;
      }
    } else {
      PrefixTerm prefix = (PrefixTerm) o;
      if (prefix.term.bytes.length == prefixLength) {
        start++;
        count--;
        //if (DEBUG) System.out.println("  skip empty-string PT suffix");
        skippedEmptyStringSuffix = true;
      }
    }
    int end = pending.size();
    int nextBlockStart = start;
    int nextFloorLeadLabel = -1;
    int prefixCount = 0;
-    int pendingCount = 0;
+
    PrefixTerm lastPTEntry = null;
    for (int i=start; i<end; i++) {
      byte[] termBytes;
-      Object o = pending.get(i);
+      o = pending.get(i);
      PrefixTerm ptEntry;
      if (o instanceof byte[]) {
        ptEntry = null;
@ -300,23 +330,15 @@ class AutoPrefixTermsWriter {
          ptEntry = null;
        }
      }
      pendingCount++;
-      //if (DEBUG) System.out.println("    check term=" + brToString(new BytesRef(termBytes)));
+      //if (DEBUG) System.out.println("    check term=" + brToString(new BytesRef(termBytes)) + " o=" + o);
-      int suffixLeadLabel;
+      // We handled the empty-string suffix case up front:
      assert termBytes.length > prefixLength;
-      if (termBytes.length == prefixLength) {
+      int suffixLeadLabel = termBytes[prefixLength] & 0xff;
        // Suffix is 0, i.e. prefix 'foo' and term is
        // 'foo' so the term has empty string suffix
        // in this block
        assert lastSuffixLeadLabel == -2;
        suffixLeadLabel = -2;
      } else {
        suffixLeadLabel = termBytes[prefixLength] & 0xff;
      }
-      // if (DEBUG) System.out.println("  i=" + i + " ent=" + ent + " suffixLeadLabel=" + suffixLeadLabel);
+      //if (DEBUG) System.out.println("  i=" + i + " o=" + o + " suffixLeadLabel=" + Integer.toHexString(suffixLeadLabel) + " pendingCount=" + (i - nextBlockStart) + " min=" + minItemsInPrefix);
      if (suffixLeadLabel != lastSuffixLeadLabel) {
        // This is a boundary, a chance to make an auto-prefix term if we want:
@ -327,8 +349,9 @@ class AutoPrefixTermsWriter {
        // than the lead start of the current entry:
        assert suffixLeadLabel > lastSuffixLeadLabel: "suffixLeadLabel=" + suffixLeadLabel + " vs lastSuffixLeadLabel=" + lastSuffixLeadLabel;
-        // NOTE: must check nextFloorLeadLabel in case minItemsInPrefix is 2 and prefix is 'a' and we've seen 'a' and then 'aa'
+        int itemsInBlock = i - nextBlockStart;
-        if (pendingCount >= minItemsInPrefix && end-nextBlockStart > maxItemsInPrefix && nextFloorLeadLabel != -1) {
+
        if (itemsInBlock >= minItemsInPrefix && end-nextBlockStart > maxItemsInPrefix) {
          // The count is too large for one block, so we must break it into "floor" blocks, where we record
          // the leading label of the suffix of the first term in each floor block, so at search time we can
          // jump to the right floor block.  We just use a naive greedy segmenter here: make a new floor
@ -338,11 +361,10 @@ class AutoPrefixTermsWriter {
          // If the last entry was another prefix term of the same length, then it represents a range of terms, so we must use its ending
          // prefix label as our ending label:
          if (lastPTEntry != null) {
            //if (DEBUG) System.out.println("  use last");
            lastSuffixLeadLabel = lastPTEntry.floorLeadEnd;
          }
          savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel);
          pendingCount = 0;
          prefixCount++;
          nextFloorLeadLabel = suffixLeadLabel;
@ -356,6 +378,7 @@ class AutoPrefixTermsWriter {
        lastSuffixLeadLabel = suffixLeadLabel;
      }
      lastPTEntry = ptEntry;
    }
@ -370,6 +393,12 @@ class AutoPrefixTermsWriter {
        if (prefixLength > 0) {
          savePrefix(prefixLength, -2, 0xff);
          prefixCount++;
          // If we skipped empty string suffix, e.g. term aaa for prefix aaa*, since we
          // are now writing the full aaa* prefix term, we include it here:
          if (skippedEmptyStringSuffix) {
            count++;
          }
        } else {
          // Don't add a prefix term for all terms in the index!
        }
@ -384,16 +413,8 @@ class AutoPrefixTermsWriter {
    }
    // Remove slice from the top of the pending stack, that we just wrote:
-    int sizeToClear = count;
+
-    if (prefixCount > 1) {
+    pending.subList(pending.size()-count, pending.size()).clear();
      Object o = pending.get(pending.size()-count);
      if (o instanceof byte[] && ((byte[]) o).length == prefixLength) {
        // If we were just asked to write all f* terms, but there were too many and so we made floor blocks, the exact term 'f' will remain
        // as its own item, followed by floor block terms like f[a-m]*, f[n-z]*, so in this case we leave 3 (not 2) items on the pending stack:
        sizeToClear--;
      }
    }
    pending.subList(pending.size()-sizeToClear, pending.size()).clear();
    // Append prefix terms for each prefix, since these count like real terms that also need to be "rolled up":
    for(int i=0;i<prefixCount;i++) {
@ -410,6 +431,8 @@ class AutoPrefixTermsWriter {
    PrefixTerm pt = new PrefixTerm(prefix, floorLeadStart, floorLeadEnd); 
    //if (DEBUG2) System.out.println("    savePrefix: seg=" + segment + " " + pt + " count=" + count);
    //if (DEBUG) System.out.println("    savePrefix: " + pt);
    prefixes.add(pt);
  }
 }