From 8332668279ff7c649d6fee62a9c5b600707eee68 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Sun, 12 Apr 2015 22:43:15 +0000 Subject: [PATCH] LUCENE-5879: fix ob1 that caused OOME in test when min and max auto-prefix terms was 2; attempt to simplify empty string case git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1673075 13f79535-47bb-0310-9956-ffa450edef68 --- .../blocktree/AutoPrefixTermsWriter.java | 95 ++++++++++++------- 1 file changed, 59 insertions(+), 36 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java index 1055c03f2b5..882f5cd35d5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java @@ -211,12 +211,17 @@ class AutoPrefixTermsWriter { } } + // Even though we visited terms in already-sorted order, the prefixes + // can be slightly unsorted, e.g. aaaaa will be before aaa, so we + // must sort here so our caller can do merge sort into actual terms + // when writing. Probably we should use CollectionUtil.timSort here? Collections.sort(prefixes); } /** Pushes the new term to the top of the stack, and writes new blocks. */ private void pushTerm(BytesRef text) throws IOException { int limit = Math.min(lastTerm.length(), text.length); + //if (DEBUG) System.out.println("\nterm: " + text.utf8ToString()); // Find common prefix between last term and current term: int pos = 0; @@ -234,10 +239,10 @@ class AutoPrefixTermsWriter { int prefixTopSize = pending.size() - prefixStarts[i]; while (prefixTopSize >= minItemsInPrefix) { - //if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix); + //if (DEBUG) System.out.println(" pop: i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix); savePrefixes(i+1, prefixTopSize); //prefixStarts[i] -= prefixTopSize; - //System.out.println(" after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]); + //if (DEBUG) System.out.println(" after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]); // For large floor blocks, it's possible we should now re-run on the new prefix terms we just created: prefixTopSize = pending.size() - prefixStarts[i]; @@ -267,27 +272,52 @@ class AutoPrefixTermsWriter { assert count > 0; - //if (DEBUG2) { - // BytesRef br = new BytesRef(lastTerm.bytes()); - // br.length = prefixLength; - // System.out.println(" savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size()); - //} + /* + if (DEBUG2) { + BytesRef br = new BytesRef(lastTerm.bytes()); + br.length = prefixLength; + //System.out.println(" savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size()); + System.out.println(" savePrefixes: " + brToString(br) + " count=" + count + " pending.size()=" + pending.size()); + } + */ int lastSuffixLeadLabel = -2; int start = pending.size()-count; assert start >=0; + // Special case empty-string suffix case: we are being asked to build prefix terms for all aaa* terms, but + // the exact term aaa is here, and we must skip it (it is handled "higher", under the aa* terms): + Object o = pending.get(start); + boolean skippedEmptyStringSuffix = false; + if (o instanceof byte[]) { + if (((byte[]) o).length == prefixLength) { + start++; + count--; + //if (DEBUG) System.out.println(" skip empty-string term suffix"); + skippedEmptyStringSuffix = true; + } + } else { + PrefixTerm prefix = (PrefixTerm) o; + if (prefix.term.bytes.length == prefixLength) { + start++; + count--; + //if (DEBUG) System.out.println(" skip empty-string PT suffix"); + skippedEmptyStringSuffix = true; + } + } + int end = pending.size(); int nextBlockStart = start; int nextFloorLeadLabel = -1; int prefixCount = 0; - int pendingCount = 0; + PrefixTerm lastPTEntry = null; + for (int i=start; i 0) { savePrefix(prefixLength, -2, 0xff); prefixCount++; + + // If we skipped empty string suffix, e.g. term aaa for prefix aaa*, since we + // are now writing the full aaa* prefix term, we include it here: + if (skippedEmptyStringSuffix) { + count++; + } } else { // Don't add a prefix term for all terms in the index! } @@ -384,16 +413,8 @@ class AutoPrefixTermsWriter { } // Remove slice from the top of the pending stack, that we just wrote: - int sizeToClear = count; - if (prefixCount > 1) { - Object o = pending.get(pending.size()-count); - if (o instanceof byte[] && ((byte[]) o).length == prefixLength) { - // If we were just asked to write all f* terms, but there were too many and so we made floor blocks, the exact term 'f' will remain - // as its own item, followed by floor block terms like f[a-m]*, f[n-z]*, so in this case we leave 3 (not 2) items on the pending stack: - sizeToClear--; - } - } - pending.subList(pending.size()-sizeToClear, pending.size()).clear(); + + pending.subList(pending.size()-count, pending.size()).clear(); // Append prefix terms for each prefix, since these count like real terms that also need to be "rolled up": for(int i=0;i