LUCENE-5879: fix ob1 that caused OOME in test when min and max auto-prefix terms was 2; attempt to simplify empty string case

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1673075 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2015-04-12 22:43:15 +00:00
parent b84749fe51
commit 8332668279
1 changed files with 59 additions and 36 deletions

View File

@ -211,12 +211,17 @@ class AutoPrefixTermsWriter {
}
}
// Even though we visited terms in already-sorted order, the prefixes
// can be slightly unsorted, e.g. aaaaa will be before aaa, so we
// must sort here so our caller can do merge sort into actual terms
// when writing. Probably we should use CollectionUtil.timSort here?
Collections.sort(prefixes);
}
/** Pushes the new term to the top of the stack, and writes new blocks. */
private void pushTerm(BytesRef text) throws IOException {
int limit = Math.min(lastTerm.length(), text.length);
//if (DEBUG) System.out.println("\nterm: " + text.utf8ToString());
// Find common prefix between last term and current term:
int pos = 0;
@ -234,10 +239,10 @@ class AutoPrefixTermsWriter {
int prefixTopSize = pending.size() - prefixStarts[i];
while (prefixTopSize >= minItemsInPrefix) {
//if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix);
//if (DEBUG) System.out.println(" pop: i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix);
savePrefixes(i+1, prefixTopSize);
//prefixStarts[i] -= prefixTopSize;
//System.out.println(" after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]);
//if (DEBUG) System.out.println(" after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]);
// For large floor blocks, it's possible we should now re-run on the new prefix terms we just created:
prefixTopSize = pending.size() - prefixStarts[i];
@ -267,27 +272,52 @@ class AutoPrefixTermsWriter {
assert count > 0;
//if (DEBUG2) {
// BytesRef br = new BytesRef(lastTerm.bytes());
// br.length = prefixLength;
// System.out.println(" savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
//}
/*
if (DEBUG2) {
BytesRef br = new BytesRef(lastTerm.bytes());
br.length = prefixLength;
//System.out.println(" savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
System.out.println(" savePrefixes: " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
}
*/
int lastSuffixLeadLabel = -2;
int start = pending.size()-count;
assert start >=0;
// Special case empty-string suffix case: we are being asked to build prefix terms for all aaa* terms, but
// the exact term aaa is here, and we must skip it (it is handled "higher", under the aa* terms):
Object o = pending.get(start);
boolean skippedEmptyStringSuffix = false;
if (o instanceof byte[]) {
if (((byte[]) o).length == prefixLength) {
start++;
count--;
//if (DEBUG) System.out.println(" skip empty-string term suffix");
skippedEmptyStringSuffix = true;
}
} else {
PrefixTerm prefix = (PrefixTerm) o;
if (prefix.term.bytes.length == prefixLength) {
start++;
count--;
//if (DEBUG) System.out.println(" skip empty-string PT suffix");
skippedEmptyStringSuffix = true;
}
}
int end = pending.size();
int nextBlockStart = start;
int nextFloorLeadLabel = -1;
int prefixCount = 0;
int pendingCount = 0;
PrefixTerm lastPTEntry = null;
for (int i=start; i<end; i++) {
byte[] termBytes;
Object o = pending.get(i);
o = pending.get(i);
PrefixTerm ptEntry;
if (o instanceof byte[]) {
ptEntry = null;
@ -300,23 +330,15 @@ class AutoPrefixTermsWriter {
ptEntry = null;
}
}
pendingCount++;
//if (DEBUG) System.out.println(" check term=" + brToString(new BytesRef(termBytes)));
//if (DEBUG) System.out.println(" check term=" + brToString(new BytesRef(termBytes)) + " o=" + o);
int suffixLeadLabel;
// We handled the empty-string suffix case up front:
assert termBytes.length > prefixLength;
if (termBytes.length == prefixLength) {
// Suffix is 0, i.e. prefix 'foo' and term is
// 'foo' so the term has empty string suffix
// in this block
assert lastSuffixLeadLabel == -2;
suffixLeadLabel = -2;
} else {
suffixLeadLabel = termBytes[prefixLength] & 0xff;
}
int suffixLeadLabel = termBytes[prefixLength] & 0xff;
// if (DEBUG) System.out.println(" i=" + i + " ent=" + ent + " suffixLeadLabel=" + suffixLeadLabel);
//if (DEBUG) System.out.println(" i=" + i + " o=" + o + " suffixLeadLabel=" + Integer.toHexString(suffixLeadLabel) + " pendingCount=" + (i - nextBlockStart) + " min=" + minItemsInPrefix);
if (suffixLeadLabel != lastSuffixLeadLabel) {
// This is a boundary, a chance to make an auto-prefix term if we want:
@ -327,8 +349,9 @@ class AutoPrefixTermsWriter {
// than the lead start of the current entry:
assert suffixLeadLabel > lastSuffixLeadLabel: "suffixLeadLabel=" + suffixLeadLabel + " vs lastSuffixLeadLabel=" + lastSuffixLeadLabel;
// NOTE: must check nextFloorLeadLabel in case minItemsInPrefix is 2 and prefix is 'a' and we've seen 'a' and then 'aa'
if (pendingCount >= minItemsInPrefix && end-nextBlockStart > maxItemsInPrefix && nextFloorLeadLabel != -1) {
int itemsInBlock = i - nextBlockStart;
if (itemsInBlock >= minItemsInPrefix && end-nextBlockStart > maxItemsInPrefix) {
// The count is too large for one block, so we must break it into "floor" blocks, where we record
// the leading label of the suffix of the first term in each floor block, so at search time we can
// jump to the right floor block. We just use a naive greedy segmenter here: make a new floor
@ -338,11 +361,10 @@ class AutoPrefixTermsWriter {
// If the last entry was another prefix term of the same length, then it represents a range of terms, so we must use its ending
// prefix label as our ending label:
if (lastPTEntry != null) {
//if (DEBUG) System.out.println(" use last");
lastSuffixLeadLabel = lastPTEntry.floorLeadEnd;
}
savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel);
pendingCount = 0;
prefixCount++;
nextFloorLeadLabel = suffixLeadLabel;
@ -356,6 +378,7 @@ class AutoPrefixTermsWriter {
lastSuffixLeadLabel = suffixLeadLabel;
}
lastPTEntry = ptEntry;
}
@ -370,6 +393,12 @@ class AutoPrefixTermsWriter {
if (prefixLength > 0) {
savePrefix(prefixLength, -2, 0xff);
prefixCount++;
// If we skipped empty string suffix, e.g. term aaa for prefix aaa*, since we
// are now writing the full aaa* prefix term, we include it here:
if (skippedEmptyStringSuffix) {
count++;
}
} else {
// Don't add a prefix term for all terms in the index!
}
@ -384,16 +413,8 @@ class AutoPrefixTermsWriter {
}
// Remove slice from the top of the pending stack, that we just wrote:
int sizeToClear = count;
if (prefixCount > 1) {
Object o = pending.get(pending.size()-count);
if (o instanceof byte[] && ((byte[]) o).length == prefixLength) {
// If we were just asked to write all f* terms, but there were too many and so we made floor blocks, the exact term 'f' will remain
// as its own item, followed by floor block terms like f[a-m]*, f[n-z]*, so in this case we leave 3 (not 2) items on the pending stack:
sizeToClear--;
}
}
pending.subList(pending.size()-sizeToClear, pending.size()).clear();
pending.subList(pending.size()-count, pending.size()).clear();
// Append prefix terms for each prefix, since these count like real terms that also need to be "rolled up":
for(int i=0;i<prefixCount;i++) {
@ -410,6 +431,8 @@ class AutoPrefixTermsWriter {
PrefixTerm pt = new PrefixTerm(prefix, floorLeadStart, floorLeadEnd);
//if (DEBUG2) System.out.println(" savePrefix: seg=" + segment + " " + pt + " count=" + count);
//if (DEBUG) System.out.println(" savePrefix: " + pt);
prefixes.add(pt);
}
}