LUCENE-5879: fix off-by-one error (ob1) that caused an OOME in a test when the min and max auto-prefix terms were both 2; attempt to simplify the empty-string case

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1673075 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2015-04-12 22:43:15 +00:00
parent b84749fe51
commit 8332668279
1 changed files with 59 additions and 36 deletions

View File

@ -211,12 +211,17 @@ class AutoPrefixTermsWriter {
} }
} }
// Even though we visited terms in already-sorted order, the prefixes
// can be slightly unsorted, e.g. aaaaa will be before aaa, so we
// must sort here so our caller can do merge sort into actual terms
// when writing. Probably we should use CollectionUtil.timSort here?
Collections.sort(prefixes); Collections.sort(prefixes);
} }
/** Pushes the new term to the top of the stack, and writes new blocks. */ /** Pushes the new term to the top of the stack, and writes new blocks. */
private void pushTerm(BytesRef text) throws IOException { private void pushTerm(BytesRef text) throws IOException {
int limit = Math.min(lastTerm.length(), text.length); int limit = Math.min(lastTerm.length(), text.length);
//if (DEBUG) System.out.println("\nterm: " + text.utf8ToString());
// Find common prefix between last term and current term: // Find common prefix between last term and current term:
int pos = 0; int pos = 0;
@ -234,10 +239,10 @@ class AutoPrefixTermsWriter {
int prefixTopSize = pending.size() - prefixStarts[i]; int prefixTopSize = pending.size() - prefixStarts[i];
while (prefixTopSize >= minItemsInPrefix) { while (prefixTopSize >= minItemsInPrefix) {
//if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix); //if (DEBUG) System.out.println(" pop: i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix);
savePrefixes(i+1, prefixTopSize); savePrefixes(i+1, prefixTopSize);
//prefixStarts[i] -= prefixTopSize; //prefixStarts[i] -= prefixTopSize;
//System.out.println(" after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]); //if (DEBUG) System.out.println(" after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]);
// For large floor blocks, it's possible we should now re-run on the new prefix terms we just created: // For large floor blocks, it's possible we should now re-run on the new prefix terms we just created:
prefixTopSize = pending.size() - prefixStarts[i]; prefixTopSize = pending.size() - prefixStarts[i];
@ -267,27 +272,52 @@ class AutoPrefixTermsWriter {
assert count > 0; assert count > 0;
//if (DEBUG2) { /*
// BytesRef br = new BytesRef(lastTerm.bytes()); if (DEBUG2) {
// br.length = prefixLength; BytesRef br = new BytesRef(lastTerm.bytes());
// System.out.println(" savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size()); br.length = prefixLength;
//} //System.out.println(" savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
System.out.println(" savePrefixes: " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
}
*/
int lastSuffixLeadLabel = -2; int lastSuffixLeadLabel = -2;
int start = pending.size()-count; int start = pending.size()-count;
assert start >=0; assert start >=0;
// Special case empty-string suffix case: we are being asked to build prefix terms for all aaa* terms, but
// the exact term aaa is here, and we must skip it (it is handled "higher", under the aa* terms):
Object o = pending.get(start);
boolean skippedEmptyStringSuffix = false;
if (o instanceof byte[]) {
if (((byte[]) o).length == prefixLength) {
start++;
count--;
//if (DEBUG) System.out.println(" skip empty-string term suffix");
skippedEmptyStringSuffix = true;
}
} else {
PrefixTerm prefix = (PrefixTerm) o;
if (prefix.term.bytes.length == prefixLength) {
start++;
count--;
//if (DEBUG) System.out.println(" skip empty-string PT suffix");
skippedEmptyStringSuffix = true;
}
}
int end = pending.size(); int end = pending.size();
int nextBlockStart = start; int nextBlockStart = start;
int nextFloorLeadLabel = -1; int nextFloorLeadLabel = -1;
int prefixCount = 0; int prefixCount = 0;
int pendingCount = 0;
PrefixTerm lastPTEntry = null; PrefixTerm lastPTEntry = null;
for (int i=start; i<end; i++) { for (int i=start; i<end; i++) {
byte[] termBytes; byte[] termBytes;
Object o = pending.get(i); o = pending.get(i);
PrefixTerm ptEntry; PrefixTerm ptEntry;
if (o instanceof byte[]) { if (o instanceof byte[]) {
ptEntry = null; ptEntry = null;
@ -300,23 +330,15 @@ class AutoPrefixTermsWriter {
ptEntry = null; ptEntry = null;
} }
} }
pendingCount++;
//if (DEBUG) System.out.println(" check term=" + brToString(new BytesRef(termBytes))); //if (DEBUG) System.out.println(" check term=" + brToString(new BytesRef(termBytes)) + " o=" + o);
int suffixLeadLabel; // We handled the empty-string suffix case up front:
assert termBytes.length > prefixLength;
if (termBytes.length == prefixLength) { int suffixLeadLabel = termBytes[prefixLength] & 0xff;
// Suffix is 0, i.e. prefix 'foo' and term is
// 'foo' so the term has empty string suffix
// in this block
assert lastSuffixLeadLabel == -2;
suffixLeadLabel = -2;
} else {
suffixLeadLabel = termBytes[prefixLength] & 0xff;
}
// if (DEBUG) System.out.println(" i=" + i + " ent=" + ent + " suffixLeadLabel=" + suffixLeadLabel); //if (DEBUG) System.out.println(" i=" + i + " o=" + o + " suffixLeadLabel=" + Integer.toHexString(suffixLeadLabel) + " pendingCount=" + (i - nextBlockStart) + " min=" + minItemsInPrefix);
if (suffixLeadLabel != lastSuffixLeadLabel) { if (suffixLeadLabel != lastSuffixLeadLabel) {
// This is a boundary, a chance to make an auto-prefix term if we want: // This is a boundary, a chance to make an auto-prefix term if we want:
@ -327,8 +349,9 @@ class AutoPrefixTermsWriter {
// than the lead start of the current entry: // than the lead start of the current entry:
assert suffixLeadLabel > lastSuffixLeadLabel: "suffixLeadLabel=" + suffixLeadLabel + " vs lastSuffixLeadLabel=" + lastSuffixLeadLabel; assert suffixLeadLabel > lastSuffixLeadLabel: "suffixLeadLabel=" + suffixLeadLabel + " vs lastSuffixLeadLabel=" + lastSuffixLeadLabel;
// NOTE: must check nextFloorLeadLabel in case minItemsInPrefix is 2 and prefix is 'a' and we've seen 'a' and then 'aa' int itemsInBlock = i - nextBlockStart;
if (pendingCount >= minItemsInPrefix && end-nextBlockStart > maxItemsInPrefix && nextFloorLeadLabel != -1) {
if (itemsInBlock >= minItemsInPrefix && end-nextBlockStart > maxItemsInPrefix) {
// The count is too large for one block, so we must break it into "floor" blocks, where we record // The count is too large for one block, so we must break it into "floor" blocks, where we record
// the leading label of the suffix of the first term in each floor block, so at search time we can // the leading label of the suffix of the first term in each floor block, so at search time we can
// jump to the right floor block. We just use a naive greedy segmenter here: make a new floor // jump to the right floor block. We just use a naive greedy segmenter here: make a new floor
@ -338,11 +361,10 @@ class AutoPrefixTermsWriter {
// If the last entry was another prefix term of the same length, then it represents a range of terms, so we must use its ending // If the last entry was another prefix term of the same length, then it represents a range of terms, so we must use its ending
// prefix label as our ending label: // prefix label as our ending label:
if (lastPTEntry != null) { if (lastPTEntry != null) {
//if (DEBUG) System.out.println(" use last");
lastSuffixLeadLabel = lastPTEntry.floorLeadEnd; lastSuffixLeadLabel = lastPTEntry.floorLeadEnd;
} }
savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel); savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel);
pendingCount = 0;
prefixCount++; prefixCount++;
nextFloorLeadLabel = suffixLeadLabel; nextFloorLeadLabel = suffixLeadLabel;
@ -356,6 +378,7 @@ class AutoPrefixTermsWriter {
lastSuffixLeadLabel = suffixLeadLabel; lastSuffixLeadLabel = suffixLeadLabel;
} }
lastPTEntry = ptEntry; lastPTEntry = ptEntry;
} }
@ -370,6 +393,12 @@ class AutoPrefixTermsWriter {
if (prefixLength > 0) { if (prefixLength > 0) {
savePrefix(prefixLength, -2, 0xff); savePrefix(prefixLength, -2, 0xff);
prefixCount++; prefixCount++;
// If we skipped empty string suffix, e.g. term aaa for prefix aaa*, since we
// are now writing the full aaa* prefix term, we include it here:
if (skippedEmptyStringSuffix) {
count++;
}
} else { } else {
// Don't add a prefix term for all terms in the index! // Don't add a prefix term for all terms in the index!
} }
@ -384,16 +413,8 @@ class AutoPrefixTermsWriter {
} }
// Remove slice from the top of the pending stack, that we just wrote: // Remove slice from the top of the pending stack, that we just wrote:
int sizeToClear = count;
if (prefixCount > 1) { pending.subList(pending.size()-count, pending.size()).clear();
Object o = pending.get(pending.size()-count);
if (o instanceof byte[] && ((byte[]) o).length == prefixLength) {
// If we were just asked to write all f* terms, but there were too many and so we made floor blocks, the exact term 'f' will remain
// as its own item, followed by floor block terms like f[a-m]*, f[n-z]*, so in this case we leave 3 (not 2) items on the pending stack:
sizeToClear--;
}
}
pending.subList(pending.size()-sizeToClear, pending.size()).clear();
// Append prefix terms for each prefix, since these count like real terms that also need to be "rolled up": // Append prefix terms for each prefix, since these count like real terms that also need to be "rolled up":
for(int i=0;i<prefixCount;i++) { for(int i=0;i<prefixCount;i++) {
@ -410,6 +431,8 @@ class AutoPrefixTermsWriter {
PrefixTerm pt = new PrefixTerm(prefix, floorLeadStart, floorLeadEnd); PrefixTerm pt = new PrefixTerm(prefix, floorLeadStart, floorLeadEnd);
//if (DEBUG2) System.out.println(" savePrefix: seg=" + segment + " " + pt + " count=" + count); //if (DEBUG2) System.out.println(" savePrefix: seg=" + segment + " " + pt + " count=" + count);
//if (DEBUG) System.out.println(" savePrefix: " + pt);
prefixes.add(pt); prefixes.add(pt);
} }
} }