mirror of https://github.com/apache/lucene.git
LUCENE-5879: fix off-by-one (ob1) error that caused an OutOfMemoryError (OOME) in a test when the min and max auto-prefix terms were both 2; attempt to simplify the empty-string case
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1673075 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b84749fe51
commit
8332668279
|
@ -211,12 +211,17 @@ class AutoPrefixTermsWriter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Even though we visited terms in already-sorted order, the prefixes
|
||||||
|
// can be slightly unsorted, e.g. aaaaa will be before aaa, so we
|
||||||
|
// must sort here so our caller can do merge sort into actual terms
|
||||||
|
// when writing. Probably we should use CollectionUtil.timSort here?
|
||||||
Collections.sort(prefixes);
|
Collections.sort(prefixes);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Pushes the new term to the top of the stack, and writes new blocks. */
|
/** Pushes the new term to the top of the stack, and writes new blocks. */
|
||||||
private void pushTerm(BytesRef text) throws IOException {
|
private void pushTerm(BytesRef text) throws IOException {
|
||||||
int limit = Math.min(lastTerm.length(), text.length);
|
int limit = Math.min(lastTerm.length(), text.length);
|
||||||
|
//if (DEBUG) System.out.println("\nterm: " + text.utf8ToString());
|
||||||
|
|
||||||
// Find common prefix between last term and current term:
|
// Find common prefix between last term and current term:
|
||||||
int pos = 0;
|
int pos = 0;
|
||||||
|
@ -234,10 +239,10 @@ class AutoPrefixTermsWriter {
|
||||||
int prefixTopSize = pending.size() - prefixStarts[i];
|
int prefixTopSize = pending.size() - prefixStarts[i];
|
||||||
|
|
||||||
while (prefixTopSize >= minItemsInPrefix) {
|
while (prefixTopSize >= minItemsInPrefix) {
|
||||||
//if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix);
|
//if (DEBUG) System.out.println(" pop: i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix);
|
||||||
savePrefixes(i+1, prefixTopSize);
|
savePrefixes(i+1, prefixTopSize);
|
||||||
//prefixStarts[i] -= prefixTopSize;
|
//prefixStarts[i] -= prefixTopSize;
|
||||||
//System.out.println(" after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]);
|
//if (DEBUG) System.out.println(" after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]);
|
||||||
|
|
||||||
// For large floor blocks, it's possible we should now re-run on the new prefix terms we just created:
|
// For large floor blocks, it's possible we should now re-run on the new prefix terms we just created:
|
||||||
prefixTopSize = pending.size() - prefixStarts[i];
|
prefixTopSize = pending.size() - prefixStarts[i];
|
||||||
|
@ -267,27 +272,52 @@ class AutoPrefixTermsWriter {
|
||||||
|
|
||||||
assert count > 0;
|
assert count > 0;
|
||||||
|
|
||||||
//if (DEBUG2) {
|
/*
|
||||||
// BytesRef br = new BytesRef(lastTerm.bytes());
|
if (DEBUG2) {
|
||||||
// br.length = prefixLength;
|
BytesRef br = new BytesRef(lastTerm.bytes());
|
||||||
// System.out.println(" savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
|
br.length = prefixLength;
|
||||||
//}
|
//System.out.println(" savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
|
||||||
|
System.out.println(" savePrefixes: " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
int lastSuffixLeadLabel = -2;
|
int lastSuffixLeadLabel = -2;
|
||||||
|
|
||||||
int start = pending.size()-count;
|
int start = pending.size()-count;
|
||||||
assert start >=0;
|
assert start >=0;
|
||||||
|
|
||||||
|
// Special case empty-string suffix case: we are being asked to build prefix terms for all aaa* terms, but
|
||||||
|
// the exact term aaa is here, and we must skip it (it is handled "higher", under the aa* terms):
|
||||||
|
Object o = pending.get(start);
|
||||||
|
boolean skippedEmptyStringSuffix = false;
|
||||||
|
if (o instanceof byte[]) {
|
||||||
|
if (((byte[]) o).length == prefixLength) {
|
||||||
|
start++;
|
||||||
|
count--;
|
||||||
|
//if (DEBUG) System.out.println(" skip empty-string term suffix");
|
||||||
|
skippedEmptyStringSuffix = true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
PrefixTerm prefix = (PrefixTerm) o;
|
||||||
|
if (prefix.term.bytes.length == prefixLength) {
|
||||||
|
start++;
|
||||||
|
count--;
|
||||||
|
//if (DEBUG) System.out.println(" skip empty-string PT suffix");
|
||||||
|
skippedEmptyStringSuffix = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int end = pending.size();
|
int end = pending.size();
|
||||||
int nextBlockStart = start;
|
int nextBlockStart = start;
|
||||||
int nextFloorLeadLabel = -1;
|
int nextFloorLeadLabel = -1;
|
||||||
int prefixCount = 0;
|
int prefixCount = 0;
|
||||||
int pendingCount = 0;
|
|
||||||
PrefixTerm lastPTEntry = null;
|
PrefixTerm lastPTEntry = null;
|
||||||
|
|
||||||
for (int i=start; i<end; i++) {
|
for (int i=start; i<end; i++) {
|
||||||
|
|
||||||
byte[] termBytes;
|
byte[] termBytes;
|
||||||
Object o = pending.get(i);
|
o = pending.get(i);
|
||||||
PrefixTerm ptEntry;
|
PrefixTerm ptEntry;
|
||||||
if (o instanceof byte[]) {
|
if (o instanceof byte[]) {
|
||||||
ptEntry = null;
|
ptEntry = null;
|
||||||
|
@ -300,23 +330,15 @@ class AutoPrefixTermsWriter {
|
||||||
ptEntry = null;
|
ptEntry = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pendingCount++;
|
|
||||||
|
|
||||||
//if (DEBUG) System.out.println(" check term=" + brToString(new BytesRef(termBytes)));
|
//if (DEBUG) System.out.println(" check term=" + brToString(new BytesRef(termBytes)) + " o=" + o);
|
||||||
|
|
||||||
int suffixLeadLabel;
|
// We handled the empty-string suffix case up front:
|
||||||
|
assert termBytes.length > prefixLength;
|
||||||
|
|
||||||
if (termBytes.length == prefixLength) {
|
int suffixLeadLabel = termBytes[prefixLength] & 0xff;
|
||||||
// Suffix is 0, i.e. prefix 'foo' and term is
|
|
||||||
// 'foo' so the term has empty string suffix
|
|
||||||
// in this block
|
|
||||||
assert lastSuffixLeadLabel == -2;
|
|
||||||
suffixLeadLabel = -2;
|
|
||||||
} else {
|
|
||||||
suffixLeadLabel = termBytes[prefixLength] & 0xff;
|
|
||||||
}
|
|
||||||
|
|
||||||
// if (DEBUG) System.out.println(" i=" + i + " ent=" + ent + " suffixLeadLabel=" + suffixLeadLabel);
|
//if (DEBUG) System.out.println(" i=" + i + " o=" + o + " suffixLeadLabel=" + Integer.toHexString(suffixLeadLabel) + " pendingCount=" + (i - nextBlockStart) + " min=" + minItemsInPrefix);
|
||||||
|
|
||||||
if (suffixLeadLabel != lastSuffixLeadLabel) {
|
if (suffixLeadLabel != lastSuffixLeadLabel) {
|
||||||
// This is a boundary, a chance to make an auto-prefix term if we want:
|
// This is a boundary, a chance to make an auto-prefix term if we want:
|
||||||
|
@ -327,8 +349,9 @@ class AutoPrefixTermsWriter {
|
||||||
// than the lead start of the current entry:
|
// than the lead start of the current entry:
|
||||||
assert suffixLeadLabel > lastSuffixLeadLabel: "suffixLeadLabel=" + suffixLeadLabel + " vs lastSuffixLeadLabel=" + lastSuffixLeadLabel;
|
assert suffixLeadLabel > lastSuffixLeadLabel: "suffixLeadLabel=" + suffixLeadLabel + " vs lastSuffixLeadLabel=" + lastSuffixLeadLabel;
|
||||||
|
|
||||||
// NOTE: must check nextFloorLeadLabel in case minItemsInPrefix is 2 and prefix is 'a' and we've seen 'a' and then 'aa'
|
int itemsInBlock = i - nextBlockStart;
|
||||||
if (pendingCount >= minItemsInPrefix && end-nextBlockStart > maxItemsInPrefix && nextFloorLeadLabel != -1) {
|
|
||||||
|
if (itemsInBlock >= minItemsInPrefix && end-nextBlockStart > maxItemsInPrefix) {
|
||||||
// The count is too large for one block, so we must break it into "floor" blocks, where we record
|
// The count is too large for one block, so we must break it into "floor" blocks, where we record
|
||||||
// the leading label of the suffix of the first term in each floor block, so at search time we can
|
// the leading label of the suffix of the first term in each floor block, so at search time we can
|
||||||
// jump to the right floor block. We just use a naive greedy segmenter here: make a new floor
|
// jump to the right floor block. We just use a naive greedy segmenter here: make a new floor
|
||||||
|
@ -338,11 +361,10 @@ class AutoPrefixTermsWriter {
|
||||||
// If the last entry was another prefix term of the same length, then it represents a range of terms, so we must use its ending
|
// If the last entry was another prefix term of the same length, then it represents a range of terms, so we must use its ending
|
||||||
// prefix label as our ending label:
|
// prefix label as our ending label:
|
||||||
if (lastPTEntry != null) {
|
if (lastPTEntry != null) {
|
||||||
|
//if (DEBUG) System.out.println(" use last");
|
||||||
lastSuffixLeadLabel = lastPTEntry.floorLeadEnd;
|
lastSuffixLeadLabel = lastPTEntry.floorLeadEnd;
|
||||||
}
|
}
|
||||||
|
|
||||||
savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel);
|
savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel);
|
||||||
pendingCount = 0;
|
|
||||||
|
|
||||||
prefixCount++;
|
prefixCount++;
|
||||||
nextFloorLeadLabel = suffixLeadLabel;
|
nextFloorLeadLabel = suffixLeadLabel;
|
||||||
|
@ -356,6 +378,7 @@ class AutoPrefixTermsWriter {
|
||||||
|
|
||||||
lastSuffixLeadLabel = suffixLeadLabel;
|
lastSuffixLeadLabel = suffixLeadLabel;
|
||||||
}
|
}
|
||||||
|
|
||||||
lastPTEntry = ptEntry;
|
lastPTEntry = ptEntry;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -370,6 +393,12 @@ class AutoPrefixTermsWriter {
|
||||||
if (prefixLength > 0) {
|
if (prefixLength > 0) {
|
||||||
savePrefix(prefixLength, -2, 0xff);
|
savePrefix(prefixLength, -2, 0xff);
|
||||||
prefixCount++;
|
prefixCount++;
|
||||||
|
|
||||||
|
// If we skipped empty string suffix, e.g. term aaa for prefix aaa*, since we
|
||||||
|
// are now writing the full aaa* prefix term, we include it here:
|
||||||
|
if (skippedEmptyStringSuffix) {
|
||||||
|
count++;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// Don't add a prefix term for all terms in the index!
|
// Don't add a prefix term for all terms in the index!
|
||||||
}
|
}
|
||||||
|
@ -384,16 +413,8 @@ class AutoPrefixTermsWriter {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove slice from the top of the pending stack, that we just wrote:
|
// Remove slice from the top of the pending stack, that we just wrote:
|
||||||
int sizeToClear = count;
|
|
||||||
if (prefixCount > 1) {
|
pending.subList(pending.size()-count, pending.size()).clear();
|
||||||
Object o = pending.get(pending.size()-count);
|
|
||||||
if (o instanceof byte[] && ((byte[]) o).length == prefixLength) {
|
|
||||||
// If we were just asked to write all f* terms, but there were too many and so we made floor blocks, the exact term 'f' will remain
|
|
||||||
// as its own item, followed by floor block terms like f[a-m]*, f[n-z]*, so in this case we leave 3 (not 2) items on the pending stack:
|
|
||||||
sizeToClear--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pending.subList(pending.size()-sizeToClear, pending.size()).clear();
|
|
||||||
|
|
||||||
// Append prefix terms for each prefix, since these count like real terms that also need to be "rolled up":
|
// Append prefix terms for each prefix, since these count like real terms that also need to be "rolled up":
|
||||||
for(int i=0;i<prefixCount;i++) {
|
for(int i=0;i<prefixCount;i++) {
|
||||||
|
@ -410,6 +431,8 @@ class AutoPrefixTermsWriter {
|
||||||
|
|
||||||
PrefixTerm pt = new PrefixTerm(prefix, floorLeadStart, floorLeadEnd);
|
PrefixTerm pt = new PrefixTerm(prefix, floorLeadStart, floorLeadEnd);
|
||||||
//if (DEBUG2) System.out.println(" savePrefix: seg=" + segment + " " + pt + " count=" + count);
|
//if (DEBUG2) System.out.println(" savePrefix: seg=" + segment + " " + pt + " count=" + count);
|
||||||
|
//if (DEBUG) System.out.println(" savePrefix: " + pt);
|
||||||
|
|
||||||
prefixes.add(pt);
|
prefixes.add(pt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue