LUCENE-6392 Highlighter: add maxStartOffset (and other memory-saving efficiencies) to TokenStreamFromTermVector.

Delaying TokenSources changes for LUCENE-6445.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675504 13f79535-47bb-0310-9956-ffa450edef68
David Wayne Smiley 2015-04-22 21:35:16 +00:00
parent af6fe4d174
commit d0a159e5d7
3 changed files with 52 additions and 21 deletions
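
For orientation, here is a minimal usage sketch of the new constructor signature. Only the (Terms, int) constructor below comes from this commit; the reader, docId, and "body" field name are hypothetical:

    // Hedged sketch: obtain a doc's term vector and uninvert it into a TokenStream for highlighting.
    Terms vector = reader.getTermVector(docId, "body");   // hypothetical reader/doc/field
    if (vector != null && (vector.hasPositions() || vector.hasOffsets())) {
      // Only tokens starting within the first 10,000 chars are materialized; -1 disables the limit.
      TokenStream stream = new TokenStreamFromTermVector(vector, 10000);
      stream.reset();
      while (stream.incrementToken()) {
        // feed each token to the highlighter's fragmenter/scorer
      }
      stream.end();
      stream.close();
    }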

CHANGES.txt

@@ -86,6 +86,9 @@ Optimizations
 * LUCENE-6421: Defer reading of positions in MultiPhraseQuery until
   they are needed. (Robert Muir)
 
+* LUCENE-6392: Highlighter: reduce memory of tokens in
+  TokenStreamFromTermVector, and add maxStartOffset limit. (David Smiley)
+
 Bug Fixes
 
 * LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.

TokenSources.java

@@ -127,7 +127,7 @@ public class TokenSources {
       // highlighters require offsets, so we insist here.
     }
-    return new TokenStreamFromTermVector(tpv);
+    return new TokenStreamFromTermVector(tpv, -1); // TODO propagate maxStartOffset; see LUCENE-6445
   }
 
   /**

TokenStreamFromTermVector.java

@@ -32,6 +32,7 @@ import org.apache.lucene.util.AttributeFactory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefArray;
 import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.UnicodeUtil;
@@ -52,8 +53,6 @@ import org.apache.lucene.util.UnicodeUtil;
  */
 public final class TokenStreamFromTermVector extends TokenStream {
 
-  //TODO add a maxStartOffset filter, which highlighters will find handy
-
   //This attribute factory uses less memory when captureState() is called.
   public static final AttributeFactory ATTRIBUTE_FACTORY =
       AttributeFactory.getStaticImplementation(
@@ -65,9 +64,14 @@ public final class TokenStreamFromTermVector extends TokenStream {
   private final PositionIncrementAttribute positionIncrementAttribute;
 
+  private final int maxStartOffset;
+
   private OffsetAttribute offsetAttribute;//maybe null
 
   private PayloadAttribute payloadAttribute;//maybe null
 
+  private CharsRefBuilder termCharsBuilder;//term data here
+
   private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
   private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
@@ -78,13 +82,16 @@ public final class TokenStreamFromTermVector extends TokenStream {
   private boolean initialized = false;//lazy
 
   /**
-   * Constructor.
-   *
+   * Constructor. The uninversion doesn't happen here; it's delayed till the first call to
+   * {@link #incrementToken}.
+   *
    * @param vector Terms that contains the data for
    *               creating the TokenStream. Must have positions and/or offsets.
+   * @param maxStartOffset if a token's start offset exceeds this then the token is not added. -1 disables the limit.
    */
-  public TokenStreamFromTermVector(Terms vector) throws IOException {
+  public TokenStreamFromTermVector(Terms vector, int maxStartOffset) throws IOException {
     super(ATTRIBUTE_FACTORY);
+    this.maxStartOffset = maxStartOffset < 0 ? Integer.MAX_VALUE : maxStartOffset;
     assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
     if (!vector.hasPositions() && !vector.hasOffsets()) {
       throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
@@ -106,15 +113,22 @@ public final class TokenStreamFromTermVector extends TokenStream {
   //We delay initialization because we can see which attributes the consumer wants, particularly payloads
   private void init() throws IOException {
     assert !initialized;
+    short dpEnumFlags = PostingsEnum.POSITIONS;
     if (vector.hasOffsets()) {
+      dpEnumFlags |= PostingsEnum.OFFSETS;
       offsetAttribute = addAttribute(OffsetAttribute.class);
     }
 
     if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
+      dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);//must ask for offsets too
       payloadAttribute = getAttribute(PayloadAttribute.class);
       payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
       spareBytesRefBuilder = new BytesRefBuilder();
     }
 
+    // We put term data here
+    termCharsBuilder = new CharsRefBuilder();
+    termCharsBuilder.grow((int) (vector.size() * 7));//7 is over-estimate of average term len
+
     // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
     TokenLL[] positionedTokens = initTokensArray();
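
The memory win in this step: every term's characters are appended once into the shared termCharsBuilder, so each token can later carry just an (offset, length) pair instead of owning its own char[] (see the TokenLL changes below). A self-contained sketch of that packing technique, with hypothetical names rather than the committed code:

    // Pack many small strings into one shared buffer; keep (offset, length) handles per entry.
    CharsRefBuilder shared = new CharsRefBuilder();

    char[] word = "lucene".toCharArray();
    int off = shared.length();              // slice start for this term
    shared.append(word, 0, word.length);    // the chars now live in the shared buffer
    short len = (short) word.length;

    // Reconstituting the term later needs no per-term char[] allocation:
    String term = new String(shared.chars(), off, len);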
@@ -124,14 +138,17 @@ public final class TokenStreamFromTermVector extends TokenStream {
     final TermsEnum termsEnum = vector.iterator();
     BytesRef termBytesRef;
     PostingsEnum dpEnum = null;
+    CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
     //int sumFreq = 0;
     while ((termBytesRef = termsEnum.next()) != null) {
       //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
       // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
-      final char[] termChars = new char[termBytesRef.length];
-      final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, termChars);
+      tempCharsRefBuilder.grow(termBytesRef.length);
+      final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
+      final int termCharsOff = termCharsBuilder.length();
+      termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
 
-      dpEnum = termsEnum.postings(null, dpEnum, PostingsEnum.POSITIONS);
+      dpEnum = termsEnum.postings(null, dpEnum, dpEnumFlags);
       assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
       dpEnum.nextDoc();
       final int freq = dpEnum.freq();
@@ -139,11 +156,14 @@ public final class TokenStreamFromTermVector extends TokenStream {
       for (int j = 0; j < freq; j++) {
         int pos = dpEnum.nextPosition();
         TokenLL token = new TokenLL();
-        token.termChars = termChars;
-        token.termCharsLen = termCharsLen;
+        token.termCharsOff = termCharsOff;
+        token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
         if (offsetAttribute != null) {
           token.startOffset = dpEnum.startOffset();
-          token.endOffset = dpEnum.endOffset();
+          if (token.startOffset > maxStartOffset) {
+            continue;//filter this token out; exceeds threshold
+          }
+          token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
           if (pos == -1) {
             pos = token.startOffset >> 3;//divide by 8
           }
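
Two details above are easy to miss: a token whose start offset exceeds maxStartOffset is filtered out during uninversion and never linked into the position array, and when the vector stores offsets but not positions, nextPosition() yields -1 and a position is synthesized as startOffset >> 3, i.e. one slot per ~8 characters of text. A tiny worked illustration with assumed values:

    // Assumed values, for illustration only.
    int maxStartOffset = 100;
    int startOffsetA = 120;   // 120 > 100: dropped during uninversion, never reaches the consumer
    int startOffsetB = 72;    // kept; with no real position data, slot = 72 >> 3 = 9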
@@ -216,8 +236,8 @@ public final class TokenStreamFromTermVector extends TokenStream {
   }
 
   private TokenLL[] initTokensArray() throws IOException {
-    // Estimate the number of position slots we need. We use some estimation factors taken from Wikipedia
-    // that reduce the likelihood of needing to expand the array.
+    // Estimate the number of position slots we need from term stats. We use some estimation factors taken from
+    // Wikipedia that reduce the likelihood of needing to expand the array.
     int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
     if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
       int size = (int) vector.size();
@@ -227,7 +247,12 @@ public final class TokenStreamFromTermVector extends TokenStream {
       sumTotalTermFreq = (int)(size * 2.4);
     }
     final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
-    return new TokenLL[originalPositionEstimate];
+
+    // This estimate is based on maxStartOffset. Err on the side of this being larger than needed.
+    final int offsetLimitPositionEstimate = (int) (maxStartOffset / 5.0);
+
+    // Take the smaller of the two estimates, but no smaller than 64
+    return new TokenLL[Math.max(64, Math.min(originalPositionEstimate, offsetLimitPositionEstimate))];
   }
 
   @Override
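
The sizing takes the smaller of two independent estimates, floored at 64: roughly 1.5 slots per term occurrence from the term stats, and roughly one slot per 5 characters implied by maxStartOffset. A quick worked example with hypothetical numbers:

    // Hypothetical doc: 1,000 term occurrences, maxStartOffset = 2,000 chars.
    int originalPositionEstimate = (int) (1000 * 1.5);      // 1500
    int offsetLimitPositionEstimate = (int) (2000 / 5.0);   // 400 (~5 chars per position)
    int arrayLen = Math.max(64, Math.min(1500, 400));       // 400 slots instead of 1500
    // With the limit disabled, maxStartOffset is Integer.MAX_VALUE, so the term-stats estimate wins.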
@@ -247,10 +272,10 @@ public final class TokenStreamFromTermVector extends TokenStream {
       return false;
     }
     clearAttributes();
-    termAttribute.copyBuffer(incrementToken.termChars, 0, incrementToken.termCharsLen);
+    termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
     positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement);
     if (offsetAttribute != null) {
-      offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.endOffset);
+      offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
     }
     if (payloadAttribute != null) {
       if (incrementToken.payloadIndex == -1) {
@@ -263,11 +288,14 @@ public final class TokenStreamFromTermVector extends TokenStream {
   }
 
   private static class TokenLL {
-    char[] termChars;
-    int termCharsLen;
+    // This class should weigh 32 bytes, including object header
+
+    int termCharsOff; // see termCharsBuilder
+    short termCharsLen;
+
     int positionIncrement;
     int startOffset;
-    int endOffset;
+    short endOffsetInc; // add to startOffset to get endOffset
     int payloadIndex;
 
     TokenLL next;
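
The 32-byte comment is a rough target; the true footprint depends on the JVM's header size and field padding, which this commit does not control. A back-of-envelope accounting under an assumed 64-bit HotSpot layout with compressed oops:

    // Assumption: 12-byte object header, 4-byte compressed references.
    //   header                     12 bytes
    //   int termCharsOff            4
    //   short termCharsLen          2
    //   short endOffsetInc          2
    //   int positionIncrement       4
    //   int startOffset             4
    //   int payloadIndex            4
    //   TokenLL next                4
    //   total ~36, padded to 40 on this layout; either way, far smaller than the old
    //   int endOffset plus a per-token char[] (its own header plus 2 bytes per character).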
@@ -297,7 +325,7 @@ public final class TokenStreamFromTermVector extends TokenStream {
     int compareOffsets(TokenLL tokenB) {
       int cmp = Integer.compare(this.startOffset, tokenB.startOffset);
       if (cmp == 0) {
-        cmp = Integer.compare(this.endOffset, tokenB.endOffset);
+        cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
       }
       return cmp;
     }