LUCENE-6392 Highlighter: add maxStartOffset (and other memory-saving efficiencies) to TokenStreamFromTermVector.

Delaying TokenSources changes for LUCENE-6445.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675504 13f79535-47bb-0310-9956-ffa450edef68
David Wayne Smiley 2015-04-22 21:35:16 +00:00
parent af6fe4d174
commit d0a159e5d7
3 changed files with 52 additions and 21 deletions
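
For orientation, here is a minimal usage sketch of the new constructor signature. Only the (Terms, int) constructor below comes from this commit; the reader, docId, and "body" field name are hypothetical:

    // Hedged sketch: obtain a doc's term vector and uninvert it into a TokenStream for highlighting.
    Terms vector = reader.getTermVector(docId, "body");   // hypothetical reader/doc/field
    if (vector != null && (vector.hasPositions() || vector.hasOffsets())) {
      // Only tokens starting within the first 10,000 chars are materialized; -1 disables the limit.
      TokenStream stream = new TokenStreamFromTermVector(vector, 10000);
      stream.reset();
      while (stream.incrementToken()) {
        // feed each token to the highlighter's fragmenter/scorer
      }
      stream.end();
      stream.close();
    }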

CHANGES.txt

@@ -86,6 +86,9 @@ Optimizations
 * LUCENE-6421: Defer reading of positions in MultiPhraseQuery until
   they are needed. (Robert Muir)
 
+* LUCENE-6392: Highlighter: reduce memory of tokens in
+  TokenStreamFromTermVector, and add maxStartOffset limit. (David Smiley)
+
 Bug Fixes
 
 * LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.

TokenSources.java

@@ -127,7 +127,7 @@ public class TokenSources {
       // highlighters require offsets, so we insist here.
     }
-    return new TokenStreamFromTermVector(tpv);
+    return new TokenStreamFromTermVector(tpv, -1); // TODO propagate maxStartOffset; see LUCENE-6445
   }
 
   /**

TokenStreamFromTermVector.java

@@ -32,6 +32,7 @@ import org.apache.lucene.util.AttributeFactory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefArray;
 import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.UnicodeUtil;
@@ -52,8 +53,6 @@ import org.apache.lucene.util.UnicodeUtil;
  */
 public final class TokenStreamFromTermVector extends TokenStream {
 
-  //TODO add a maxStartOffset filter, which highlighters will find handy
-
   //This attribute factory uses less memory when captureState() is called.
   public static final AttributeFactory ATTRIBUTE_FACTORY =
       AttributeFactory.getStaticImplementation(
@@ -65,9 +64,14 @@ public final class TokenStreamFromTermVector extends TokenStream {
   private final PositionIncrementAttribute positionIncrementAttribute;
 
+  private final int maxStartOffset;
+
   private OffsetAttribute offsetAttribute;//maybe null
 
   private PayloadAttribute payloadAttribute;//maybe null
 
+  private CharsRefBuilder termCharsBuilder;//term data here
+
   private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
   private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
@@ -78,13 +82,16 @@ public final class TokenStreamFromTermVector extends TokenStream {
   private boolean initialized = false;//lazy
 
   /**
-   * Constructor.
-   *
+   * Constructor. The uninversion doesn't happen here; it's delayed till the first call to
+   * {@link #incrementToken}.
+   *
    * @param vector Terms that contains the data for
    *               creating the TokenStream. Must have positions and/or offsets.
+   * @param maxStartOffset if a token's start offset exceeds this then the token is not added. -1 disables the limit.
    */
-  public TokenStreamFromTermVector(Terms vector) throws IOException {
+  public TokenStreamFromTermVector(Terms vector, int maxStartOffset) throws IOException {
     super(ATTRIBUTE_FACTORY);
+    this.maxStartOffset = maxStartOffset < 0 ? Integer.MAX_VALUE : maxStartOffset;
     assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
     if (!vector.hasPositions() && !vector.hasOffsets()) {
       throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
@@ -106,15 +113,22 @@ public final class TokenStreamFromTermVector extends TokenStream {
   //We delay initialization because we can see which attributes the consumer wants, particularly payloads
   private void init() throws IOException {
     assert !initialized;
+    short dpEnumFlags = PostingsEnum.POSITIONS;
     if (vector.hasOffsets()) {
+      dpEnumFlags |= PostingsEnum.OFFSETS;
       offsetAttribute = addAttribute(OffsetAttribute.class);
     }
 
     if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
+      dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);//must ask for offsets too
       payloadAttribute = getAttribute(PayloadAttribute.class);
       payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
       spareBytesRefBuilder = new BytesRefBuilder();
     }
 
+    // We put term data here
+    termCharsBuilder = new CharsRefBuilder();
+    termCharsBuilder.grow((int) (vector.size() * 7));//7 is over-estimate of average term len
+
     // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
     TokenLL[] positionedTokens = initTokensArray();
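
The memory win in this step: every term's characters are appended once into the shared termCharsBuilder, so each token can later carry just an (offset, length) pair instead of owning its own char[] (see the TokenLL changes below). A self-contained sketch of that packing technique, with hypothetical names rather than the committed code:

    // Pack many small strings into one shared buffer; keep (offset, length) handles per entry.
    CharsRefBuilder shared = new CharsRefBuilder();

    char[] word = "lucene".toCharArray();
    int off = shared.length();              // slice start for this term
    shared.append(word, 0, word.length);    // the chars now live in the shared buffer
    short len = (short) word.length;

    // Reconstituting the term later needs no per-term char[] allocation:
    String term = new String(shared.chars(), off, len);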
@@ -124,14 +138,17 @@ public final class TokenStreamFromTermVector extends TokenStream {
     final TermsEnum termsEnum = vector.iterator();
     BytesRef termBytesRef;
     PostingsEnum dpEnum = null;
+    CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
     //int sumFreq = 0;
     while ((termBytesRef = termsEnum.next()) != null) {
       //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
       // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
-      final char[] termChars = new char[termBytesRef.length];
-      final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, termChars);
+      tempCharsRefBuilder.grow(termBytesRef.length);
+      final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
+      final int termCharsOff = termCharsBuilder.length();
+      termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
 
-      dpEnum = termsEnum.postings(null, dpEnum, PostingsEnum.POSITIONS);
+      dpEnum = termsEnum.postings(null, dpEnum, dpEnumFlags);
       assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
       dpEnum.nextDoc();
       final int freq = dpEnum.freq();
@@ -139,11 +156,14 @@ public final class TokenStreamFromTermVector extends TokenStream {
       for (int j = 0; j < freq; j++) {
         int pos = dpEnum.nextPosition();
         TokenLL token = new TokenLL();
-        token.termChars = termChars;
-        token.termCharsLen = termCharsLen;
+        token.termCharsOff = termCharsOff;
+        token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
         if (offsetAttribute != null) {
           token.startOffset = dpEnum.startOffset();
-          token.endOffset = dpEnum.endOffset();
+          if (token.startOffset > maxStartOffset) {
+            continue;//filter this token out; exceeds threshold
+          }
+          token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
           if (pos == -1) {
             pos = token.startOffset >> 3;//divide by 8
           }
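
Two details above are easy to miss: a token whose start offset exceeds maxStartOffset is filtered out during uninversion and never linked into the position array, and when the vector stores offsets but not positions, nextPosition() yields -1 and a position is synthesized as startOffset >> 3, i.e. one slot per ~8 characters of text. A tiny worked illustration with assumed values:

    // Assumed values, for illustration only.
    int maxStartOffset = 100;
    int startOffsetA = 120;   // 120 > 100: dropped during uninversion, never reaches the consumer
    int startOffsetB = 72;    // kept; with no real position data, slot = 72 >> 3 = 9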
@@ -216,8 +236,8 @@ public final class TokenStreamFromTermVector extends TokenStream {
   }
 
   private TokenLL[] initTokensArray() throws IOException {
-    // Estimate the number of position slots we need. We use some estimation factors taken from Wikipedia
-    // that reduce the likelihood of needing to expand the array.
+    // Estimate the number of position slots we need from term stats. We use some estimation factors taken from
+    // Wikipedia that reduce the likelihood of needing to expand the array.
     int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
     if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
       int size = (int) vector.size();
@@ -227,7 +247,12 @@ public final class TokenStreamFromTermVector extends TokenStream {
       sumTotalTermFreq = (int)(size * 2.4);
     }
     final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
-    return new TokenLL[originalPositionEstimate];
+
+    // This estimate is based on maxStartOffset. Err on the side of this being larger than needed.
+    final int offsetLimitPositionEstimate = (int) (maxStartOffset / 5.0);
+
+    // Take the smaller of the two estimates, but no smaller than 64
+    return new TokenLL[Math.max(64, Math.min(originalPositionEstimate, offsetLimitPositionEstimate))];
   }
 
   @Override
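
The sizing takes the smaller of two independent estimates, floored at 64: roughly 1.5 slots per term occurrence from the term stats, and roughly one slot per 5 characters implied by maxStartOffset. A quick worked example with hypothetical numbers:

    // Hypothetical doc: 1,000 term occurrences, maxStartOffset = 2,000 chars.
    int originalPositionEstimate = (int) (1000 * 1.5);      // 1500
    int offsetLimitPositionEstimate = (int) (2000 / 5.0);   // 400 (~5 chars per position)
    int arrayLen = Math.max(64, Math.min(1500, 400));       // 400 slots instead of 1500
    // With the limit disabled, maxStartOffset is Integer.MAX_VALUE, so the term-stats estimate wins.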
@@ -247,10 +272,10 @@ public final class TokenStreamFromTermVector extends TokenStream {
       return false;
     }
     clearAttributes();
-    termAttribute.copyBuffer(incrementToken.termChars, 0, incrementToken.termCharsLen);
+    termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
     positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement);
     if (offsetAttribute != null) {
-      offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.endOffset);
+      offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
     }
     if (payloadAttribute != null) {
       if (incrementToken.payloadIndex == -1) {
@@ -263,11 +288,14 @@ public final class TokenStreamFromTermVector extends TokenStream {
   }
 
   private static class TokenLL {
-    char[] termChars;
-    int termCharsLen;
+    // This class should weigh 32 bytes, including object header
+
+    int termCharsOff; // see termCharsBuilder
+    short termCharsLen;
+
     int positionIncrement;
     int startOffset;
-    int endOffset;
+    short endOffsetInc; // add to startOffset to get endOffset
     int payloadIndex;
 
     TokenLL next;
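
The 32-byte comment is a rough target; the true footprint depends on the JVM's header size and field padding, which this commit does not control. A back-of-envelope accounting under an assumed 64-bit HotSpot layout with compressed oops:

    // Assumption: 12-byte object header, 4-byte compressed references.
    //   header                     12 bytes
    //   int termCharsOff            4
    //   short termCharsLen          2
    //   short endOffsetInc          2
    //   int positionIncrement       4
    //   int startOffset             4
    //   int payloadIndex            4
    //   TokenLL next                4
    //   total ~36, padded to 40 on this layout; either way, far smaller than the old
    //   int endOffset plus a per-token char[] (its own header plus 2 bytes per character).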
@@ -297,7 +325,7 @@ public final class TokenStreamFromTermVector extends TokenStream {
     int compareOffsets(TokenLL tokenB) {
       int cmp = Integer.compare(this.startOffset, tokenB.startOffset);
       if (cmp == 0) {
-        cmp = Integer.compare(this.endOffset, tokenB.endOffset);
+        cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
       }
       return cmp;
     }