LUCENE-7620: UnifiedHighlighter: new LengthGoalBreakIterator wrapper

2017-01-07 23:10:48 -05:00 · 2017-01-07 23:10:48 -05:00 · ea49989524
parent ac14fc32e0
commit ea49989524
4 changed files with 295 additions and 1 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -218,6 +218,10 @@ Improvements
 * LUCENE-7614: Complex Phrase Query parser ignores double quotes around single token 
  prefix, wildcard, range queries (Mikhail Khludnev)
 * LUCENE-7620: Added LengthGoalBreakIterator, a wrapper around another BreakIterator that skips
  breaks that would create Passages that are too short.  Only for use with the UnifiedHighlighter
  (and probably PostingsHighlighter).  (David Smiley)
 Optimizations
 * LUCENE-7568: Optimize merging when index sorting is used but the
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java
@ -0,0 +1,185 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.search.uhighlight;
 import java.text.BreakIterator;
 import java.text.CharacterIterator;
 /**
 * Wraps another {@link BreakIterator} to skip past breaks that would result in passages that are too
 * short.  It's still possible to get a short passage but only at the very end of the input text.
 * <p>
 * Important: This is not a general purpose {@link BreakIterator}; it's only designed to work in a way
 * compatible with the {@link UnifiedHighlighter}.  Some assumptions are checked with Java assertions.
 *
 * @lucene.experimental
 */
 public class LengthGoalBreakIterator extends BreakIterator {

  /** The wrapped iterator that supplies the candidate break positions. */
  private final BreakIterator baseIter;
  /** Desired distance, in characters, from the current position to the next returned break. */
  private final int lengthGoal;
  private final boolean isMinimumLength; // if false then is "closest to" length

  /** Breaks will be at least {@code minLength} apart (to the extent possible). */
  public static LengthGoalBreakIterator createMinLength(BreakIterator baseIter, int minLength) {
    return new LengthGoalBreakIterator(baseIter, minLength, true);
  }

  /** Breaks will be on average {@code targetLength} apart; the closest break to this target (before or after)
   * is chosen. */
  public static LengthGoalBreakIterator createClosestToLength(BreakIterator baseIter, int targetLength) {
    return new LengthGoalBreakIterator(baseIter, targetLength, false);
  }

  private LengthGoalBreakIterator(BreakIterator baseIter, int lengthGoal, boolean isMinimumLength) {
    this.baseIter = baseIter;
    this.lengthGoal = lengthGoal;
    this.isMinimumLength = isMinimumLength;
  }

  // note: the only methods that will get called are setText(txt), getText(),
  // getSummaryPassagesNoHighlight: current(), first(), next()
  // highlightOffsetsEnums: preceding(int), and following(int)
  //   Nonetheless we make some attempt to implement the rest; mostly delegating.

  /** Describes the goal mode ("minLen" or "targetLen"), the goal value and the wrapped iterator. */
  @Override
  public String toString() {
    String goalDesc = isMinimumLength ? "minLen" : "targetLen";
    return getClass().getSimpleName() + "{" + goalDesc + "=" + lengthGoal + ", baseIter=" + baseIter + "}";
  }

  /** Returns a new instance wrapping a clone of the base iterator, with the same goal and mode. */
  @Override
  public Object clone() {
    return new LengthGoalBreakIterator((BreakIterator) baseIter.clone(), lengthGoal, isMinimumLength);
  }

  /** Delegates to the wrapped iterator. */
  @Override
  public CharacterIterator getText() {
    return baseIter.getText();
  }

  /** Delegates to the wrapped iterator. */
  @Override
  public void setText(String newText) {
    baseIter.setText(newText);
  }

  /** Delegates to the wrapped iterator. */
  @Override
  public void setText(CharacterIterator newText) {
    baseIter.setText(newText);
  }

  /** Delegates to the wrapped iterator. */
  @Override
  public int current() {
    return baseIter.current();
  }

  /** Delegates to the wrapped iterator. */
  @Override
  public int first() {
    return baseIter.first();
  }

  /** Delegates to the wrapped iterator. */
  @Override
  public int last() {
    return baseIter.last();
  }

  /** Not supported: trips an assertion when assertions are enabled, otherwise delegates without the length goal. */
  @Override
  public int next(int n) {
    assert false : "Not supported";
    return baseIter.next(n); // probably wrong
  }

  // called by getSummaryPassagesNoHighlight to generate default summary.
  /** Equivalent to {@code following(current())}, so the length goal is applied from the current position. */
  @Override
  public int next() {
    return following(current());
  }

  /** Not supported: trips an assertion when assertions are enabled, otherwise delegates. */
  @Override
  public int previous() {
    assert false : "Not supported";
    return baseIter.previous();
  }

  // called while the current position is the start of a new passage; find end of passage
  /**
   * Finds the break that ends the passage starting at {@link #current()}, honoring the length goal.
   *
   * @param followingIdx offset the returned break must lie after; expected to be at or after
   *                     {@link #current()} (anything earlier is unsupported and trips an assertion)
   * @return the chosen break, or the last break when the goal reaches the end of the text
   */
  @Override
  public int following(int followingIdx) {
    final int startIdx = current();
    if (followingIdx < startIdx) {
      assert false : "Not supported";
      return baseIter.following(followingIdx);
    }
    // Computed as a long: with a very large (or very negative) lengthGoal the int sum, or the "- 1" below,
    // would wrap around and send us down the wrong branch.
    final long goalIdx = (long) startIdx + lengthGoal;
    // When followingIdx >= goalIdx - 1 we can simply delegate: the break found is > followingIdx and
    // therefore already >= the goal.
    if (followingIdx >= goalIdx - 1) {
      return baseIter.following(followingIdx);
    }
    // If target exceeds the text length, return the last index.
    if (goalIdx >= getText().getEndIndex()) {
      return baseIter.last();
    }
    // Safe to narrow: followingIdx + 1 < goalIdx < end index, both of which are ints.
    final int targetIdx = (int) goalIdx;

    // Find closest break >= the target
    final int afterIdx = baseIter.following(targetIdx - 1);
    if (afterIdx == DONE) { // we're at the end; can this happen?
      return current();
    }
    if (afterIdx == targetIdx) { // right on the money
      return afterIdx;
    }
    if (isMinimumLength) { // thus never undershoot
      return afterIdx;
    }

    // note: it is a shame that we invoke preceding() *in addition to* following(); BI's are sometimes expensive.

    // Find closest break < target
    final int beforeIdx = baseIter.preceding(targetIdx); // or could do baseIter.previous() but we hope the BI implements preceding()
    if (beforeIdx <= followingIdx) { // too far back
      return moveToBreak(afterIdx);
    }

    // Ties go to the earlier break.
    if (targetIdx - beforeIdx <= afterIdx - targetIdx) {
      return beforeIdx; // baseIter is already positioned here by preceding()
    }
    return moveToBreak(afterIdx);
  }

  /**
   * Repositions the base iterator on {@code idx} and returns {@code idx}.
   * Precondition: {@code idx} is a known break.
   */
  private int moveToBreak(int idx) {
    // baseIter.isBoundary(idx) would also move the position as a side-effect (not obvious!), but
    // following() is used here instead.
    // TODO fix: Would prefer to do "- 1" instead of "- 2" but CustomSeparatorBreakIterator has a bug.
    int current = baseIter.following(idx - 2);
    assert current == idx : "following() didn't move us to the expected index.";
    return idx;
  }

  // called at start of new Passage given first word start offset
  /** Delegates to the wrapped iterator; the length goal only affects where a passage ends. */
  @Override
  public int preceding(int offset) {
    return baseIter.preceding(offset); // no change needed
  }

  /** Not supported: trips an assertion when assertions are enabled, otherwise delegates. */
  @Override
  public boolean isBoundary(int offset) {
    assert false : "Not supported";
    return baseIter.isBoundary(offset);
  }
 }
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
@ -171,6 +171,7 @@ public class Passage {
  /**
   * Stores the given end offset; when assertions are enabled it must not be smaller than the start offset.
   * @lucene.internal
   */
  public void setEndOffset(int endOffset) {
    // sanity check happens before the field is touched
    assert endOffset >= startOffset;
    this.endOffset = endOffset;
  }
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java
@ -0,0 +1,104 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.search.uhighlight;
 import java.io.IOException;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.QueryBuilder;
 public class LengthGoalBreakIteratorTest extends LuceneTestCase {
  private static final String FIELD = "body";

  // The iterator is exercised through the UnifiedHighlighter rather than on its own: it is not a
  //  general purpose BreakIterator, so testing it in isolation would give less confidence.
  private final Analyzer analyzer =
      new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);//whitespace, punctuation, lowercase

  // Passages are split on '.'; each test varies only the length goal.
  //                      0         1
  //                      01234567890123456789
  final String content = "Aa bb. Cc dd. Ee ff";

  public void testTargetLen() throws IOException {
    // here the goal is a target length: the break closest to it is chosen

    // match on the very first word
    Query firstWord = query("aa");
    assertEquals("almost two sent",
        "<b>Aa</b> bb.", highlightClosestToLen(content, firstWord, 9));
    assertEquals("barely two sent",
        "<b>Aa</b> bb. Cc dd.", highlightClosestToLen(content, firstWord, 10));
    int longGoalFromStart = 17 + random().nextInt(20);
    assertEquals("long goal",
        "<b>Aa</b> bb. Cc dd. Ee ff", highlightClosestToLen(content, firstWord, longGoalFromStart));

    // match on a word that is not at the start of its passage
    Query innerWord = query("dd");
    int shortGoal = random().nextInt(5);
    assertEquals("short goal",
        " Cc <b>dd</b>.", highlightClosestToLen(content, innerWord, shortGoal));
    assertEquals("almost two sent",
        " Cc <b>dd</b>.", highlightClosestToLen(content, innerWord, 10));
    assertEquals("barely two sent",
        " Cc <b>dd</b>. Ee ff", highlightClosestToLen(content, innerWord, 11));
    int longGoalFromMiddle = 12 + random().nextInt(20);
    assertEquals("long goal",
        " Cc <b>dd</b>. Ee ff", highlightClosestToLen(content, innerWord, longGoalFromMiddle));
  }

  public void testMinLen() throws IOException {
    // minimum-length mode is simpler than target-length mode, so a couple of cases suffice
    Query innerWord = query("dd");
    assertEquals("almost two sent",
        " Cc <b>dd</b>.", highlightMinLen(content, innerWord, 6));
    assertEquals("barely two sent",
        " Cc <b>dd</b>. Ee ff", highlightMinLen(content, innerWord, 7));
  }

  public void testDefaultSummaryTargetLen() throws IOException {
    // "zz" does not occur in the content
    Query noMatch = query("zz");
    int belowTen = random().nextInt(10); // goal in [0, 10)
    assertEquals("Aa bb.",
        highlightClosestToLen(content, noMatch, belowTen));
    assertEquals("Aa bb. Cc dd.",
        highlightClosestToLen(content, noMatch, 10 + 6)); // cusp of adding 3rd sentence
    int seventeenOrMore = 17 + random().nextInt(20); // goal of at least 17
    assertEquals("Aa bb. Cc dd. Ee ff",
        highlightClosestToLen(content, noMatch, seventeenOrMore));
  }

  /** Builds a boolean query on {@link #FIELD} from the given text, using the test analyzer. */
  private Query query(String qStr) {
    QueryBuilder builder = new QueryBuilder(analyzer);
    return builder.createBooleanQuery(FIELD, qStr);
  }

  /** Highlights one passage using a "closest to length" goal over '.'-separated breaks. */
  private String highlightClosestToLen(String text, Query query, int lengthGoal) throws IOException {
    return highlight(text, query, lengthGoal, false);
  }

  /** Highlights one passage using a "minimum length" goal over '.'-separated breaks. */
  private String highlightMinLen(String text, Query query, int lengthGoal) throws IOException {
    return highlight(text, query, lengthGoal, true);
  }

  /** Shared driver: the two modes differ only in which factory wraps the '.' break iterator. */
  private String highlight(String text, Query query, int lengthGoal, boolean minLength) throws IOException {
    UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
    highlighter.setBreakIterator(() -> {
      CustomSeparatorBreakIterator dotBreaks = new CustomSeparatorBreakIterator('.');
      return minLength
          ? LengthGoalBreakIterator.createMinLength(dotBreaks, lengthGoal)
          : LengthGoalBreakIterator.createClosestToLength(dotBreaks, lengthGoal);
    });
    return highlighter.highlightWithoutSearcher(FIELD, query, text, 1).toString();
  }
 }