LUCENE-5133: allow highlight to Object for AnalyzingInfixSuggester for advanced use cases

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1522667 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-09-12 17:14:27 +00:00
parent d06ba3caaf
commit 534a9f3080
4 changed files with 179 additions and 34 deletions

View File

@ -53,6 +53,11 @@ New Features
for advanced use cases where String is too restrictive (Luca for advanced use cases where String is too restrictive (Luca
Cavanna, Robert Muir, Mike McCandless) Cavanna, Robert Muir, Mike McCandless)
* LUCENE-5133: Changed AnalyzingInfixSuggester.highlight to return
Object instead of String, to allow for advanced use cases where
String is too restrictive (Robert Muir, Shai Erera, Mike
McCandless)
Changes in backwards compatibility policy Changes in backwards compatibility policy
* LUCENE-5204: Directory doesn't have default implementations for * LUCENE-5204: Directory doesn't have default implementations for

View File

@ -41,6 +41,10 @@ public abstract class Lookup {
/** the key's text */ /** the key's text */
public final CharSequence key; public final CharSequence key;
/** Expert: custom Object to hold the result of a
* highlighted suggestion. */
public final Object highlightKey;
/** the key's weight */ /** the key's weight */
public final long value; public final long value;
@ -59,6 +63,17 @@ public abstract class Lookup {
*/ */
public LookupResult(CharSequence key, long value, BytesRef payload) { public LookupResult(CharSequence key, long value, BytesRef payload) {
this.key = key; this.key = key;
this.highlightKey = null;
this.value = value;
this.payload = payload;
}
/**
* Create a new result from a key+highlightKey+weight+payload tuple.
*/
public LookupResult(CharSequence key, Object highlightKey, long value, BytesRef payload) {
this.key = key;
this.highlightKey = highlightKey;
this.value = value; this.value = value;
this.payload = payload; this.payload = payload;
} }

View File

@ -67,6 +67,7 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.spell.TermFreqPayloadIterator; import org.apache.lucene.search.spell.TermFreqPayloadIterator;
import org.apache.lucene.search.suggest.Lookup.LookupResult; // javadocs
import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FSDirectory;
@ -98,8 +99,10 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
/** Field name used for the indexed text. */ /** Field name used for the indexed text. */
protected final static String TEXT_FIELD_NAME = "text"; protected final static String TEXT_FIELD_NAME = "text";
private final Analyzer queryAnalyzer; /** Analyzer used at search time */
final Analyzer indexAnalyzer; protected final Analyzer queryAnalyzer;
/** Analyzer used at index time */
protected final Analyzer indexAnalyzer;
final Version matchVersion; final Version matchVersion;
private final File indexPath; private final File indexPath;
final int minPrefixChars; final int minPrefixChars;
@ -422,9 +425,6 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
ScoreDoc sd = hits.scoreDocs[i]; ScoreDoc sd = hits.scoreDocs[i];
textDV.get(sd.doc, scratch); textDV.get(sd.doc, scratch);
String text = scratch.utf8ToString(); String text = scratch.utf8ToString();
if (doHighlight) {
text = highlight(text, matchedTokens, prefixToken);
}
long score = weightsDV.get(sd.doc); long score = weightsDV.get(sd.doc);
BytesRef payload; BytesRef payload;
@ -435,7 +435,15 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
payload = null; payload = null;
} }
results.add(new LookupResult(text, score, payload)); LookupResult result;
if (doHighlight) {
Object highlightKey = highlight(text, matchedTokens, prefixToken);
result = new LookupResult(highlightKey.toString(), highlightKey, score, payload);
} else {
result = new LookupResult(text, score, payload);
}
results.add(result);
} }
//System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest"); //System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest");
//System.out.println(results); //System.out.println(results);
@ -451,7 +459,11 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
return in; return in;
} }
private String highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException { /** Override this method to customize the Object
* representing a single highlighted suggestion; the
* result is set on each {@link
* LookupResult#highlightKey} member. */
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text)); TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
@ -463,7 +475,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
int startOffset = offsetAtt.startOffset(); int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset(); int endOffset = offsetAtt.endOffset();
if (upto < startOffset) { if (upto < startOffset) {
sb.append(text.substring(upto, startOffset)); addNonMatch(sb, text.substring(upto, startOffset));
upto = startOffset; upto = startOffset;
} else if (upto > startOffset) { } else if (upto > startOffset) {
continue; continue;
@ -481,24 +493,38 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
ts.end(); ts.end();
int endOffset = offsetAtt.endOffset(); int endOffset = offsetAtt.endOffset();
if (upto < endOffset) { if (upto < endOffset) {
sb.append(text.substring(upto)); addNonMatch(sb, text.substring(upto));
} }
ts.close(); ts.close();
return sb.toString(); return sb.toString();
} }
/** Appends the whole matched token to the provided {@code /** Called while highlighting a single result, to append a
* StringBuilder}. */ * non-matching chunk of text from the suggestion to the
* provided fragments list.
* @param sb The {@code StringBuilder} to append to
* @param text The text chunk to add
*/
protected void addNonMatch(StringBuilder sb, String text) {
sb.append(text);
}
/** Called while highlighting a single result, to append
* the whole matched token to the provided fragments list.
* @param sb The {@code StringBuilder} to append to
* @param surface The surface form (original) text
* @param analyzed The analyzed token corresponding to the surface form text
*/
protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) { protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
sb.append("<b>"); sb.append("<b>");
sb.append(surface); sb.append(surface);
sb.append("</b>"); sb.append("</b>");
} }
/** Append a matched prefix token, to the provided /** Called while highlighting a single result, to append a
* {@code StringBuilder}. * matched prefix token, to the provided fragments list.
* @param sb {@code StringBuilder} to append to * @param sb The {@code StringBuilder} to append to
* @param surface The fragment of the surface form * @param surface The fragment of the surface form
* (indexed during {@link #build}, corresponding to * (indexed during {@link #build}, corresponding to
* this match * this match
@ -509,13 +535,10 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
// TODO: apps can try to invert their analysis logic // TODO: apps can try to invert their analysis logic
// here, e.g. downcase the two before checking prefix: // here, e.g. downcase the two before checking prefix:
sb.append("<b>"); sb.append("<b>");
if (surface.startsWith(prefixToken)) { sb.append(surface.substring(0, prefixToken.length()));
sb.append(surface.substring(0, prefixToken.length())); sb.append("</b>");
sb.append("</b>"); if (prefixToken.length() < surface.length()) {
sb.append(surface.substring(prefixToken.length())); sb.append(surface.substring(prefixToken.length()));
} else {
sb.append(surface);
sb.append("</b>");
} }
} }

View File

@ -18,14 +18,20 @@ package org.apache.lucene.search.suggest.analyzing;
*/ */
import java.io.File; import java.io.File;
import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.search.suggest.Lookup.LookupResult; import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.TermFreqPayload; import org.apache.lucene.search.suggest.TermFreqPayload;
@ -120,6 +126,109 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
suggester.close(); suggester.close();
} }
/** Carries one highlighted chunk of a suggestion; instances are
 *  collected into the {@link LookupResult#highlightKey} object. */
private static final class LookupHighlightFragment {

    /** The chunk of suggestion text covered by this fragment. */
    public final String text;

    /** Whether this chunk matched part of the user's query. */
    public final boolean isHit;

    /** Sole constructor. */
    public LookupHighlightFragment(String text, boolean isHit) {
        this.text = text;
        this.isHit = isHit;
    }

    @Override
    public String toString() {
        // Same rendering as concatenation: booleans format as true/false.
        return String.format("LookupHighlightFragment(text=%s isHit=%s)", text, isHit);
    }
}
/** Verifies that overriding {@code highlight} to return a custom Object
 *  (here a list of {@code LookupHighlightFragment}s instead of a String)
 *  is surfaced to callers via {@code LookupResult.highlightKey}. */
@SuppressWarnings("unchecked")
public void testHighlightAsObject() throws Exception {
// Single suggestion with a payload so we can also check payload passthrough.
TermFreqPayload keys[] = new TermFreqPayload[] {
new TermFreqPayload("a penny saved is a penny earned", 10, new BytesRef("foobaz")),
};
File tempDir = _TestUtil.getTempDir("AnalyzingInfixSuggesterTest");
Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
// minPrefixChars=3: queries shorter than 3 chars ("ear" is exactly 3) go through prefix handling.
AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
@Override
protected Directory getDirectory(File path) {
return newDirectory();
}
// Custom highlighter: emit structured fragments instead of an HTML string.
@Override
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
List<LookupHighlightFragment> fragments = new ArrayList<LookupHighlightFragment>();
// upto tracks how far into the surface text we have emitted fragments.
int upto = 0;
while (ts.incrementToken()) {
String token = termAtt.toString();
int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
if (upto < startOffset) {
// Gap between previous token and this one: non-matching text.
fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
upto = startOffset;
} else if (upto > startOffset) {
// Overlapping token (already covered) -- skip it.
continue;
}
if (matchedTokens.contains(token)) {
// Token matches.
fragments.add(new LookupHighlightFragment(text.substring(startOffset, endOffset), true));
upto = endOffset;
} else if (prefixToken != null && token.startsWith(prefixToken)) {
// Last query token matched as a prefix: only the prefix portion is a hit.
fragments.add(new LookupHighlightFragment(text.substring(startOffset, startOffset+prefixToken.length()), true));
if (prefixToken.length() < token.length()) {
fragments.add(new LookupHighlightFragment(text.substring(startOffset+prefixToken.length(), startOffset+token.length()), false));
}
upto = endOffset;
}
}
ts.end();
// Trailing text after the final token is a non-matching fragment.
int endOffset = offsetAtt.endOffset();
if (upto < endOffset) {
fragments.add(new LookupHighlightFragment(text.substring(upto), false));
}
ts.close();
return fragments;
}
};
suggester.build(new TermFreqPayloadArrayIterator(keys));
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("ear", random()), 10, true, true);
assertEquals(1, results.size());
// highlightKey holds the fragment list returned by the override above;
// render it back to a string to check the hit boundaries.
assertEquals("a penny saved is a penny <b>ear</b>ned", toString((List<LookupHighlightFragment>) results.get(0).highlightKey));
assertEquals(10, results.get(0).value);
assertEquals(new BytesRef("foobaz"), results.get(0).payload);
suggester.close();
}
/** Renders a fragment list back into a single string, wrapping each
 *  hit fragment in {@code <b>...</b>} tags.
 *  @param fragments the highlighted fragments, in order
 *  @return the concatenated, tagged text */
public String toString(List<LookupHighlightFragment> fragments) {
    StringBuilder out = new StringBuilder();
    for (LookupHighlightFragment fragment : fragments) {
        if (fragment.isHit) {
            out.append("<b>").append(fragment.text).append("</b>");
        } else {
            out.append(fragment.text);
        }
    }
    return out.toString();
}
public void testRandomMinPrefixLength() throws Exception { public void testRandomMinPrefixLength() throws Exception {
TermFreqPayload keys[] = new TermFreqPayload[] { TermFreqPayload keys[] = new TermFreqPayload[] {
new TermFreqPayload("lend me your ear", 8, new BytesRef("foobar")), new TermFreqPayload("lend me your ear", 8, new BytesRef("foobar")),
@ -240,24 +349,17 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
suggester.build(new TermFreqPayloadArrayIterator(keys)); suggester.build(new TermFreqPayloadArrayIterator(keys));
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true); List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true);
assertEquals(1, results.size()); assertEquals(1, results.size());
assertEquals("a <b>Penny</b> saved is a <b>penn</b>y earned", results.get(0).key); assertEquals("a <b>Penn</b>y saved is a <b>penn</b>y earned", results.get(0).key);
suggester.close(); suggester.close();
// Try again, but overriding addPrefixMatch to normalize case: // Try again, but overriding addPrefixMatch to highlight
// the entire hit:
suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) { suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
@Override @Override
protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) { protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) {
prefixToken = prefixToken.toLowerCase(Locale.ROOT);
String surfaceLower = surface.toLowerCase(Locale.ROOT);
sb.append("<b>"); sb.append("<b>");
if (surfaceLower.startsWith(prefixToken)) { sb.append(surface);
sb.append(surface.substring(0, prefixToken.length())); sb.append("</b>");
sb.append("</b>");
sb.append(surface.substring(prefixToken.length()));
} else {
sb.append(surface);
sb.append("</b>");
}
} }
@Override @Override
@ -268,7 +370,7 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
suggester.build(new TermFreqPayloadArrayIterator(keys)); suggester.build(new TermFreqPayloadArrayIterator(keys));
results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true); results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true);
assertEquals(1, results.size()); assertEquals(1, results.size());
assertEquals("a <b>Penn</b>y saved is a <b>penn</b>y earned", results.get(0).key); assertEquals("a <b>Penny</b> saved is a <b>penny</b> earned", results.get(0).key);
suggester.close(); suggester.close();
} }