diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 13d10665359..f025d84b589 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -53,6 +53,11 @@ New Features for advanced use cases where String is too restrictive (Luca Cavanna, Robert Muir, Mike McCandless) +* LUCENE-5133: Changed AnalyzingInfixSuggester.highlight to return + Object instead of String, to allow for advanced use cases where + String is too restrictive (Robert Muir, Shai Erera, Mike + McCandless) + Changes in backwards compatibility policy * LUCENE-5204: Directory doesn't have default implementations for diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java index f053cf5b224..a1c64d373f0 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java @@ -41,6 +41,10 @@ public abstract class Lookup { /** the key's text */ public final CharSequence key; + /** Expert: custom Object to hold the result of a + * highlighted suggestion. */ + public final Object highlightKey; + /** the key's weight */ public final long value; @@ -59,6 +63,17 @@ public abstract class Lookup { */ public LookupResult(CharSequence key, long value, BytesRef payload) { this.key = key; + this.highlightKey = null; + this.value = value; + this.payload = payload; + } + + /** + * Create a new result from a key+highlightKey+weight+payload triple. + */ + public LookupResult(CharSequence key, Object highlightKey, long value, BytesRef payload) { + this.key = key; + this.highlightKey = highlightKey; this.value = value; this.payload = payload; } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java index d0b8a20bb90..ae62acb6ff3 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java @@ -67,6 +67,7 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqPayloadIterator; +import org.apache.lucene.search.suggest.Lookup.LookupResult; // javadocs import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; @@ -98,8 +99,10 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { /** Field name used for the indexed text. */ protected final static String TEXT_FIELD_NAME = "text"; - private final Analyzer queryAnalyzer; - final Analyzer indexAnalyzer; + /** Analyzer used at search time */ + protected final Analyzer queryAnalyzer; + /** Analyzer used at index time */ + protected final Analyzer indexAnalyzer; final Version matchVersion; private final File indexPath; final int minPrefixChars; @@ -422,9 +425,6 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { ScoreDoc sd = hits.scoreDocs[i]; textDV.get(sd.doc, scratch); String text = scratch.utf8ToString(); - if (doHighlight) { - text = highlight(text, matchedTokens, prefixToken); - } long score = weightsDV.get(sd.doc); BytesRef payload; @@ -435,7 +435,15 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { payload = null; } - results.add(new LookupResult(text, score, payload)); + LookupResult result; + + if (doHighlight) { + Object highlightKey = highlight(text, matchedTokens, prefixToken); + result = new LookupResult(highlightKey.toString(), highlightKey, score, payload); + } else { + result = new LookupResult(text, score, payload); + } + results.add(result); } //System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest"); //System.out.println(results); @@ -451,7 +459,11 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { return in; } - private String highlight(String text, Set matchedTokens, String prefixToken) throws IOException { + /** Override this method to customize the Object + * representing a single highlighted suggestions; the + * result is set on each {@link + * LookupResult#highlightKey} member. */ + protected Object highlight(String text, Set matchedTokens, String prefixToken) throws IOException { TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text)); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); @@ -463,7 +475,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { int startOffset = offsetAtt.startOffset(); int endOffset = offsetAtt.endOffset(); if (upto < startOffset) { - sb.append(text.substring(upto, startOffset)); + addNonMatch(sb, text.substring(upto, startOffset)); upto = startOffset; } else if (upto > startOffset) { continue; @@ -481,24 +493,38 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { ts.end(); int endOffset = offsetAtt.endOffset(); if (upto < endOffset) { - sb.append(text.substring(upto)); + addNonMatch(sb, text.substring(upto)); } ts.close(); return sb.toString(); } - /** Appends the whole matched token to the provided {@code - * StringBuilder}. */ + /** Called while highlighting a single result, to append a + * non-matching chunk of text from the suggestion to the + * provided fragments list. + * @param sb The {@code StringBuilder} to append to + * @param text The text chunk to add + */ + protected void addNonMatch(StringBuilder sb, String text) { + sb.append(text); + } + + /** Called while highlighting a single result, to append + * the whole matched token to the provided fragments list. + * @param sb The {@code StringBuilder} to append to + * @param surface The surface form (original) text + * @param analyzed The analyzed token corresponding to the surface form text + */ protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) { sb.append(""); sb.append(surface); sb.append(""); } - /** Append a matched prefix token, to the provided - * {@code StringBuilder}. - * @param sb {@code StringBuilder} to append to + /** Called while highlighting a single result, to append a + * matched prefix token, to the provided fragments list. + * @param sb The {@code StringBuilder} to append to * @param surface The fragment of the surface form * (indexed during {@link #build}, corresponding to * this match @@ -509,13 +535,10 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { // TODO: apps can try to invert their analysis logic // here, e.g. downcase the two before checking prefix: sb.append(""); - if (surface.startsWith(prefixToken)) { - sb.append(surface.substring(0, prefixToken.length())); - sb.append(""); + sb.append(surface.substring(0, prefixToken.length())); + sb.append(""); + if (prefixToken.length() < surface.length()) { sb.append(surface.substring(prefixToken.length())); - } else { - sb.append(surface); - sb.append(""); } } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java index c33028e65ac..2b5ce78de4a 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java @@ -18,14 +18,20 @@ package org.apache.lucene.search.suggest.analyzing; */ import java.io.File; +import java.io.IOException; import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; import java.util.List; -import java.util.Locale; +import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.search.suggest.Lookup.LookupResult; import org.apache.lucene.search.suggest.TermFreqPayload; @@ -120,6 +126,109 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase { suggester.close(); } + /** Used to return highlighted result; see {@link + * LookupResult#highlightKey} */ + private static final class LookupHighlightFragment { + /** Portion of text for this fragment. */ + public final String text; + + /** True if this text matched a part of the user's + * query. */ + public final boolean isHit; + + /** Sole constructor. */ + public LookupHighlightFragment(String text, boolean isHit) { + this.text = text; + this.isHit = isHit; + } + + @Override + public String toString() { + return "LookupHighlightFragment(text=" + text + " isHit=" + isHit + ")"; + } + } + + @SuppressWarnings("unchecked") + public void testHighlightAsObject() throws Exception { + TermFreqPayload keys[] = new TermFreqPayload[] { + new TermFreqPayload("a penny saved is a penny earned", 10, new BytesRef("foobaz")), + }; + + File tempDir = _TestUtil.getTempDir("AnalyzingInfixSuggesterTest"); + + Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) { + @Override + protected Directory getDirectory(File path) { + return newDirectory(); + } + + @Override + protected Object highlight(String text, Set matchedTokens, String prefixToken) throws IOException { + TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text)); + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); + ts.reset(); + List fragments = new ArrayList(); + int upto = 0; + while (ts.incrementToken()) { + String token = termAtt.toString(); + int startOffset = offsetAtt.startOffset(); + int endOffset = offsetAtt.endOffset(); + if (upto < startOffset) { + fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false)); + upto = startOffset; + } else if (upto > startOffset) { + continue; + } + + if (matchedTokens.contains(token)) { + // Token matches. + fragments.add(new LookupHighlightFragment(text.substring(startOffset, endOffset), true)); + upto = endOffset; + } else if (prefixToken != null && token.startsWith(prefixToken)) { + fragments.add(new LookupHighlightFragment(text.substring(startOffset, startOffset+prefixToken.length()), true)); + if (prefixToken.length() < token.length()) { + fragments.add(new LookupHighlightFragment(text.substring(startOffset+prefixToken.length(), startOffset+token.length()), false)); + } + upto = endOffset; + } + } + ts.end(); + int endOffset = offsetAtt.endOffset(); + if (upto < endOffset) { + fragments.add(new LookupHighlightFragment(text.substring(upto), false)); + } + ts.close(); + + return fragments; + } + }; + suggester.build(new TermFreqPayloadArrayIterator(keys)); + + List results = suggester.lookup(_TestUtil.stringToCharSequence("ear", random()), 10, true, true); + assertEquals(1, results.size()); + assertEquals("a penny saved is a penny earned", toString((List) results.get(0).highlightKey)); + assertEquals(10, results.get(0).value); + assertEquals(new BytesRef("foobaz"), results.get(0).payload); + suggester.close(); + } + + public String toString(List fragments) { + StringBuilder sb = new StringBuilder(); + for(LookupHighlightFragment fragment : fragments) { + if (fragment.isHit) { + sb.append(""); + } + sb.append(fragment.text); + if (fragment.isHit) { + sb.append(""); + } + } + + return sb.toString(); + } + public void testRandomMinPrefixLength() throws Exception { TermFreqPayload keys[] = new TermFreqPayload[] { new TermFreqPayload("lend me your ear", 8, new BytesRef("foobar")), @@ -240,24 +349,17 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase { suggester.build(new TermFreqPayloadArrayIterator(keys)); List results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true); assertEquals(1, results.size()); - assertEquals("a Penny saved is a penny earned", results.get(0).key); + assertEquals("a Penny saved is a penny earned", results.get(0).key); suggester.close(); - // Try again, but overriding addPrefixMatch to normalize case: + // Try again, but overriding addPrefixMatch to highlight + // the entire hit: suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) { @Override protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) { - prefixToken = prefixToken.toLowerCase(Locale.ROOT); - String surfaceLower = surface.toLowerCase(Locale.ROOT); sb.append(""); - if (surfaceLower.startsWith(prefixToken)) { - sb.append(surface.substring(0, prefixToken.length())); - sb.append(""); - sb.append(surface.substring(prefixToken.length())); - } else { - sb.append(surface); - sb.append(""); - } + sb.append(surface); + sb.append(""); } @Override @@ -268,7 +370,7 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase { suggester.build(new TermFreqPayloadArrayIterator(keys)); results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true); assertEquals(1, results.size()); - assertEquals("a Penny saved is a penny earned", results.get(0).key); + assertEquals("a Penny saved is a penny earned", results.get(0).key); suggester.close(); }