LUCENE-5133: allow highlight to Object for AnalyzingInfixSuggester for advanced use cases

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1522667 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-09-12 17:14:27 +00:00
parent d06ba3caaf
commit 534a9f3080
4 changed files with 179 additions and 34 deletions

View File

@ -53,6 +53,11 @@ New Features
for advanced use cases where String is too restrictive (Luca for advanced use cases where String is too restrictive (Luca
Cavanna, Robert Muir, Mike McCandless) Cavanna, Robert Muir, Mike McCandless)
* LUCENE-5133: Changed AnalyzingInfixSuggester.highlight to return
Object instead of String, to allow for advanced use cases where
String is too restrictive (Robert Muir, Shai Erera, Mike
McCandless)
Changes in backwards compatibility policy Changes in backwards compatibility policy
* LUCENE-5204: Directory doesn't have default implementations for * LUCENE-5204: Directory doesn't have default implementations for

View File

@ -41,6 +41,10 @@ public abstract class Lookup {
/** the key's text */ /** the key's text */
public final CharSequence key; public final CharSequence key;
/** Expert: custom Object to hold the result of a
* highlighted suggestion. */
public final Object highlightKey;
/** the key's weight */ /** the key's weight */
public final long value; public final long value;
@ -59,6 +63,17 @@ public abstract class Lookup {
*/ */
public LookupResult(CharSequence key, long value, BytesRef payload) { public LookupResult(CharSequence key, long value, BytesRef payload) {
this.key = key; this.key = key;
this.highlightKey = null;
this.value = value;
this.payload = payload;
}
/**
* Create a new result from a key+highlightKey+weight+payload tuple.
*/
public LookupResult(CharSequence key, Object highlightKey, long value, BytesRef payload) {
this.key = key;
this.highlightKey = highlightKey;
this.value = value; this.value = value;
this.payload = payload; this.payload = payload;
} }

View File

@ -67,6 +67,7 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.spell.TermFreqPayloadIterator; import org.apache.lucene.search.spell.TermFreqPayloadIterator;
import org.apache.lucene.search.suggest.Lookup.LookupResult; // javadocs
import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FSDirectory;
@ -98,8 +99,10 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
/** Field name used for the indexed text. */ /** Field name used for the indexed text. */
protected final static String TEXT_FIELD_NAME = "text"; protected final static String TEXT_FIELD_NAME = "text";
private final Analyzer queryAnalyzer; /** Analyzer used at search time */
final Analyzer indexAnalyzer; protected final Analyzer queryAnalyzer;
/** Analyzer used at index time */
protected final Analyzer indexAnalyzer;
final Version matchVersion; final Version matchVersion;
private final File indexPath; private final File indexPath;
final int minPrefixChars; final int minPrefixChars;
@ -422,9 +425,6 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
ScoreDoc sd = hits.scoreDocs[i]; ScoreDoc sd = hits.scoreDocs[i];
textDV.get(sd.doc, scratch); textDV.get(sd.doc, scratch);
String text = scratch.utf8ToString(); String text = scratch.utf8ToString();
if (doHighlight) {
text = highlight(text, matchedTokens, prefixToken);
}
long score = weightsDV.get(sd.doc); long score = weightsDV.get(sd.doc);
BytesRef payload; BytesRef payload;
@ -435,7 +435,15 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
payload = null; payload = null;
} }
results.add(new LookupResult(text, score, payload)); LookupResult result;
if (doHighlight) {
Object highlightKey = highlight(text, matchedTokens, prefixToken);
result = new LookupResult(highlightKey.toString(), highlightKey, score, payload);
} else {
result = new LookupResult(text, score, payload);
}
results.add(result);
} }
//System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest"); //System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest");
//System.out.println(results); //System.out.println(results);
@ -451,7 +459,11 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
return in; return in;
} }
private String highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException { /** Override this method to customize the Object
* representing a single highlighted suggestion; the
* result is set on each {@link
* LookupResult#highlightKey} member. */
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text)); TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
@ -463,7 +475,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
int startOffset = offsetAtt.startOffset(); int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset(); int endOffset = offsetAtt.endOffset();
if (upto < startOffset) { if (upto < startOffset) {
sb.append(text.substring(upto, startOffset)); addNonMatch(sb, text.substring(upto, startOffset));
upto = startOffset; upto = startOffset;
} else if (upto > startOffset) { } else if (upto > startOffset) {
continue; continue;
@ -481,24 +493,38 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
ts.end(); ts.end();
int endOffset = offsetAtt.endOffset(); int endOffset = offsetAtt.endOffset();
if (upto < endOffset) { if (upto < endOffset) {
sb.append(text.substring(upto)); addNonMatch(sb, text.substring(upto));
} }
ts.close(); ts.close();
return sb.toString(); return sb.toString();
} }
/** Appends the whole matched token to the provided {@code /** Called while highlighting a single result, to append a
* StringBuilder}. */ * non-matching chunk of text from the suggestion to the
* provided fragments list.
* @param sb The {@code StringBuilder} to append to
* @param text The text chunk to add
*/
protected void addNonMatch(StringBuilder sb, String text) {
sb.append(text);
}
/** Called while highlighting a single result, to append
* the whole matched token to the provided fragments list.
* @param sb The {@code StringBuilder} to append to
* @param surface The surface form (original) text
* @param analyzed The analyzed token corresponding to the surface form text
*/
protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) { protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
sb.append("<b>"); sb.append("<b>");
sb.append(surface); sb.append(surface);
sb.append("</b>"); sb.append("</b>");
} }
/** Append a matched prefix token, to the provided /** Called while highlighting a single result, to append a
* {@code StringBuilder}. * matched prefix token, to the provided fragments list.
* @param sb {@code StringBuilder} to append to * @param sb The {@code StringBuilder} to append to
* @param surface The fragment of the surface form * @param surface The fragment of the surface form
* (indexed during {@link #build}, corresponding to * (indexed during {@link #build}, corresponding to
* this match * this match
@ -509,13 +535,10 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
// TODO: apps can try to invert their analysis logic // TODO: apps can try to invert their analysis logic
// here, e.g. downcase the two before checking prefix: // here, e.g. downcase the two before checking prefix:
sb.append("<b>"); sb.append("<b>");
if (surface.startsWith(prefixToken)) { sb.append(surface.substring(0, prefixToken.length()));
sb.append(surface.substring(0, prefixToken.length())); sb.append("</b>");
sb.append("</b>"); if (prefixToken.length() < surface.length()) {
sb.append(surface.substring(prefixToken.length())); sb.append(surface.substring(prefixToken.length()));
} else {
sb.append(surface);
sb.append("</b>");
} }
} }

View File

@ -18,14 +18,20 @@ package org.apache.lucene.search.suggest.analyzing;
*/ */
import java.io.File; import java.io.File;
import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.search.suggest.Lookup.LookupResult; import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.TermFreqPayload; import org.apache.lucene.search.suggest.TermFreqPayload;
@ -120,6 +126,109 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
suggester.close(); suggester.close();
} }
/** Carries one highlighted chunk of a suggestion; instances are
 *  collected into the {@link LookupResult#highlightKey} object. */
private static final class LookupHighlightFragment {

    /** The chunk of suggestion text covered by this fragment. */
    public final String text;

    /** Whether this chunk matched part of the user's query. */
    public final boolean isHit;

    /** Sole constructor. */
    public LookupHighlightFragment(String text, boolean isHit) {
        this.text = text;
        this.isHit = isHit;
    }

    @Override
    public String toString() {
        // Same rendering as concatenation: booleans format as true/false.
        return String.format("LookupHighlightFragment(text=%s isHit=%s)", text, isHit);
    }
}
/** Verifies that overriding {@code highlight} to return a custom Object
 *  (here a list of {@code LookupHighlightFragment}s instead of a String)
 *  is surfaced to callers via {@code LookupResult.highlightKey}. */
@SuppressWarnings("unchecked")
public void testHighlightAsObject() throws Exception {
// Single suggestion with a payload so we can also check payload passthrough.
TermFreqPayload keys[] = new TermFreqPayload[] {
new TermFreqPayload("a penny saved is a penny earned", 10, new BytesRef("foobaz")),
};
File tempDir = _TestUtil.getTempDir("AnalyzingInfixSuggesterTest");
Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
// minPrefixChars=3: queries shorter than 3 chars ("ear" is exactly 3) go through prefix handling.
AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
@Override
protected Directory getDirectory(File path) {
return newDirectory();
}
// Custom highlighter: emit structured fragments instead of an HTML string.
@Override
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
List<LookupHighlightFragment> fragments = new ArrayList<LookupHighlightFragment>();
// upto tracks how far into the surface text we have emitted fragments.
int upto = 0;
while (ts.incrementToken()) {
String token = termAtt.toString();
int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
if (upto < startOffset) {
// Gap between previous token and this one: non-matching text.
fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
upto = startOffset;
} else if (upto > startOffset) {
// Overlapping token (already covered) -- skip it.
continue;
}
if (matchedTokens.contains(token)) {
// Token matches.
fragments.add(new LookupHighlightFragment(text.substring(startOffset, endOffset), true));
upto = endOffset;
} else if (prefixToken != null && token.startsWith(prefixToken)) {
// Last query token matched as a prefix: only the prefix portion is a hit.
fragments.add(new LookupHighlightFragment(text.substring(startOffset, startOffset+prefixToken.length()), true));
if (prefixToken.length() < token.length()) {
fragments.add(new LookupHighlightFragment(text.substring(startOffset+prefixToken.length(), startOffset+token.length()), false));
}
upto = endOffset;
}
}
ts.end();
// Trailing text after the final token is a non-matching fragment.
int endOffset = offsetAtt.endOffset();
if (upto < endOffset) {
fragments.add(new LookupHighlightFragment(text.substring(upto), false));
}
ts.close();
return fragments;
}
};
suggester.build(new TermFreqPayloadArrayIterator(keys));
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("ear", random()), 10, true, true);
assertEquals(1, results.size());
// highlightKey holds the fragment list returned by the override above;
// render it back to a string to check the hit boundaries.
assertEquals("a penny saved is a penny <b>ear</b>ned", toString((List<LookupHighlightFragment>) results.get(0).highlightKey));
assertEquals(10, results.get(0).value);
assertEquals(new BytesRef("foobaz"), results.get(0).payload);
suggester.close();
}
/** Renders a fragment list back into a single string, wrapping each
 *  hit fragment in {@code <b>...</b>} tags.
 *  @param fragments the highlighted fragments, in order
 *  @return the concatenated, tagged text */
public String toString(List<LookupHighlightFragment> fragments) {
    StringBuilder out = new StringBuilder();
    for (LookupHighlightFragment fragment : fragments) {
        if (fragment.isHit) {
            out.append("<b>").append(fragment.text).append("</b>");
        } else {
            out.append(fragment.text);
        }
    }
    return out.toString();
}
public void testRandomMinPrefixLength() throws Exception { public void testRandomMinPrefixLength() throws Exception {
TermFreqPayload keys[] = new TermFreqPayload[] { TermFreqPayload keys[] = new TermFreqPayload[] {
new TermFreqPayload("lend me your ear", 8, new BytesRef("foobar")), new TermFreqPayload("lend me your ear", 8, new BytesRef("foobar")),
@ -240,24 +349,17 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
suggester.build(new TermFreqPayloadArrayIterator(keys)); suggester.build(new TermFreqPayloadArrayIterator(keys));
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true); List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true);
assertEquals(1, results.size()); assertEquals(1, results.size());
assertEquals("a <b>Penny</b> saved is a <b>penn</b>y earned", results.get(0).key); assertEquals("a <b>Penn</b>y saved is a <b>penn</b>y earned", results.get(0).key);
suggester.close(); suggester.close();
// Try again, but overriding addPrefixMatch to normalize case: // Try again, but overriding addPrefixMatch to highlight
// the entire hit:
suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) { suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
@Override @Override
protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) { protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) {
prefixToken = prefixToken.toLowerCase(Locale.ROOT);
String surfaceLower = surface.toLowerCase(Locale.ROOT);
sb.append("<b>"); sb.append("<b>");
if (surfaceLower.startsWith(prefixToken)) { sb.append(surface);
sb.append(surface.substring(0, prefixToken.length())); sb.append("</b>");
sb.append("</b>");
sb.append(surface.substring(prefixToken.length()));
} else {
sb.append(surface);
sb.append("</b>");
}
} }
@Override @Override
@ -268,7 +370,7 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
suggester.build(new TermFreqPayloadArrayIterator(keys)); suggester.build(new TermFreqPayloadArrayIterator(keys));
results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true); results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true);
assertEquals(1, results.size()); assertEquals(1, results.size());
assertEquals("a <b>Penn</b>y saved is a <b>penn</b>y earned", results.get(0).key); assertEquals("a <b>Penny</b> saved is a <b>penny</b> earned", results.get(0).key);
suggester.close(); suggester.close();
} }