mirror of https://github.com/apache/lucene.git
LUCENE-5133: allow highlight to return Object in AnalyzingInfixSuggester for advanced use cases
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1522667 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d06ba3caaf
commit
534a9f3080
|
@ -53,6 +53,11 @@ New Features
|
||||||
for advanced use cases where String is too restrictive (Luca
|
for advanced use cases where String is too restrictive (Luca
|
||||||
Cavanna, Robert Muir, Mike McCandless)
|
Cavanna, Robert Muir, Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-5133: Changed AnalyzingInfixSuggester.highlight to return
|
||||||
|
Object instead of String, to allow for advanced use cases where
|
||||||
|
String is too restrictive (Robert Muir, Shai Erera, Mike
|
||||||
|
McCandless)
|
||||||
|
|
||||||
Changes in backwards compatibility policy
|
Changes in backwards compatibility policy
|
||||||
|
|
||||||
* LUCENE-5204: Directory doesn't have default implementations for
|
* LUCENE-5204: Directory doesn't have default implementations for
|
||||||
|
|
|
@ -41,6 +41,10 @@ public abstract class Lookup {
|
||||||
/** the key's text */
|
/** the key's text */
|
||||||
public final CharSequence key;
|
public final CharSequence key;
|
||||||
|
|
||||||
|
/** Expert: custom Object to hold the result of a
|
||||||
|
* highlighted suggestion. */
|
||||||
|
public final Object highlightKey;
|
||||||
|
|
||||||
/** the key's weight */
|
/** the key's weight */
|
||||||
public final long value;
|
public final long value;
|
||||||
|
|
||||||
|
@ -59,6 +63,17 @@ public abstract class Lookup {
|
||||||
*/
|
*/
|
||||||
public LookupResult(CharSequence key, long value, BytesRef payload) {
|
public LookupResult(CharSequence key, long value, BytesRef payload) {
|
||||||
this.key = key;
|
this.key = key;
|
||||||
|
this.highlightKey = null;
|
||||||
|
this.value = value;
|
||||||
|
this.payload = payload;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new result from a key+highlightKey+weight+payload tuple.
|
||||||
|
*/
|
||||||
|
public LookupResult(CharSequence key, Object highlightKey, long value, BytesRef payload) {
|
||||||
|
this.key = key;
|
||||||
|
this.highlightKey = highlightKey;
|
||||||
this.value = value;
|
this.value = value;
|
||||||
this.payload = payload;
|
this.payload = payload;
|
||||||
}
|
}
|
||||||
|
|
|
@ -67,6 +67,7 @@ import org.apache.lucene.search.TermQuery;
|
||||||
import org.apache.lucene.search.TopDocs;
|
import org.apache.lucene.search.TopDocs;
|
||||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||||
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
|
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
|
||||||
|
import org.apache.lucene.search.suggest.Lookup.LookupResult; // javadocs
|
||||||
import org.apache.lucene.search.suggest.Lookup;
|
import org.apache.lucene.search.suggest.Lookup;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.FSDirectory;
|
import org.apache.lucene.store.FSDirectory;
|
||||||
|
@ -98,8 +99,10 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
|
||||||
/** Field name used for the indexed text. */
|
/** Field name used for the indexed text. */
|
||||||
protected final static String TEXT_FIELD_NAME = "text";
|
protected final static String TEXT_FIELD_NAME = "text";
|
||||||
|
|
||||||
private final Analyzer queryAnalyzer;
|
/** Analyzer used at search time */
|
||||||
final Analyzer indexAnalyzer;
|
protected final Analyzer queryAnalyzer;
|
||||||
|
/** Analyzer used at index time */
|
||||||
|
protected final Analyzer indexAnalyzer;
|
||||||
final Version matchVersion;
|
final Version matchVersion;
|
||||||
private final File indexPath;
|
private final File indexPath;
|
||||||
final int minPrefixChars;
|
final int minPrefixChars;
|
||||||
|
@ -422,9 +425,6 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
|
||||||
ScoreDoc sd = hits.scoreDocs[i];
|
ScoreDoc sd = hits.scoreDocs[i];
|
||||||
textDV.get(sd.doc, scratch);
|
textDV.get(sd.doc, scratch);
|
||||||
String text = scratch.utf8ToString();
|
String text = scratch.utf8ToString();
|
||||||
if (doHighlight) {
|
|
||||||
text = highlight(text, matchedTokens, prefixToken);
|
|
||||||
}
|
|
||||||
long score = weightsDV.get(sd.doc);
|
long score = weightsDV.get(sd.doc);
|
||||||
|
|
||||||
BytesRef payload;
|
BytesRef payload;
|
||||||
|
@ -435,7 +435,15 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
|
||||||
payload = null;
|
payload = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
results.add(new LookupResult(text, score, payload));
|
LookupResult result;
|
||||||
|
|
||||||
|
if (doHighlight) {
|
||||||
|
Object highlightKey = highlight(text, matchedTokens, prefixToken);
|
||||||
|
result = new LookupResult(highlightKey.toString(), highlightKey, score, payload);
|
||||||
|
} else {
|
||||||
|
result = new LookupResult(text, score, payload);
|
||||||
|
}
|
||||||
|
results.add(result);
|
||||||
}
|
}
|
||||||
//System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest");
|
//System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest");
|
||||||
//System.out.println(results);
|
//System.out.println(results);
|
||||||
|
@ -451,7 +459,11 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
|
||||||
return in;
|
return in;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
|
/** Override this method to customize the Object
|
||||||
|
* representing a single highlighted suggestion; the
|
||||||
|
* result is set on each {@link
|
||||||
|
* LookupResult#highlightKey} member. */
|
||||||
|
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
|
||||||
TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
|
TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
|
||||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||||
|
@ -463,7 +475,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
|
||||||
int startOffset = offsetAtt.startOffset();
|
int startOffset = offsetAtt.startOffset();
|
||||||
int endOffset = offsetAtt.endOffset();
|
int endOffset = offsetAtt.endOffset();
|
||||||
if (upto < startOffset) {
|
if (upto < startOffset) {
|
||||||
sb.append(text.substring(upto, startOffset));
|
addNonMatch(sb, text.substring(upto, startOffset));
|
||||||
upto = startOffset;
|
upto = startOffset;
|
||||||
} else if (upto > startOffset) {
|
} else if (upto > startOffset) {
|
||||||
continue;
|
continue;
|
||||||
|
@ -481,24 +493,38 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
|
||||||
ts.end();
|
ts.end();
|
||||||
int endOffset = offsetAtt.endOffset();
|
int endOffset = offsetAtt.endOffset();
|
||||||
if (upto < endOffset) {
|
if (upto < endOffset) {
|
||||||
sb.append(text.substring(upto));
|
addNonMatch(sb, text.substring(upto));
|
||||||
}
|
}
|
||||||
ts.close();
|
ts.close();
|
||||||
|
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Appends the whole matched token to the provided {@code
|
/** Called while highlighting a single result, to append a
|
||||||
* StringBuilder}. */
|
* non-matching chunk of text from the suggestion to the
|
||||||
|
* provided {@code StringBuilder}.
|
||||||
|
* @param sb The {@code StringBuilder} to append to
|
||||||
|
* @param text The text chunk to add
|
||||||
|
*/
|
||||||
|
protected void addNonMatch(StringBuilder sb, String text) {
|
||||||
|
sb.append(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Called while highlighting a single result, to append
|
||||||
|
* the whole matched token to the provided {@code StringBuilder}.
|
||||||
|
* @param sb The {@code StringBuilder} to append to
|
||||||
|
* @param surface The surface form (original) text
|
||||||
|
* @param analyzed The analyzed token corresponding to the surface form text
|
||||||
|
*/
|
||||||
protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
|
protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
|
||||||
sb.append("<b>");
|
sb.append("<b>");
|
||||||
sb.append(surface);
|
sb.append(surface);
|
||||||
sb.append("</b>");
|
sb.append("</b>");
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Append a matched prefix token, to the provided
|
/** Called while highlighting a single result, to append a
|
||||||
* {@code StringBuilder}.
|
* matched prefix token to the provided {@code StringBuilder}.
|
||||||
* @param sb {@code StringBuilder} to append to
|
* @param sb The {@code StringBuilder} to append to
|
||||||
* @param surface The fragment of the surface form
|
* @param surface The fragment of the surface form
|
||||||
* (indexed during {@link #build}, corresponding to
|
* (indexed during {@link #build}, corresponding to
|
||||||
* this match
|
* this match
|
||||||
|
@ -509,13 +535,10 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
|
||||||
// TODO: apps can try to invert their analysis logic
|
// TODO: apps can try to invert their analysis logic
|
||||||
// here, e.g. downcase the two before checking prefix:
|
// here, e.g. downcase the two before checking prefix:
|
||||||
sb.append("<b>");
|
sb.append("<b>");
|
||||||
if (surface.startsWith(prefixToken)) {
|
sb.append(surface.substring(0, prefixToken.length()));
|
||||||
sb.append(surface.substring(0, prefixToken.length()));
|
sb.append("</b>");
|
||||||
sb.append("</b>");
|
if (prefixToken.length() < surface.length()) {
|
||||||
sb.append(surface.substring(prefixToken.length()));
|
sb.append(surface.substring(prefixToken.length()));
|
||||||
} else {
|
|
||||||
sb.append(surface);
|
|
||||||
sb.append("</b>");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,14 +18,20 @@ package org.apache.lucene.search.suggest.analyzing;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.MockAnalyzer;
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.core.StopFilter;
|
import org.apache.lucene.analysis.core.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.search.suggest.Lookup.LookupResult;
|
import org.apache.lucene.search.suggest.Lookup.LookupResult;
|
||||||
import org.apache.lucene.search.suggest.TermFreqPayload;
|
import org.apache.lucene.search.suggest.TermFreqPayload;
|
||||||
|
@ -120,6 +126,109 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
|
||||||
suggester.close();
|
suggester.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Used to return highlighted result; see {@link
|
||||||
|
* LookupResult#highlightKey} */
|
||||||
|
private static final class LookupHighlightFragment {
|
||||||
|
/** Portion of text for this fragment. */
|
||||||
|
public final String text;
|
||||||
|
|
||||||
|
/** True if this text matched a part of the user's
|
||||||
|
* query. */
|
||||||
|
public final boolean isHit;
|
||||||
|
|
||||||
|
/** Sole constructor. */
|
||||||
|
public LookupHighlightFragment(String text, boolean isHit) {
|
||||||
|
this.text = text;
|
||||||
|
this.isHit = isHit;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "LookupHighlightFragment(text=" + text + " isHit=" + isHit + ")";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public void testHighlightAsObject() throws Exception {
|
||||||
|
TermFreqPayload keys[] = new TermFreqPayload[] {
|
||||||
|
new TermFreqPayload("a penny saved is a penny earned", 10, new BytesRef("foobaz")),
|
||||||
|
};
|
||||||
|
|
||||||
|
File tempDir = _TestUtil.getTempDir("AnalyzingInfixSuggesterTest");
|
||||||
|
|
||||||
|
Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
|
||||||
|
AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
|
||||||
|
@Override
|
||||||
|
protected Directory getDirectory(File path) {
|
||||||
|
return newDirectory();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
|
||||||
|
TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
|
||||||
|
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||||
|
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||||
|
ts.reset();
|
||||||
|
List<LookupHighlightFragment> fragments = new ArrayList<LookupHighlightFragment>();
|
||||||
|
int upto = 0;
|
||||||
|
while (ts.incrementToken()) {
|
||||||
|
String token = termAtt.toString();
|
||||||
|
int startOffset = offsetAtt.startOffset();
|
||||||
|
int endOffset = offsetAtt.endOffset();
|
||||||
|
if (upto < startOffset) {
|
||||||
|
fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
|
||||||
|
upto = startOffset;
|
||||||
|
} else if (upto > startOffset) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (matchedTokens.contains(token)) {
|
||||||
|
// Token matches.
|
||||||
|
fragments.add(new LookupHighlightFragment(text.substring(startOffset, endOffset), true));
|
||||||
|
upto = endOffset;
|
||||||
|
} else if (prefixToken != null && token.startsWith(prefixToken)) {
|
||||||
|
fragments.add(new LookupHighlightFragment(text.substring(startOffset, startOffset+prefixToken.length()), true));
|
||||||
|
if (prefixToken.length() < token.length()) {
|
||||||
|
fragments.add(new LookupHighlightFragment(text.substring(startOffset+prefixToken.length(), startOffset+token.length()), false));
|
||||||
|
}
|
||||||
|
upto = endOffset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ts.end();
|
||||||
|
int endOffset = offsetAtt.endOffset();
|
||||||
|
if (upto < endOffset) {
|
||||||
|
fragments.add(new LookupHighlightFragment(text.substring(upto), false));
|
||||||
|
}
|
||||||
|
ts.close();
|
||||||
|
|
||||||
|
return fragments;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
suggester.build(new TermFreqPayloadArrayIterator(keys));
|
||||||
|
|
||||||
|
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("ear", random()), 10, true, true);
|
||||||
|
assertEquals(1, results.size());
|
||||||
|
assertEquals("a penny saved is a penny <b>ear</b>ned", toString((List<LookupHighlightFragment>) results.get(0).highlightKey));
|
||||||
|
assertEquals(10, results.get(0).value);
|
||||||
|
assertEquals(new BytesRef("foobaz"), results.get(0).payload);
|
||||||
|
suggester.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString(List<LookupHighlightFragment> fragments) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for(LookupHighlightFragment fragment : fragments) {
|
||||||
|
if (fragment.isHit) {
|
||||||
|
sb.append("<b>");
|
||||||
|
}
|
||||||
|
sb.append(fragment.text);
|
||||||
|
if (fragment.isHit) {
|
||||||
|
sb.append("</b>");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
public void testRandomMinPrefixLength() throws Exception {
|
public void testRandomMinPrefixLength() throws Exception {
|
||||||
TermFreqPayload keys[] = new TermFreqPayload[] {
|
TermFreqPayload keys[] = new TermFreqPayload[] {
|
||||||
new TermFreqPayload("lend me your ear", 8, new BytesRef("foobar")),
|
new TermFreqPayload("lend me your ear", 8, new BytesRef("foobar")),
|
||||||
|
@ -240,24 +349,17 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
|
||||||
suggester.build(new TermFreqPayloadArrayIterator(keys));
|
suggester.build(new TermFreqPayloadArrayIterator(keys));
|
||||||
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true);
|
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true);
|
||||||
assertEquals(1, results.size());
|
assertEquals(1, results.size());
|
||||||
assertEquals("a <b>Penny</b> saved is a <b>penn</b>y earned", results.get(0).key);
|
assertEquals("a <b>Penn</b>y saved is a <b>penn</b>y earned", results.get(0).key);
|
||||||
suggester.close();
|
suggester.close();
|
||||||
|
|
||||||
// Try again, but overriding addPrefixMatch to normalize case:
|
// Try again, but overriding addPrefixMatch to highlight
|
||||||
|
// the entire hit:
|
||||||
suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
|
suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
|
||||||
@Override
|
@Override
|
||||||
protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) {
|
protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) {
|
||||||
prefixToken = prefixToken.toLowerCase(Locale.ROOT);
|
|
||||||
String surfaceLower = surface.toLowerCase(Locale.ROOT);
|
|
||||||
sb.append("<b>");
|
sb.append("<b>");
|
||||||
if (surfaceLower.startsWith(prefixToken)) {
|
sb.append(surface);
|
||||||
sb.append(surface.substring(0, prefixToken.length()));
|
sb.append("</b>");
|
||||||
sb.append("</b>");
|
|
||||||
sb.append(surface.substring(prefixToken.length()));
|
|
||||||
} else {
|
|
||||||
sb.append(surface);
|
|
||||||
sb.append("</b>");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -268,7 +370,7 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
|
||||||
suggester.build(new TermFreqPayloadArrayIterator(keys));
|
suggester.build(new TermFreqPayloadArrayIterator(keys));
|
||||||
results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true);
|
results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true);
|
||||||
assertEquals(1, results.size());
|
assertEquals(1, results.size());
|
||||||
assertEquals("a <b>Penn</b>y saved is a <b>penn</b>y earned", results.get(0).key);
|
assertEquals("a <b>Penny</b> saved is a <b>penny</b> earned", results.get(0).key);
|
||||||
suggester.close();
|
suggester.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue