SOLR-6692: hl.maxAnalyzedChars should apply cumulatively on a multi-valued field

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1673200 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
David Wayne Smiley 2015-04-13 14:08:07 +00:00
parent c6482c1488
commit f5071289e7
3 changed files with 121 additions and 92 deletions

View File

@ -128,7 +128,11 @@ Other Changes
* SOLR-7384: Fix spurious failures in FullSolrCloudDistribCmdsTest. (shalin)
* SOLR-6692: The default highlighter is much more extensible. (David Smiley)
* SOLR-6692: Default highlighter changes:
- hl.maxAnalyzedChars now applies cumulatively on a multi-valied field.
- fragment ranking on a multi-valued field should be more relevant.
- Much more extensible.
(David Smiley)
================== 5.1.0 ==================

View File

@ -71,44 +71,44 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
*
*
* @since solr 1.3
*/
public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInfoInitialized
{
public static Logger log = LoggerFactory.getLogger(DefaultSolrHighlighter.class);
protected final SolrCore solrCore;
/** Will be invoked via reflection */
//Will be invoked via reflection
public DefaultSolrHighlighter(SolrCore solrCore) {
this.solrCore = solrCore;
}
// Thread safe registry
protected final Map<String,SolrFormatter> formatters =
new HashMap<>();
new HashMap<>();
// Thread safe registry
protected final Map<String,SolrEncoder> encoders =
new HashMap<>();
new HashMap<>();
// Thread safe registry
protected final Map<String,SolrFragmenter> fragmenters =
new HashMap<>() ;
new HashMap<>() ;
// Thread safe registry
protected final Map<String, SolrFragListBuilder> fragListBuilders =
new HashMap<>() ;
new HashMap<>() ;
// Thread safe registry
protected final Map<String, SolrFragmentsBuilder> fragmentsBuilders =
new HashMap<>() ;
new HashMap<>() ;
// Thread safe registry
protected final Map<String, SolrBoundaryScanner> boundaryScanners =
new HashMap<>() ;
new HashMap<>() ;
@Override
public void init(PluginInfo info) {
@ -143,7 +143,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
if( fragListBuilder == null ) fragListBuilder = new SimpleFragListBuilder();
fragListBuilders.put( "", fragListBuilder );
fragListBuilders.put( null, fragListBuilder );
// Load the FragmentsBuilders
SolrFragmentsBuilder fragsBuilder = solrCore.initPlugins(info.getChildren("fragmentsBuilder"),
fragmentsBuilders, SolrFragmentsBuilder.class, null);
@ -175,12 +175,12 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
getFormatter(fieldName, params),
getEncoder(fieldName, params),
getSpanQueryScorer(query, fieldName, tokenStream, request));
highlighter.setTextFragmenter(getFragmenter(fieldName, params));
return highlighter;
}
/**
* Return a {@link org.apache.lucene.search.highlight.Highlighter} appropriate for this field.
* @param query The current Query
@ -188,15 +188,15 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
* @param request The current SolrQueryRequest
*/
protected Highlighter getHighlighter(Query query, String fieldName, SolrQueryRequest request) {
SolrParams params = request.getParams();
SolrParams params = request.getParams();
Highlighter highlighter = new Highlighter(
getFormatter(fieldName, params),
getEncoder(fieldName, params),
getQueryScorer(query, fieldName, request));
highlighter.setTextFragmenter(getFragmenter(fieldName, params));
return highlighter;
getFormatter(fieldName, params),
getEncoder(fieldName, params),
getQueryScorer(query, fieldName, request));
highlighter.setTextFragmenter(getFragmenter(fieldName, params));
return highlighter;
}
/**
* Return a {@link org.apache.lucene.search.highlight.QueryScorer} suitable for this Query and field.
* @param query The current query
@ -221,15 +221,14 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
* @param request The SolrQueryRequest
*/
protected Scorer getQueryScorer(Query query, String fieldName, SolrQueryRequest request) {
boolean reqFieldMatch = request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false);
if (reqFieldMatch) {
return new QueryTermScorer(query, request.getSearcher().getIndexReader(), fieldName);
}
else {
return new QueryTermScorer(query);
}
boolean reqFieldMatch = request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false);
if (reqFieldMatch) {
return new QueryTermScorer(query, request.getSearcher().getIndexReader(), fieldName);
} else {
return new QueryTermScorer(query);
}
}
/**
* Return the max number of snippets for this field. If this has not
* been configured for this field, fall back to the configured default
@ -238,7 +237,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
* @param params The params controlling Highlighting
*/
protected int getMaxSnippets(String fieldName, SolrParams params) {
return params.getFieldInt(fieldName, HighlightParams.SNIPPETS,1);
return params.getFieldInt(fieldName, HighlightParams.SNIPPETS,1);
}
/**
@ -249,17 +248,17 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
protected boolean isMergeContiguousFragments(String fieldName, SolrParams params){
return params.getFieldBool(fieldName, HighlightParams.MERGE_CONTIGUOUS_FRAGMENTS, false);
}
/**
* Return a {@link org.apache.lucene.search.highlight.Formatter} appropriate for this field. If a formatter
* has not been configured for this field, fall back to the configured
* default or the solr default ({@link org.apache.lucene.search.highlight.SimpleHTMLFormatter}).
*
*
* @param fieldName The name of the field
* @param params The params controlling Highlighting
* @return An appropriate {@link org.apache.lucene.search.highlight.Formatter}.
*/
protected Formatter getFormatter(String fieldName, SolrParams params )
protected Formatter getFormatter(String fieldName, SolrParams params )
{
String str = params.getFieldParam( fieldName, HighlightParams.FORMATTER );
SolrFormatter formatter = formatters.get(str);
@ -273,7 +272,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
* Return an {@link org.apache.lucene.search.highlight.Encoder} appropriate for this field. If an encoder
* has not been configured for this field, fall back to the configured
* default or the solr default ({@link org.apache.lucene.search.highlight.DefaultEncoder}).
*
*
* @param fieldName The name of the field
* @param params The params controlling Highlighting
* @return An appropriate {@link org.apache.lucene.search.highlight.Encoder}.
@ -286,17 +285,17 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
}
return encoder.getEncoder(fieldName, params);
}
/**
* Return a {@link org.apache.lucene.search.highlight.Fragmenter} appropriate for this field. If a fragmenter
* has not been configured for this field, fall back to the configured
* default or the solr default ({@link GapFragmenter}).
*
*
* @param fieldName The name of the field
* @param params The params controlling Highlighting
* @return An appropriate {@link org.apache.lucene.search.highlight.Fragmenter}.
*/
protected Fragmenter getFragmenter(String fieldName, SolrParams params)
protected Fragmenter getFragmenter(String fieldName, SolrParams params)
{
String fmt = params.getFieldParam( fieldName, HighlightParams.FRAGMENTER );
SolrFragmenter frag = fragmenters.get( fmt );
@ -305,7 +304,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
}
return frag.getFragmenter(fieldName, params);
}
protected FragListBuilder getFragListBuilder( String fieldName, SolrParams params ){
String flb = params.getFieldParam( fieldName, HighlightParams.FRAG_LIST_BUILDER );
SolrFragListBuilder solrFlb = fragListBuilders.get(flb);
@ -314,7 +313,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
}
return solrFlb.getFragListBuilder(params);
}
protected FragmentsBuilder getFragmentsBuilder( String fieldName, SolrParams params ){
BoundaryScanner bs = getBoundaryScanner(fieldName, params);
return getSolrFragmentsBuilder( fieldName, params ).getFragmentsBuilder(params, bs);
@ -322,7 +321,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
protected SolrFragmentsBuilder getSolrFragmentsBuilder( String fieldName, SolrParams params ){
String fb = params.getFieldParam( fieldName, HighlightParams.FRAGMENTS_BUILDER );
SolrFragmentsBuilder solrFb = fragmentsBuilders.get(fb);
SolrFragmentsBuilder solrFb = fragmentsBuilders.get( fb );
if( solrFb == null ){
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmentsBuilder: " + fb );
}
@ -337,7 +336,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
}
return solrBs.getBoundaryScanner(fieldName, params);
}
/**
* Generates a list of Highlighted query fragments for each item in a list
* of documents, or returns null if highlighting is disabled.
@ -353,23 +352,23 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
@Override
@SuppressWarnings("unchecked")
public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException {
SolrParams params = req.getParams();
SolrParams params = req.getParams();
if (!isHighlightingEnabled(params))
return null;
return null;
SolrIndexSearcher searcher = req.getSearcher();
IndexSchema schema = searcher.getSchema();
NamedList fragments = new SimpleOrderedMap();
String[] fieldNames = getHighlightFields(query, req, defaultFields);
Set<String> fset = new HashSet<>();
{
// pre-fetch documents using the Searcher's doc cache
for(String f : fieldNames) { fset.add(f); }
// fetch unique key if one exists.
SchemaField keyField = schema.getUniqueKeyField();
if(null != keyField)
fset.add(keyField.getName());
fset.add(keyField.getName());
}
// get FastVectorHighlighter instance out of the processing loop
@ -399,7 +398,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
}
return fragments;
}
/*
* If fieldName is undefined, this method returns false, then
* doHighlightingByHighlighter() will do nothing for the field.
@ -438,7 +437,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
@SuppressWarnings("unchecked")
protected void doHighlightingByHighlighter( Query query, SolrQueryRequest req, NamedList docSummaries,
int docId, StoredDocument doc, String fieldName ) throws IOException {
int docId, StoredDocument doc, String fieldName ) throws IOException {
final SolrIndexSearcher searcher = req.getSearcher();
final IndexSchema schema = searcher.getSchema();
final SolrParams params = req.getParams();
@ -538,6 +537,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
}
highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
maxCharsToAnalyze -= thisText.length();
// Highlight!
try {
@ -594,6 +594,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
String value = thisField.stringValue();
result.add(value);
maxCharsToAnalyze -= value.length();//we exit early if we'll never get to analyze the value
maxValues--;
if (maxValues <= 0 || maxCharsToAnalyze <= 0) {
break;
@ -653,7 +654,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
}
}
}
protected TokenStream createAnalyzerTStream(IndexSchema schema, String fieldName, String docText) throws IOException {
return new TokenOrderingFilter(schema.getIndexAnalyzer().tokenStream(fieldName, docText), 10);
}
@ -669,7 +670,7 @@ final class TokenOrderingFilter extends TokenFilter {
private final LinkedList<OrderedToken> queue = new LinkedList<>(); //TODO replace with Deque, Array impl
private boolean done=false;
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
protected TokenOrderingFilter(TokenStream input, int windowSize) {
super(input);
this.windowSize = windowSize;

View File

@ -86,8 +86,8 @@ public class HighlighterTest extends SolrTestCaseJ4 {
SolrFragmenter regex = highlighter.fragmenters.get( "regex" );
SolrFragmenter frag = highlighter.fragmenters.get( null );
assertSame( gap, frag );
assertTrue( gap instanceof GapFragmenter );
assertTrue( regex instanceof RegexFragmenter );
assertTrue(gap instanceof GapFragmenter);
assertTrue(regex instanceof RegexFragmenter);
}
@Test
@ -128,13 +128,13 @@ public class HighlighterTest extends SolrTestCaseJ4 {
sumLRF = h.getRequestFactory(
"standard", 0, 200, args);
assertQ("Merge Contiguous",
sumLRF.makeRequest("t_text:long"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[@name='t_text']/str[.='this is some <em>long</em> text. It has']",
"//lst[@name='1']/arr[@name='t_text']/str[.=' the word <em>long</em> in many places. In fact, it has']",
"//lst[@name='1']/arr[@name='t_text']/str[.=' <em>long</em> on some different fragments. Let us']",
"//lst[@name='1']/arr[@name='t_text']/str[.=' see what happens to <em>long</em> in this case.']"
);
sumLRF.makeRequest("t_text:long"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[@name='t_text']/str[.='this is some <em>long</em> text. It has']",
"//lst[@name='1']/arr[@name='t_text']/str[.=' the word <em>long</em> in many places. In fact, it has']",
"//lst[@name='1']/arr[@name='t_text']/str[.=' <em>long</em> on some different fragments. Let us']",
"//lst[@name='1']/arr[@name='t_text']/str[.=' see what happens to <em>long</em> in this case.']"
);
}
@Test
@ -269,10 +269,10 @@ public class HighlighterTest extends SolrTestCaseJ4 {
assertU(commit());
assertU(optimize());
assertQ("Basic summarization",
sumLRF.makeRequest("long"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[@name='tv_text']/str"
);
sumLRF.makeRequest("long"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[@name='tv_text']/str"
);
// try the same thing without a q param
assertQ("Should not explode...", // q.alt should return everything
@ -292,9 +292,9 @@ public class HighlighterTest extends SolrTestCaseJ4 {
TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory(
"standard", 0, 200, args);
assertU(adoc("textgap", "first entry hasnt queryword",
"textgap", "second entry has queryword long",
"id", "1"));
assertU(adoc("textgap", "first entry hasnt queryword",
"textgap", "second entry has queryword long",
"id", "1"));
assertU(commit());
assertU(optimize());
assertQ("Basic summarization",
@ -357,10 +357,10 @@ public class HighlighterTest extends SolrTestCaseJ4 {
assertU(commit());
assertU(optimize());
assertQ("Basic summarization",
sumLRF.makeRequest("long"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[@name='t_text']/str"
);
sumLRF.makeRequest("long"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[@name='t_text']/str"
);
}
@ -432,15 +432,15 @@ public class HighlighterTest extends SolrTestCaseJ4 {
sumLRF = h.getRequestFactory(
"standard", 0, 200, args);
assertQ("Test RequireFieldMatch",
sumLRF.makeRequest("t_text1:random OR t_text2:words"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[@name='t_text1']/str[.='<em>random</em> words for highlighting tests']",
"//lst[@name='1']/arr[@name='t_text2']/str[.='more random <em>words</em> for second field']"
);
sumLRF.makeRequest("t_text1:random OR t_text2:words"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[@name='t_text1']/str[.='<em>random</em> words for highlighting tests']",
"//lst[@name='1']/arr[@name='t_text2']/str[.='more random <em>words</em> for second field']"
);
// test case for un-optimized index
assertU(adoc("t_text1", "random words for highlighting tests", "id", "2",
"t_text2", "more random words for second field"));
"t_text2", "more random words for second field"));
assertU(delI("1"));
assertU(commit());
sumLRF = h.getRequestFactory(
@ -461,7 +461,7 @@ public class HighlighterTest extends SolrTestCaseJ4 {
args.put("hl", "true");
args.put("hl.fl", "t_text");
args.put("hl.simple.pre","<B>");
args.put("hl.simple.post","</B>");
args.put("hl.simple.post", "</B>");
TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory(
"standard", 0, 200, args);
@ -469,14 +469,14 @@ public class HighlighterTest extends SolrTestCaseJ4 {
assertU(commit());
assertU(optimize());
assertQ("Basic summarization",
sumLRF.makeRequest("t_text:long"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[@name='t_text']/str[.='a <B>long</B> days night']"
);
sumLRF.makeRequest("t_text:long"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[@name='t_text']/str[.='a <B>long</B> days night']"
);
// test a per-field override
args.put("f.t_text.hl.simple.pre","<I>");
args.put("f.t_text.hl.simple.post","</I>");
args.put("f.t_text.hl.simple.pre", "<I>");
args.put("f.t_text.hl.simple.post", "</I>");
sumLRF = h.getRequestFactory(
"standard", 0, 200, args);
assertQ("Basic summarization",
@ -515,35 +515,59 @@ public class HighlighterTest extends SolrTestCaseJ4 {
args.put("fl", "id score");
args.put("hl", "true");
args.put("hl.snippets", "10");
args.put("hl.fl", "t_text");
final String field = random().nextBoolean() ? "t_text" : "tv_text";
args.put("hl.fl", field);
TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory(
"standard", 0, 200, args);
assertU(adoc("t_text", LONG_TEXT, "id", "1"));
assertU(adoc(field, LONG_TEXT, "id", "1"));
assertU(commit());
assertU(optimize());
assertQ("token at start of text",
sumLRF.makeRequest("t_text:disjoint"),
sumLRF.makeRequest(field + ":disjoint"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[count(str)=1]"
);
args.put("hl.maxAnalyzedChars", "20");
sumLRF = h.getRequestFactory("standard", 0, 200, args);
assertQ("token at end of text",
sumLRF.makeRequest("t_text:disjoint"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1'][not(*)]"
);
sumLRF.makeRequest(field + ":disjoint"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1'][not(*)]"
);
args.put("hl.maxAnalyzedChars", "-1");
sumLRF = h.getRequestFactory("standard", 0, 200, args);
assertQ("token at start of text",
sumLRF.makeRequest("t_text:disjoint"),
sumLRF.makeRequest(field + ":disjoint"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[count(str)=1]"
);
}
// Test multi-valued together with hl.maxAnalyzedChars
@Test
public void testMultiValuedMaxAnalyzedChars() throws Exception {
String shortText = "some short blah blah blah blah";
final String field = random().nextBoolean() ? "tv_mv_text" : "textgap"; // term vecs or not
assertU(adoc(field, shortText,
field, LONG_TEXT,
"id", "1"));
assertU(commit());
assertQ(req("q", field + ":(short OR long)",
"indent", "on",
"hl", "true",
"hl.fl", field,
"hl.snippets", "2",
"hl.maxAnalyzedChars", "8"),
"//lst[@name='highlighting']/lst[@name='1']/arr[count(*)=1]",
"//lst[@name='1']/arr/str[1][.='some <em>short</em>']"
//"//lst[@name='1']/arr/str[2][.='a <em>long</em> days']"
);
}
@Test
public void testRegexFragmenter() {
HashMap<String,String> args = new HashMap<>();