SOLR-4675: Improve PostingsHighlighter integration

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1465228 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-04-06 11:48:47 +00:00
parent ef8fc3481f
commit 57bf443834
9 changed files with 134 additions and 119 deletions

View File

@ -125,6 +125,10 @@ New Features
prevent the "best" match from being found if it appears later in the MV list than the
cutoff specified by either of these params. (Erick Erickson)
* SOLR-4675: Improve PostingsSolrHighlighter to support per-field/query-time overrides
and add additional configuration parameters. See the javadocs for more details and
examples. (Robert Muir)
Bug Fixes
----------------------

View File

@ -69,6 +69,30 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
this.solrCore = solrCore;
}
// Thread safe registry
protected final Map<String,SolrFormatter> formatters =
new HashMap<String, SolrFormatter>();
// Thread safe registry
protected final Map<String,SolrEncoder> encoders =
new HashMap<String, SolrEncoder>();
// Thread safe registry
protected final Map<String,SolrFragmenter> fragmenters =
new HashMap<String, SolrFragmenter>() ;
// Thread safe registry
protected final Map<String, SolrFragListBuilder> fragListBuilders =
new HashMap<String, SolrFragListBuilder>() ;
// Thread safe registry
protected final Map<String, SolrFragmentsBuilder> fragmentsBuilders =
new HashMap<String, SolrFragmentsBuilder>() ;
// Thread safe registry
protected final Map<String, SolrBoundaryScanner> boundaryScanners =
new HashMap<String, SolrBoundaryScanner>() ;
@Override
public void init(PluginInfo info) {
formatters.clear();

View File

@ -19,9 +19,7 @@ package org.apache.solr.highlight;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Collections;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
@ -50,108 +48,57 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
* <p>
* Example configuration:
* <pre class="prettyprint">
* &lt;requestHandler name="standard" class="solr.StandardRequestHandler"&gt;
* &lt;lst name="defaults"&gt;
* &lt;int name="hl.snippets"&gt;1&lt;/int&gt;
* &lt;str name="hl.tag.pre"&gt;&amp;lt;em&amp;gt;&lt;/str&gt;
* &lt;str name="hl.tag.post"&gt;&amp;lt;/em&amp;gt;&lt;/str&gt;
* &lt;str name="hl.tag.ellipsis"&gt;... &lt;/str&gt;
* &lt;bool name="hl.defaultSummary"&gt;true&lt;/bool&gt;
* &lt;float name="hl.score.k1"&gt;1.2&lt;/float&gt;
* &lt;float name="hl.score.b"&gt;0.75&lt;/float&gt;
* &lt;float name="hl.score.pivot"&gt;87&lt;/float&gt;
* &lt;int name="hl.maxAnalyzedChars"&gt;10000&lt;/int&gt;
* &lt;/lst&gt;
* &lt;/requestHandler&gt;
* </pre>
* ...
* <pre class="prettyprint">
* &lt;searchComponent class="solr.HighlightComponent" name="highlight"&gt;
* &lt;highlighting class="org.apache.solr.highlight.PostingsSolrHighlighter"
* preTag="&amp;lt;em&amp;gt;"
* postTag="&amp;lt;/em&amp;gt;"
* ellipsis="... "
* k1="1.2"
* b="0.75"
* pivot="87"
* maxLength=10000
* summarizeEmpty=true/&gt;
* &lt;highlighting class="org.apache.solr.highlight.PostingsSolrHighlighter"/&gt;
* &lt;/searchComponent&gt;
* </pre>
* <p>
* Notes:
* <ul>
* <li>fields to highlight must be configured with storeOffsetsWithPositions="true"
* <li>hl.fl specifies the field list.
* <li>hl.snippets specifies how many underlying sentence fragments form the resulting snippet.
* <li>hl.q (string) can specify the query
* <li>hl.fl (string) specifies the field list.
* <li>hl.snippets (int) specifies how many underlying sentence fragments form the resulting snippet.
* <li>hl.tag.pre (string) specifies text which appears before a highlighted term.
* <li>hl.tag.post (string) specifies text which appears after a highlighted term.
* <li>hl.tag.ellipsis (string) specifies text which joins non-adjacent passages.
* <li>hl.defaultSummary (bool) specifies if a field should have a default summary.
* <li>hl.score.k1 (float) specifies bm25 scoring parameter 'k1'
* <li>hl.score.b (float) specifies bm25 scoring parameter 'b'
* <li>hl.score.pivot (float) specifies bm25 scoring parameter 'avgdl'
* <li>hl.maxAnalyzedChars specifies how many characters at most will be processed in a document.
* NOTE: currently hl.maxAnalyzedChars cannot yet be specified per-field
* </ul>
*
* @lucene.experimental
*/
public class PostingsSolrHighlighter extends SolrHighlighter implements PluginInfoInitialized {
protected PostingsHighlighter highlighter;
@Override
public void initalize(SolrConfig config) {}
@Override
public void init(PluginInfo info) {
Map<String,String> attributes = info.attributes;
// scorer parameters: k1/b/pivot
String k1 = attributes.get("k1");
if (k1 == null) {
k1 = "1.2";
}
String b = attributes.get("b");
if (b == null) {
b = "0.75";
}
String pivot = attributes.get("pivot");
if (pivot == null) {
pivot = "87";
}
final PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot));
// formatter parameters: preTag/postTag/ellipsis
String preTag = attributes.get("preTag");
if (preTag == null) {
preTag = "<em>";
}
String postTag = attributes.get("postTag");
if (postTag == null) {
postTag = "</em>";
}
String ellipsis = attributes.get("ellipsis");
if (ellipsis == null) {
ellipsis = "... ";
}
final PassageFormatter formatter = new PassageFormatter(preTag, postTag, ellipsis);
String summarizeEmpty = attributes.get("summarizeEmpty");
final boolean summarizeEmptyBoolean;
if (summarizeEmpty == null) {
summarizeEmptyBoolean = true;
} else {
summarizeEmptyBoolean = Boolean.parseBoolean(summarizeEmpty);
}
// maximum content size to process
int maxLength = PostingsHighlighter.DEFAULT_MAX_LENGTH;
if (attributes.containsKey("maxLength")) {
maxLength = Integer.parseInt(attributes.get("maxLength"));
}
highlighter = new PostingsHighlighter(maxLength) {
@Override
protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
if (summarizeEmptyBoolean) {
return super.getEmptyHighlight(fieldName, bi, maxPassages);
} else {
return new Passage[0];
}
}
@Override
protected PassageFormatter getFormatter(String fieldName) {
return formatter;
}
@Override
protected PassageScorer getScorer(String fieldName) {
return scorer;
}
};
}
public void init(PluginInfo info) {}
@Override
public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException {
SolrParams params = req.getParams();
final SolrParams params = req.getParams();
// if highlighting isnt enabled, then why call doHighlighting?
if (isHighlightingEnabled(params)) {
@ -162,11 +109,41 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
String[] keys = getUniqueKeys(searcher, docIDs);
// query-time parameters
int maxLength = params.getInt(HighlightParams.MAX_CHARS, PostingsHighlighter.DEFAULT_MAX_LENGTH);
String[] fieldNames = getHighlightFields(query, req, defaultFields);
// TODO: make this per-field
int numSnippets = params.getInt(HighlightParams.SNIPPETS, 1);
int maxPassages[] = new int[fieldNames.length];
Arrays.fill(maxPassages, numSnippets);
for (int i = 0; i < fieldNames.length; i++) {
maxPassages[i] = params.getFieldInt(fieldNames[i], HighlightParams.SNIPPETS, 1);
}
PostingsHighlighter highlighter = new PostingsHighlighter(maxLength) {
@Override
protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
boolean defaultSummary = params.getFieldBool(fieldName, HighlightParams.DEFAULT_SUMMARY, true);
if (defaultSummary) {
return super.getEmptyHighlight(fieldName, bi, maxPassages);
} else {
return new Passage[0];
}
}
@Override
protected PassageFormatter getFormatter(String fieldName) {
String preTag = params.getFieldParam(fieldName, HighlightParams.TAG_PRE, "<em>");
String postTag = params.getFieldParam(fieldName, HighlightParams.TAG_POST, "</em>");
String ellipsis = params.getFieldParam(fieldName, HighlightParams.TAG_ELLIPSIS, "... ");
return new PassageFormatter(preTag, postTag, ellipsis);
}
@Override
protected PassageScorer getScorer(String fieldName) {
float k1 = params.getFieldFloat(fieldName, HighlightParams.SCORE_K1, 1.2f);
float b = params.getFieldFloat(fieldName, HighlightParams.SCORE_B, 0.75f);
float pivot = params.getFieldFloat(fieldName, HighlightParams.SCORE_PIVOT, 87f);
return new PassageScorer(k1, b, pivot);
}
};
Map<String,String[]> snippets = highlighter.highlightFields(fieldNames, query, searcher, docIDs, maxPassages);
return encodeSnippets(keys, fieldNames, snippets);

View File

@ -19,9 +19,7 @@ package org.apache.solr.highlight;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -38,34 +36,9 @@ public abstract class SolrHighlighter
{
public static Logger log = LoggerFactory.getLogger(SolrHighlighter.class);
// Thread safe registry
protected final Map<String,SolrFormatter> formatters =
new HashMap<String, SolrFormatter>();
// Thread safe registry
protected final Map<String,SolrEncoder> encoders =
new HashMap<String, SolrEncoder>();
// Thread safe registry
protected final Map<String,SolrFragmenter> fragmenters =
new HashMap<String, SolrFragmenter>() ;
// Thread safe registry
protected final Map<String, SolrFragListBuilder> fragListBuilders =
new HashMap<String, SolrFragListBuilder>() ;
// Thread safe registry
protected final Map<String, SolrFragmentsBuilder> fragmentsBuilders =
new HashMap<String, SolrFragmentsBuilder>() ;
// Thread safe registry
protected final Map<String, SolrBoundaryScanner> boundaryScanners =
new HashMap<String, SolrBoundaryScanner>() ;
@Deprecated
public abstract void initalize( SolrConfig config );
/**
* Check whether Highlighting is enabled for this request.
* @param params The params controlling Highlighting

View File

@ -22,9 +22,12 @@
<luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
<dataDir>${solr.data.dir:}</dataDir>
<directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
<requestHandler name="standard" class="solr.StandardRequestHandler"></requestHandler>
<requestHandler name="standard" class="solr.StandardRequestHandler">
<lst name="defaults">
<bool name="hl.defaultSummary">false</bool>
</lst>
</requestHandler>
<searchComponent class="solr.HighlightComponent" name="highlight">
<highlighting class="org.apache.solr.highlight.PostingsSolrHighlighter" summarizeEmpty="false"/>
<highlighting class="org.apache.solr.highlight.PostingsSolrHighlighter"/>
</searchComponent>
</config>

View File

@ -34,7 +34,7 @@ public class FastVectorHighlighterTest extends SolrTestCaseJ4 {
@Test
public void testConfig(){
SolrHighlighter highlighter = HighlightComponent.getHighlighter(h.getCore());
DefaultSolrHighlighter highlighter = (DefaultSolrHighlighter) HighlightComponent.getHighlighter(h.getCore());
// Make sure we loaded one fragListBuilder
SolrFragListBuilder solrFlbNull = highlighter.fragListBuilders.get( null );

View File

@ -64,7 +64,7 @@ public class HighlighterTest extends SolrTestCaseJ4 {
@Test
public void testConfig()
{
SolrHighlighter highlighter = HighlightComponent.getHighlighter(h.getCore());
DefaultSolrHighlighter highlighter = (DefaultSolrHighlighter) HighlightComponent.getHighlighter(h.getCore());
// Make sure we loaded the one formatter
SolrFormatter fmt1 = highlighter.formatters.get( null );

View File

@ -71,6 +71,14 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 {
"count(//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/*)=0");
}
public void testDefaultSummary() {
assertQ("null snippet test",
req("q", "text:one OR *:*", "sort", "id asc", "hl", "true", "hl.defaultSummary", "true"),
"count(//lst[@name='highlighting']/*)=2",
"//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/str='document <em>one</em>'",
"//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='second document'");
}
public void testDifferentField() {
assertQ("highlighting text3",
req("q", "text3:document", "sort", "id asc", "hl", "true", "hl.fl", "text3"),
@ -100,4 +108,22 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 {
}
resetExceptionIgnores();
}
public void testTags() {
assertQ("different pre/post tags",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.tag.pre", "[", "hl.tag.post", "]"),
"count(//lst[@name='highlighting']/*)=2",
"//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/str='[document] one'",
"//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='second [document]'");
}
public void testTagsPerField() {
assertQ("highlighting text and text3",
req("q", "text:document text3:document", "sort", "id asc", "hl", "true", "hl.fl", "text,text3", "f.text3.hl.tag.pre", "[", "f.text3.hl.tag.post", "]"),
"count(//lst[@name='highlighting']/*)=2",
"//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/str='<em>document</em> one'",
"//lst[@name='highlighting']/lst[@name='101']/arr[@name='text3']/str='crappy [document]'",
"//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='second <em>document</em>'",
"//lst[@name='highlighting']/lst[@name='102']/arr[@name='text3']/str='crappier [document]'");
}
}

View File

@ -42,6 +42,7 @@ public interface HighlightParams {
public static final String BS_LANGUAGE = HIGHLIGHT+".bs.language";
public static final String BS_COUNTRY = HIGHLIGHT+".bs.country";
public static final String FIELD_MATCH = HIGHLIGHT+".requireFieldMatch";
public static final String DEFAULT_SUMMARY = HIGHLIGHT + ".defaultSummary";
public static final String ALTERNATE_FIELD = HIGHLIGHT+".alternateField";
public static final String ALTERNATE_FIELD_LENGTH = HIGHLIGHT+".maxAlternateFieldLength";
public static final String MAX_MULTIVALUED_TO_EXAMINE = HIGHLIGHT + ".maxMultiValuedToExamine";
@ -55,6 +56,7 @@ public interface HighlightParams {
public static final String USE_FVH = HIGHLIGHT + ".useFastVectorHighlighter";
public static final String TAG_PRE = HIGHLIGHT + ".tag.pre";
public static final String TAG_POST = HIGHLIGHT + ".tag.post";
public static final String TAG_ELLIPSIS = HIGHLIGHT + ".tag.ellipsis";
public static final String PHRASE_LIMIT = HIGHLIGHT + ".phraseLimit";
public static final String MULTI_VALUED_SEPARATOR = HIGHLIGHT + ".multiValuedSeparatorChar";
@ -68,4 +70,10 @@ public interface HighlightParams {
public static final String SLOP = HIGHLIGHT+"."+REGEX+".slop";
public static final String PATTERN = HIGHLIGHT+"."+REGEX+".pattern";
public static final String MAX_RE_CHARS = HIGHLIGHT+"."+REGEX+".maxAnalyzedChars";
// Scoring parameters
public static final String SCORE = "score";
public static final String SCORE_K1 = HIGHLIGHT +"."+SCORE+".k1";
public static final String SCORE_B = HIGHLIGHT +"."+SCORE+".b";
public static final String SCORE_PIVOT = HIGHLIGHT +"."+SCORE+".pivot";
}