SOLR-4683: add BreakIterator config to PostingsSolrHighlighter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1465249 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-04-06 14:17:40 +00:00
parent 57bf443834
commit 609c2f2065
4 changed files with 78 additions and 2 deletions

View File

@ -20,6 +20,7 @@ package org.apache.solr.highlight;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.Collections;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
@ -29,6 +30,7 @@ import org.apache.lucene.search.postingshighlight.Passage;
import org.apache.lucene.search.postingshighlight.PassageFormatter;
import org.apache.lucene.search.postingshighlight.PassageScorer;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.search.postingshighlight.WholeBreakIterator;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
@ -58,6 +60,10 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
* <float name="hl.score.k1">1.2</float>
* <float name="hl.score.b">0.75</float>
* <float name="hl.score.pivot">87</float>
* <str name="hl.bs.language"></str>
* <str name="hl.bs.country"></str>
* <str name="hl.bs.variant"></str>
* <str name="hl.bs.type">SENTENCE</str>
* <int name="hl.maxAnalyzedChars">10000</int>
* </lst>
* </requestHandler>
@ -74,7 +80,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
* <li>fields to highlight must be configured with storeOffsetsWithPositions="true"
* <li>hl.q (string) can specify the query
* <li>hl.fl (string) specifies the field list.
* <li>hl.snippets (int) specifies how many underlying sentence fragments form the resulting snippet.
* <li>hl.snippets (int) specifies how many underlying passages form the resulting snippet.
* <li>hl.tag.pre (string) specifies text which appears before a highlighted term.
* <li>hl.tag.post (string) specifies text which appears after a highlighted term.
* <li>hl.tag.ellipsis (string) specifies text which joins non-adjacent passages.
@ -82,6 +88,10 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
* <li>hl.score.k1 (float) specifies bm25 scoring parameter 'k1'
* <li>hl.score.b (float) specifies bm25 scoring parameter 'b'
* <li>hl.score.pivot (float) specifies bm25 scoring parameter 'avgdl'
* <li>hl.bs.type (string) specifies how to divide text into passages: [SENTENCE, LINE, WORD, CHAR, WHOLE]
* <li>hl.bs.language (string) specifies language code for BreakIterator. default is empty string (root locale)
* <li>hl.bs.country (string) specifies country code for BreakIterator. default is empty string (root locale)
* <li>hl.bs.variant (string) specifies country code for BreakIterator. default is empty string (root locale)
* <li>hl.maxAnalyzedChars specifies how many characters at most will be processed in a document.
* NOTE: currently hl.maxAnalyzedChars cannot yet be specified per-field
* </ul>
@ -143,6 +153,16 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
float pivot = params.getFieldFloat(fieldName, HighlightParams.SCORE_PIVOT, 87f);
return new PassageScorer(k1, b, pivot);
}
@Override
protected BreakIterator getBreakIterator(String field) {
String language = params.getFieldParam(field, HighlightParams.BS_LANGUAGE);
String country = params.getFieldParam(field, HighlightParams.BS_COUNTRY);
String variant = params.getFieldParam(field, HighlightParams.BS_VARIANT);
Locale locale = parseLocale(language, country, variant);
String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
return parseBreakIterator(type, locale);
}
};
Map<String,String[]> snippets = highlighter.highlightFields(fieldNames, query, searcher, docIDs, maxPassages);
@ -212,4 +232,36 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
return new String[docIDs.length];
}
}
/** parse a break iterator type for the specified locale */
protected BreakIterator parseBreakIterator(String type, Locale locale) {
if (type == null || "SENTENCE".equals(type)) {
return BreakIterator.getSentenceInstance(locale);
} else if ("LINE".equals(type)) {
return BreakIterator.getLineInstance(locale);
} else if ("WORD".equals(type)) {
return BreakIterator.getWordInstance(locale);
} else if ("CHARACTER".equals(type)) {
return BreakIterator.getCharacterInstance(locale);
} else if ("WHOLE".equals(type)) {
return new WholeBreakIterator();
} else {
throw new IllegalArgumentException("Unknown " + HighlightParams.BS_TYPE + ": " + type);
}
}
/** parse a locale from a language+country+variant spec */
protected Locale parseLocale(String language, String country, String variant) {
if (language == null && country == null && variant == null) {
return Locale.ROOT;
} else if (language != null && country == null && variant != null) {
throw new IllegalArgumentException("To specify variant, country is required");
} else if (language != null && country != null && variant != null) {
return new Locale(language, country, variant);
} else if (language != null && country != null) {
return new Locale(language, country);
} else {
return new Locale(language);
}
}
}

View File

@ -26,6 +26,7 @@
<fieldtype name="text" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
@ -33,6 +34,7 @@
<fieldtype name="text_offsets" class="solr.TextField" storeOffsetsWithPositions="true">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
</types>

View File

@ -42,7 +42,12 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 {
assertTrue(schema.getField("text").storeOffsetsWithPositions());
assertTrue(schema.getField("text3").storeOffsetsWithPositions());
assertFalse(schema.getField("text2").storeOffsetsWithPositions());
}
@Override
public void setUp() throws Exception {
super.setUp();
clearIndex();
assertU(adoc("text", "document one", "text2", "document one", "text3", "crappy document", "id", "101"));
assertU(adoc("text", "second document", "text2", "second document", "text3", "crappier document", "id", "102"));
assertU(commit());
@ -126,4 +131,20 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 {
"//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='second <em>document</em>'",
"//lst[@name='highlighting']/lst[@name='102']/arr[@name='text3']/str='crappier [document]'");
}
public void testBreakIterator() {
assertQ("different breakiterator",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WORD"),
"count(//lst[@name='highlighting']/*)=2",
"//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/str='<em>document</em>'",
"//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='<em>document</em>'");
}
public void testBreakIterator2() {
assertU(adoc("text", "Document one has a first sentence. Document two has a second sentence.", "id", "103"));
assertU(commit());
assertQ("different breakiterator",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WHOLE"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
}
}

View File

@ -41,6 +41,7 @@ public interface HighlightParams {
public static final String BS_TYPE = HIGHLIGHT+".bs.type";
public static final String BS_LANGUAGE = HIGHLIGHT+".bs.language";
public static final String BS_COUNTRY = HIGHLIGHT+".bs.country";
public static final String BS_VARIANT = HIGHLIGHT+".bs.variant";
public static final String FIELD_MATCH = HIGHLIGHT+".requireFieldMatch";
public static final String DEFAULT_SUMMARY = HIGHLIGHT + ".defaultSummary";
public static final String ALTERNATE_FIELD = HIGHLIGHT+".alternateField";