mirror of https://github.com/apache/lucene.git
SOLR-4683: add BreakIterator config to PostingsSolrHighlighter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1465249 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
57bf443834
commit
609c2f2065
|
@ -20,6 +20,7 @@ package org.apache.solr.highlight;
|
|||
import java.io.IOException;
|
||||
import java.text.BreakIterator;
|
||||
import java.util.Collections;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -29,6 +30,7 @@ import org.apache.lucene.search.postingshighlight.Passage;
|
|||
import org.apache.lucene.search.postingshighlight.PassageFormatter;
|
||||
import org.apache.lucene.search.postingshighlight.PassageScorer;
|
||||
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
|
||||
import org.apache.lucene.search.postingshighlight.WholeBreakIterator;
|
||||
import org.apache.solr.common.params.HighlightParams;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
|
@ -58,6 +60,10 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
|
|||
* <float name="hl.score.k1">1.2</float>
|
||||
* <float name="hl.score.b">0.75</float>
|
||||
* <float name="hl.score.pivot">87</float>
|
||||
* <str name="hl.bs.language"></str>
|
||||
* <str name="hl.bs.country"></str>
|
||||
* <str name="hl.bs.variant"></str>
|
||||
* <str name="hl.bs.type">SENTENCE</str>
|
||||
* <int name="hl.maxAnalyzedChars">10000</int>
|
||||
* </lst>
|
||||
* </requestHandler>
|
||||
|
@ -74,7 +80,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
|
|||
* <li>fields to highlight must be configured with storeOffsetsWithPositions="true"
|
||||
* <li>hl.q (string) can specify the query
|
||||
* <li>hl.fl (string) specifies the field list.
|
||||
* <li>hl.snippets (int) specifies how many underlying sentence fragments form the resulting snippet.
|
||||
* <li>hl.snippets (int) specifies how many underlying passages form the resulting snippet.
|
||||
* <li>hl.tag.pre (string) specifies text which appears before a highlighted term.
|
||||
* <li>hl.tag.post (string) specifies text which appears after a highlighted term.
|
||||
* <li>hl.tag.ellipsis (string) specifies text which joins non-adjacent passages.
|
||||
|
@ -82,6 +88,10 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
|
|||
* <li>hl.score.k1 (float) specifies bm25 scoring parameter 'k1'
|
||||
* <li>hl.score.b (float) specifies bm25 scoring parameter 'b'
|
||||
* <li>hl.score.pivot (float) specifies bm25 scoring parameter 'avgdl'
|
||||
* <li>hl.bs.type (string) specifies how to divide text into passages: [SENTENCE, LINE, WORD, CHARACTER, WHOLE]
|
||||
* <li>hl.bs.language (string) specifies language code for BreakIterator. default is empty string (root locale)
|
||||
* <li>hl.bs.country (string) specifies country code for BreakIterator. default is empty string (root locale)
|
||||
* <li>hl.bs.variant (string) specifies variant code for BreakIterator. default is empty string (root locale)
|
||||
* <li>hl.maxAnalyzedChars specifies how many characters at most will be processed in a document.
|
||||
* NOTE: currently hl.maxAnalyzedChars cannot yet be specified per-field
|
||||
* </ul>
|
||||
|
@ -143,6 +153,16 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
|
|||
float pivot = params.getFieldFloat(fieldName, HighlightParams.SCORE_PIVOT, 87f);
|
||||
return new PassageScorer(k1, b, pivot);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BreakIterator getBreakIterator(String field) {
|
||||
String language = params.getFieldParam(field, HighlightParams.BS_LANGUAGE);
|
||||
String country = params.getFieldParam(field, HighlightParams.BS_COUNTRY);
|
||||
String variant = params.getFieldParam(field, HighlightParams.BS_VARIANT);
|
||||
Locale locale = parseLocale(language, country, variant);
|
||||
String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
|
||||
return parseBreakIterator(type, locale);
|
||||
}
|
||||
};
|
||||
|
||||
Map<String,String[]> snippets = highlighter.highlightFields(fieldNames, query, searcher, docIDs, maxPassages);
|
||||
|
@ -212,4 +232,36 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
|
|||
return new String[docIDs.length];
|
||||
}
|
||||
}
|
||||
|
||||
/** parse a break iterator type for the specified locale */
|
||||
protected BreakIterator parseBreakIterator(String type, Locale locale) {
|
||||
if (type == null || "SENTENCE".equals(type)) {
|
||||
return BreakIterator.getSentenceInstance(locale);
|
||||
} else if ("LINE".equals(type)) {
|
||||
return BreakIterator.getLineInstance(locale);
|
||||
} else if ("WORD".equals(type)) {
|
||||
return BreakIterator.getWordInstance(locale);
|
||||
} else if ("CHARACTER".equals(type)) {
|
||||
return BreakIterator.getCharacterInstance(locale);
|
||||
} else if ("WHOLE".equals(type)) {
|
||||
return new WholeBreakIterator();
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unknown " + HighlightParams.BS_TYPE + ": " + type);
|
||||
}
|
||||
}
|
||||
|
||||
/** parse a locale from a language+country+variant spec */
|
||||
protected Locale parseLocale(String language, String country, String variant) {
|
||||
if (language == null && country == null && variant == null) {
|
||||
return Locale.ROOT;
|
||||
} else if (language != null && country == null && variant != null) {
|
||||
throw new IllegalArgumentException("To specify variant, country is required");
|
||||
} else if (language != null && country != null && variant != null) {
|
||||
return new Locale(language, country, variant);
|
||||
} else if (language != null && country != null) {
|
||||
return new Locale(language, country);
|
||||
} else {
|
||||
return new Locale(language);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
<fieldtype name="text" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.MockTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
|
@ -33,6 +34,7 @@
|
|||
<fieldtype name="text_offsets" class="solr.TextField" storeOffsetsWithPositions="true">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.MockTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
</types>
|
||||
|
|
|
@ -42,7 +42,12 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 {
|
|||
assertTrue(schema.getField("text").storeOffsetsWithPositions());
|
||||
assertTrue(schema.getField("text3").storeOffsetsWithPositions());
|
||||
assertFalse(schema.getField("text2").storeOffsetsWithPositions());
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
clearIndex();
|
||||
assertU(adoc("text", "document one", "text2", "document one", "text3", "crappy document", "id", "101"));
|
||||
assertU(adoc("text", "second document", "text2", "second document", "text3", "crappier document", "id", "102"));
|
||||
assertU(commit());
|
||||
|
@ -126,4 +131,20 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 {
|
|||
"//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='second <em>document</em>'",
|
||||
"//lst[@name='highlighting']/lst[@name='102']/arr[@name='text3']/str='crappier [document]'");
|
||||
}
|
||||
|
||||
public void testBreakIterator() {
|
||||
assertQ("different breakiterator",
|
||||
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WORD"),
|
||||
"count(//lst[@name='highlighting']/*)=2",
|
||||
"//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/str='<em>document</em>'",
|
||||
"//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='<em>document</em>'");
|
||||
}
|
||||
|
||||
public void testBreakIterator2() {
|
||||
assertU(adoc("text", "Document one has a first sentence. Document two has a second sentence.", "id", "103"));
|
||||
assertU(commit());
|
||||
assertQ("different breakiterator",
|
||||
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WHOLE"),
|
||||
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -41,6 +41,7 @@ public interface HighlightParams {
|
|||
public static final String BS_TYPE = HIGHLIGHT+".bs.type";
|
||||
public static final String BS_LANGUAGE = HIGHLIGHT+".bs.language";
|
||||
public static final String BS_COUNTRY = HIGHLIGHT+".bs.country";
|
||||
public static final String BS_VARIANT = HIGHLIGHT+".bs.variant";
|
||||
public static final String FIELD_MATCH = HIGHLIGHT+".requireFieldMatch";
|
||||
public static final String DEFAULT_SUMMARY = HIGHLIGHT + ".defaultSummary";
|
||||
public static final String ALTERNATE_FIELD = HIGHLIGHT+".alternateField";
|
||||
|
|
Loading…
Reference in New Issue