mirror of https://github.com/apache/lucene.git
SOLR-4684: add encoder config to PostingsSolrHighlighter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1465258 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3351c61484
commit
b1b1247a65
|
@ -28,12 +28,13 @@ public class PassageFormatter {
|
||||||
private final String preTag;
|
private final String preTag;
|
||||||
private final String postTag;
|
private final String postTag;
|
||||||
private final String ellipsis;
|
private final String ellipsis;
|
||||||
|
private final boolean escape;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new PassageFormatter with the default tags.
|
* Creates a new PassageFormatter with the default tags.
|
||||||
*/
|
*/
|
||||||
public PassageFormatter() {
|
public PassageFormatter() {
|
||||||
this("<b>", "</b>", "... ");
|
this("<b>", "</b>", "... ", false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -41,14 +42,16 @@ public class PassageFormatter {
|
||||||
* @param preTag text which should appear before a highlighted term.
|
* @param preTag text which should appear before a highlighted term.
|
||||||
* @param postTag text which should appear after a highlighted term.
|
* @param postTag text which should appear after a highlighted term.
|
||||||
* @param ellipsis text which should be used to connect two unconnected passages.
|
* @param ellipsis text which should be used to connect two unconnected passages.
|
||||||
|
* @param escape true if text should be html-escaped
|
||||||
*/
|
*/
|
||||||
public PassageFormatter(String preTag, String postTag, String ellipsis) {
|
public PassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) {
|
||||||
if (preTag == null || postTag == null || ellipsis == null) {
|
if (preTag == null || postTag == null || ellipsis == null) {
|
||||||
throw new NullPointerException();
|
throw new NullPointerException();
|
||||||
}
|
}
|
||||||
this.preTag = preTag;
|
this.preTag = preTag;
|
||||||
this.postTag = postTag;
|
this.postTag = postTag;
|
||||||
this.ellipsis = ellipsis;
|
this.ellipsis = ellipsis;
|
||||||
|
this.escape = escape;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -74,19 +77,60 @@ public class PassageFormatter {
|
||||||
int end = passage.matchEnds[i];
|
int end = passage.matchEnds[i];
|
||||||
// its possible to have overlapping terms
|
// its possible to have overlapping terms
|
||||||
if (start > pos) {
|
if (start > pos) {
|
||||||
sb.append(content.substring(pos, start));
|
append(sb, content, pos, start);
|
||||||
}
|
}
|
||||||
if (end > pos) {
|
if (end > pos) {
|
||||||
sb.append(preTag);
|
sb.append(preTag);
|
||||||
sb.append(content.substring(Math.max(pos, start), end));
|
append(sb, content, Math.max(pos, start), end);
|
||||||
sb.append(postTag);
|
sb.append(postTag);
|
||||||
pos = end;
|
pos = end;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// its possible a "term" from the analyzer could span a sentence boundary.
|
// its possible a "term" from the analyzer could span a sentence boundary.
|
||||||
sb.append(content.substring(pos, Math.max(pos, passage.endOffset)));
|
append(sb, content, pos, Math.max(pos, passage.endOffset));
|
||||||
pos = passage.endOffset;
|
pos = passage.endOffset;
|
||||||
}
|
}
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void append(StringBuilder dest, String content, int start, int end) {
|
||||||
|
if (escape) {
|
||||||
|
// note: these are the rules from owasp.org
|
||||||
|
for (int i = start; i < end; i++) {
|
||||||
|
char ch = content.charAt(i);
|
||||||
|
switch(ch) {
|
||||||
|
case '&':
|
||||||
|
dest.append("&amp;");
|
||||||
|
break;
|
||||||
|
case '<':
|
||||||
|
dest.append("&lt;");
|
||||||
|
break;
|
||||||
|
case '>':
|
||||||
|
dest.append("&gt;");
|
||||||
|
break;
|
||||||
|
case '"':
|
||||||
|
dest.append("&quot;");
|
||||||
|
break;
|
||||||
|
case '\'':
|
||||||
|
dest.append("&#x27;");
|
||||||
|
break;
|
||||||
|
case '/':
|
||||||
|
dest.append("&#x2F;");
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
if (ch >= 0x30 && ch <= 0x39 || ch >= 0x41 && ch <= 0x5A || ch >= 0x61 && ch <= 0x7A) {
|
||||||
|
dest.append(ch);
|
||||||
|
} else if (ch < 0xff) {
|
||||||
|
dest.append("&#");
|
||||||
|
dest.append((int)ch);
|
||||||
|
dest.append(";");
|
||||||
|
} else {
|
||||||
|
dest.append(ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
dest.append(content, start, end);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -849,4 +849,40 @@ public class TestPostingsHighlighter extends LuceneTestCase {
|
||||||
ir.close();
|
ir.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testEncode() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
|
||||||
|
iwc.setMergePolicy(newLogMergePolicy());
|
||||||
|
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||||
|
|
||||||
|
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||||
|
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||||
|
Field body = new Field("body", "", offsetsType);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(body);
|
||||||
|
|
||||||
|
body.setStringValue("This is a test. Just a test highlighting from <i>postings</i>. Feel free to ignore.");
|
||||||
|
iw.addDocument(doc);
|
||||||
|
|
||||||
|
IndexReader ir = iw.getReader();
|
||||||
|
iw.close();
|
||||||
|
|
||||||
|
IndexSearcher searcher = newSearcher(ir);
|
||||||
|
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||||
|
@Override
|
||||||
|
protected PassageFormatter getFormatter(String field) {
|
||||||
|
return new PassageFormatter("<b>", "</b>", "... ", true);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Query query = new TermQuery(new Term("body", "highlighting"));
|
||||||
|
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
|
||||||
|
assertEquals(1, topDocs.totalHits);
|
||||||
|
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||||
|
assertEquals(1, snippets.length);
|
||||||
|
assertEquals("Just&#32;a&#32;test&#32;<b>highlighting</b>&#32;from&#32;&lt;i&gt;postings&lt;&#x2F;i&gt;&#46;&#32;", snippets[0]);
|
||||||
|
|
||||||
|
ir.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -57,6 +57,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
|
||||||
* <str name="hl.tag.post">&lt;/em&gt;</str>
|
* <str name="hl.tag.post">&lt;/em&gt;</str>
|
||||||
* <str name="hl.tag.ellipsis">... </str>
|
* <str name="hl.tag.ellipsis">... </str>
|
||||||
* <bool name="hl.defaultSummary">true</bool>
|
* <bool name="hl.defaultSummary">true</bool>
|
||||||
|
* <str name="hl.encoder">simple</str>
|
||||||
* <float name="hl.score.k1">1.2</float>
|
* <float name="hl.score.k1">1.2</float>
|
||||||
* <float name="hl.score.b">0.75</float>
|
* <float name="hl.score.b">0.75</float>
|
||||||
* <float name="hl.score.pivot">87</float>
|
* <float name="hl.score.pivot">87</float>
|
||||||
|
@ -85,6 +86,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
|
||||||
* <li>hl.tag.post (string) specifies text which appears after a highlighted term.
|
* <li>hl.tag.post (string) specifies text which appears after a highlighted term.
|
||||||
* <li>hl.tag.ellipsis (string) specifies text which joins non-adjacent passages.
|
* <li>hl.tag.ellipsis (string) specifies text which joins non-adjacent passages.
|
||||||
* <li>hl.defaultSummary (bool) specifies if a field should have a default summary.
|
* <li>hl.defaultSummary (bool) specifies if a field should have a default summary.
|
||||||
|
* <li>hl.encoder (string) can be 'html' (html escapes content) or 'simple' (no escaping).
|
||||||
* <li>hl.score.k1 (float) specifies bm25 scoring parameter 'k1'
|
* <li>hl.score.k1 (float) specifies bm25 scoring parameter 'k1'
|
||||||
* <li>hl.score.b (float) specifies bm25 scoring parameter 'b'
|
* <li>hl.score.b (float) specifies bm25 scoring parameter 'b'
|
||||||
* <li>hl.score.pivot (float) specifies bm25 scoring parameter 'avgdl'
|
* <li>hl.score.pivot (float) specifies bm25 scoring parameter 'avgdl'
|
||||||
|
@ -143,7 +145,8 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
|
||||||
String preTag = params.getFieldParam(fieldName, HighlightParams.TAG_PRE, "<em>");
|
String preTag = params.getFieldParam(fieldName, HighlightParams.TAG_PRE, "<em>");
|
||||||
String postTag = params.getFieldParam(fieldName, HighlightParams.TAG_POST, "</em>");
|
String postTag = params.getFieldParam(fieldName, HighlightParams.TAG_POST, "</em>");
|
||||||
String ellipsis = params.getFieldParam(fieldName, HighlightParams.TAG_ELLIPSIS, "... ");
|
String ellipsis = params.getFieldParam(fieldName, HighlightParams.TAG_ELLIPSIS, "... ");
|
||||||
return new PassageFormatter(preTag, postTag, ellipsis);
|
String encoder = params.getFieldParam(fieldName, HighlightParams.ENCODER, "simple");
|
||||||
|
return new PassageFormatter(preTag, postTag, ellipsis, "html".equals(encoder));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -147,4 +147,12 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 {
|
||||||
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WHOLE"),
|
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WHOLE"),
|
||||||
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
|
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testEncoder() {
|
||||||
|
assertU(adoc("text", "Document one has a first <i>sentence</i>.", "id", "103"));
|
||||||
|
assertU(commit());
|
||||||
|
assertQ("html escaped",
|
||||||
|
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.encoder", "html"),
|
||||||
|
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em>&#32;one&#32;has&#32;a&#32;first&#32;&lt;i&gt;sentence&lt;&#x2F;i&gt;&#46;'");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue