SOLR-4684: add encoder config to PostingsSolrHighlighter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1465258 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-04-06 15:14:46 +00:00
parent 3351c61484
commit b1b1247a65
4 changed files with 97 additions and 6 deletions

View File

@ -28,12 +28,13 @@ public class PassageFormatter {
private final String preTag;
private final String postTag;
private final String ellipsis;
private final boolean escape;
/**
* Creates a new PassageFormatter with the default tags:
* {@code <b>} before and {@code </b>} after each highlighted term,
* {@code "... "} as the ellipsis, and html-escaping of the text disabled.
*/
public PassageFormatter() {
this("<b>", "</b>", "... ");
this("<b>", "</b>", "... ", false);
}
/**
@ -41,14 +42,16 @@ public class PassageFormatter {
* @param preTag text which should appear before a highlighted term.
* @param postTag text which should appear after a highlighted term.
* @param ellipsis text which should be used to connect two unconnected passages.
* @param escape true if text should be html-escaped
* @throws NullPointerException if {@code preTag}, {@code postTag} or {@code ellipsis} is null
*/
public PassageFormatter(String preTag, String postTag, String ellipsis) {
public PassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) {
// fail fast: none of the three marker strings may be null
if (preTag == null || postTag == null || ellipsis == null) {
throw new NullPointerException();
}
this.preTag = preTag;
this.postTag = postTag;
this.ellipsis = ellipsis;
this.escape = escape;
}
/**
@ -74,19 +77,60 @@ public class PassageFormatter {
int end = passage.matchEnds[i];
// it's possible to have overlapping terms
if (start > pos) {
sb.append(content.substring(pos, start));
append(sb, content, pos, start);
}
if (end > pos) {
sb.append(preTag);
sb.append(content.substring(Math.max(pos, start), end));
append(sb, content, Math.max(pos, start), end);
sb.append(postTag);
pos = end;
}
}
// it's possible a "term" from the analyzer could span a sentence boundary.
sb.append(content.substring(pos, Math.max(pos, passage.endOffset)));
append(sb, content, pos, Math.max(pos, passage.endOffset));
pos = passage.endOffset;
}
return sb.toString();
}
/**
 * Appends the characters {@code content[start, end)} to {@code dest}.
 * <p>
 * When this formatter was created with {@code escape == false} the range is
 * copied verbatim. Otherwise each character is html-escaped:
 * <ul>
 *   <li>{@code & < > " ' /} are replaced by
 *       {@code &amp; &lt; &gt; &quot; &#x27; &#x2F;} respectively;
 *   <li>ASCII letters and digits are copied unchanged;
 *   <li>every other character with a value up to and including 0xFF is
 *       written as a decimal numeric character reference ({@code &#NN;});
 *   <li>characters above 0xFF are copied unchanged.
 * </ul>
 *
 * @param dest    builder receiving the (possibly escaped) text
 * @param content text the range is taken from
 * @param start   index of the first character to append (inclusive)
 * @param end     index one past the last character to append (exclusive)
 */
private void append(StringBuilder dest, String content, int start, int end) {
  if (!escape) {
    dest.append(content, start, end);
    return;
  }
  // note: these are the rules from owasp.org
  for (int i = start; i < end; i++) {
    char ch = content.charAt(i);
    switch (ch) {
      case '&':
        dest.append("&amp;");
        break;
      case '<':
        dest.append("&lt;");
        break;
      case '>':
        dest.append("&gt;");
        break;
      case '"':
        dest.append("&quot;");
        break;
      case '\'':
        dest.append("&#x27;");
        break;
      case '/':
        dest.append("&#x2F;");
        break;
      default:
        if (ch >= '0' && ch <= '9' || ch >= 'A' && ch <= 'Z' || ch >= 'a' && ch <= 'z') {
          dest.append(ch);
        } else if (ch <= 0xff) {
          // the rule covers every value below 256, so U+00FF itself must be
          // included; the previous "< 0xff" bound let it through unescaped
          dest.append("&#");
          dest.append((int) ch);
          dest.append(";");
        } else {
          dest.append(ch);
        }
    }
  }
}
}

View File

@ -849,4 +849,40 @@ public class TestPostingsHighlighter extends LuceneTestCase {
ir.close();
dir.close();
}
/**
 * Indexes a single document whose body contains literal html markup and
 * checks that a formatter created with escaping enabled html-escapes the
 * passage text while leaving the highlight tags themselves untouched.
 */
public void testEncode() throws Exception {
  // index setup; the order of the randomized helpers is kept as-is
  Directory directory = newDirectory();
  IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
  config.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);

  // stored text field that also records offsets in the postings
  FieldType storedWithOffsets = new FieldType(TextField.TYPE_STORED);
  storedWithOffsets.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  Field bodyField = new Field("body", "", storedWithOffsets);
  Document document = new Document();
  document.add(bodyField);

  bodyField.setStringValue("This is a test. Just a test highlighting from <i>postings</i>. Feel free to ignore.");
  writer.addDocument(document);

  IndexReader reader = writer.getReader();
  writer.close();

  IndexSearcher searcher = newSearcher(reader);
  // highlighter whose formatter has escaping switched on
  PostingsHighlighter escapingHighlighter = new PostingsHighlighter() {
    @Override
    protected PassageFormatter getFormatter(String field) {
      return new PassageFormatter("<b>", "</b>", "... ", true);
    }
  };

  Query termQuery = new TermQuery(new Term("body", "highlighting"));
  TopDocs hits = searcher.search(termQuery, null, 10, Sort.INDEXORDER);
  assertEquals(1, hits.totalHits);

  String[] fragments = escapingHighlighter.highlight("body", termQuery, searcher, hits);
  assertEquals(1, fragments.length);
  String expected = "Just&#32;a&#32;test&#32;<b>highlighting</b>&#32;from&#32;&lt;i&gt;postings&lt;&#x2F;i&gt;&#46;&#32;";
  assertEquals(expected, fragments[0]);

  reader.close();
  directory.close();
}
}

View File

@ -57,6 +57,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
* &lt;str name="hl.tag.post"&gt;&amp;lt;/em&amp;gt;&lt;/str&gt;
* &lt;str name="hl.tag.ellipsis"&gt;... &lt;/str&gt;
* &lt;bool name="hl.defaultSummary"&gt;true&lt;/bool&gt;
* &lt;str name="hl.encoder"&gt;simple&lt;/str&gt;
* &lt;float name="hl.score.k1"&gt;1.2&lt;/float&gt;
* &lt;float name="hl.score.b"&gt;0.75&lt;/float&gt;
* &lt;float name="hl.score.pivot"&gt;87&lt;/float&gt;
@ -85,6 +86,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
* <li>hl.tag.post (string) specifies text which appears after a highlighted term.
* <li>hl.tag.ellipsis (string) specifies text which joins non-adjacent passages.
* <li>hl.defaultSummary (bool) specifies if a field should have a default summary.
* <li>hl.encoder (string) can be 'html' (html escapes content) or 'simple' (no escaping).
* <li>hl.score.k1 (float) specifies bm25 scoring parameter 'k1'
* <li>hl.score.b (float) specifies bm25 scoring parameter 'b'
* <li>hl.score.pivot (float) specifies bm25 scoring parameter 'avgdl'
@ -143,7 +145,8 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
String preTag = params.getFieldParam(fieldName, HighlightParams.TAG_PRE, "<em>");
String postTag = params.getFieldParam(fieldName, HighlightParams.TAG_POST, "</em>");
String ellipsis = params.getFieldParam(fieldName, HighlightParams.TAG_ELLIPSIS, "... ");
return new PassageFormatter(preTag, postTag, ellipsis);
String encoder = params.getFieldParam(fieldName, HighlightParams.ENCODER, "simple");
return new PassageFormatter(preTag, postTag, ellipsis, "html".equals(encoder));
}
@Override

View File

@ -147,4 +147,12 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 {
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WHOLE"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
}
/**
 * Adds a document containing literal html markup and checks that a request
 * with hl.encoder=html returns the snippet html-escaped, with only the
 * highlight tags left as markup.
 */
public void testEncoder() {
  String markupText = "Document one has a first <i>sentence</i>.";
  String escapedSnippetXPath =
      "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em>&#32;one&#32;has&#32;a&#32;first&#32;&lt;i&gt;sentence&lt;&#x2F;i&gt;&#46;'";

  assertU(adoc("text", markupText, "id", "103"));
  assertU(commit());
  assertQ(
      "html escaped",
      req("q", "text:document", "sort", "id asc", "hl", "true", "hl.encoder", "html"),
      escapedSnippetXPath);
}
}