SOLR-4684: add encoder config to PostingsSolrHighlighter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1465258 13f79535-47bb-0310-9956-ffa450edef68
2013-04-06 15:14:46 +00:00 · 2013-04-06 15:14:46 +00:00 · b1b1247a65
parent 3351c61484
commit b1b1247a65
4 changed files with 97 additions and 6 deletions
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java
@ -28,12 +28,13 @@ public class PassageFormatter {
  private final String preTag;
  private final String postTag;
  private final String ellipsis;
+  private final boolean escape;
  
  /**
   * Creates a new PassageFormatter with the default tags.
   */
  public PassageFormatter() {
-    this("<b>", "</b>", "... ");
+    this("<b>", "</b>", "... ", false);
  }
  
  /**
@ -41,14 +42,16 @@ public class PassageFormatter {
   * @param preTag text which should appear before a highlighted term.
   * @param postTag text which should appear after a highlighted term.
   * @param ellipsis text which should be used to connect two unconnected passages.
+   * @param escape true if text should be html-escaped
   */
-  public PassageFormatter(String preTag, String postTag, String ellipsis) {
+  public PassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) {
    if (preTag == null || postTag == null || ellipsis == null) {
      throw new NullPointerException();
    }
    this.preTag = preTag;
    this.postTag = postTag;
    this.ellipsis = ellipsis;
+    this.escape = escape;
  }
  
  /**
@ -74,19 +77,60 @@ public class PassageFormatter {
        int end = passage.matchEnds[i];
        // its possible to have overlapping terms
        if (start > pos) {
-          sb.append(content.substring(pos, start));
+          append(sb, content, pos, start);
        }
        if (end > pos) {
          sb.append(preTag);
-          sb.append(content.substring(Math.max(pos, start), end));
+          append(sb, content, Math.max(pos, start), end);
          sb.append(postTag);
          pos = end;
        }
      }
      // its possible a "term" from the analyzer could span a sentence boundary.
-      sb.append(content.substring(pos, Math.max(pos, passage.endOffset)));
+      append(sb, content, pos, Math.max(pos, passage.endOffset));
      pos = passage.endOffset;
    }
    return sb.toString();
  }
+
+  private void append(StringBuilder dest, String content, int start, int end) {
+    if (escape) {
+      // note: these are the rules from owasp.org
+      for (int i = start; i < end; i++) {
+        char ch = content.charAt(i);
+        switch(ch) {
+          case '&': 
+            dest.append("&amp;");
+            break;
+          case '<':
+            dest.append("&lt;");
+            break;
+          case '>':
+            dest.append("&gt;");
+            break;
+          case '"':
+            dest.append("&quot;");
+            break;
+          case '\'':
+            dest.append("&#x27;");
+            break;
+          case '/':
+            dest.append("&#x2F;");
+            break;
+          default:
+            if (ch >= 0x30 && ch <= 0x39 || ch >= 0x41 && ch <= 0x5A || ch >= 0x61 && ch <= 0x7A) {
+              dest.append(ch);
+            } else if (ch < 0xff) {
+              dest.append("&#");
+              dest.append((int)ch);
+              dest.append(";");
+            } else {
+              dest.append(ch);
+            }
+        }
+      }
+    } else {
+      dest.append(content, start, end);
+    }
+  }
 }
--- a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
@ -849,4 +849,40 @@ public class TestPostingsHighlighter extends LuceneTestCase {
    ir.close();
    dir.close();
  }
+  
+  public void testEncode() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Field body = new Field("body", "", offsetsType);
+    Document doc = new Document();
+    doc.add(body);
+    
+    body.setStringValue("This is a test. Just a test highlighting from <i>postings</i>. Feel free to ignore.");
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter() {
+      @Override
+      protected PassageFormatter getFormatter(String field) {
+        return new PassageFormatter("<b>", "</b>", "... ", true);
+      }
+    };
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(1, topDocs.totalHits);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+    assertEquals(1, snippets.length);
+    assertEquals("Just&#32;a&#32;test&#32;<b>highlighting</b>&#32;from&#32;&lt;i&gt;postings&lt;&#x2F;i&gt;&#46;&#32;", snippets[0]);
+    
+    ir.close();
+    dir.close();
+  }
 }
--- a/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
+++ b/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
@ -57,6 +57,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
 *       &lt;str name="hl.tag.post"&gt;&amp;lt;/em&amp;gt;&lt;/str&gt;
 *       &lt;str name="hl.tag.ellipsis"&gt;... &lt;/str&gt;
 *       &lt;bool name="hl.defaultSummary"&gt;true&lt;/bool&gt;
+ *       &lt;str name="hl.encoder"&gt;simple&lt;/str&gt;
 *       &lt;float name="hl.score.k1"&gt;1.2&lt;/float&gt;
 *       &lt;float name="hl.score.b"&gt;0.75&lt;/float&gt;
 *       &lt;float name="hl.score.pivot"&gt;87&lt;/float&gt;
@ -85,6 +86,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
 *    <li>hl.tag.post (string) specifies text which appears after a highlighted term.
 *    <li>hl.tag.ellipsis (string) specifies text which joins non-adjacent passages.
 *    <li>hl.defaultSummary (bool) specifies if a field should have a default summary.
+ *    <li>hl.encoder (string) can be 'html' (html escapes content) or 'simple' (no escaping).
 *    <li>hl.score.k1 (float) specifies bm25 scoring parameter 'k1'
 *    <li>hl.score.b (float) specifies bm25 scoring parameter 'b'
 *    <li>hl.score.pivot (float) specifies bm25 scoring parameter 'avgdl'
@ -143,7 +145,8 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
          String preTag = params.getFieldParam(fieldName, HighlightParams.TAG_PRE, "<em>");
          String postTag = params.getFieldParam(fieldName, HighlightParams.TAG_POST, "</em>");
          String ellipsis = params.getFieldParam(fieldName, HighlightParams.TAG_ELLIPSIS, "... ");
-          return new PassageFormatter(preTag, postTag, ellipsis);
+          String encoder = params.getFieldParam(fieldName, HighlightParams.ENCODER, "simple");
+          return new PassageFormatter(preTag, postTag, ellipsis, "html".equals(encoder));
        }

        @Override
--- a/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
+++ b/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
@ -147,4 +147,12 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 {
        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WHOLE"),
        "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
  }
+  
+  public void testEncoder() {
+    assertU(adoc("text", "Document one has a first <i>sentence</i>.", "id", "103"));
+    assertU(commit());
+    assertQ("html escaped", 
+        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.encoder", "html"),
+        "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em>&#32;one&#32;has&#32;a&#32;first&#32;&lt;i&gt;sentence&lt;&#x2F;i&gt;&#46;'");
+  }
 }