LUCENE-9091: UnifiedHighlighter HTML escaping should only escape essentials
This commit is contained in:
Nándor Mátravölgyi 2019-12-23 17:20:48 -05:00 committed by David Smiley
parent 403fd05646
commit 1be5b68964
7 changed files with 58 additions and 13 deletions

View File

@ -88,6 +88,8 @@ Improvements
* LUCENE-9102: Add maxQueryLength option to DirectSpellchecker. (Andy Webb via Bruno Roustant) * LUCENE-9102: Add maxQueryLength option to DirectSpellchecker. (Andy Webb via Bruno Roustant)
* LUCENE-9091: UnifiedHighlighter HTML escaping should only escape essentials (Nándor Mátravölgyi)
Optimizations Optimizations
--------------------- ---------------------
(No changes) (No changes)

View File

@ -129,15 +129,7 @@ public class DefaultPassageFormatter extends PassageFormatter {
dest.append("/"); dest.append("/");
break; break;
default: default:
if (ch >= 0x30 && ch <= 0x39 || ch >= 0x41 && ch <= 0x5A || ch >= 0x61 && ch <= 0x7A) { dest.append(ch);
dest.append(ch);
} else if (ch < 0xff) {
dest.append("&#");
dest.append((int) ch);
dest.append(";");
} else {
dest.append(ch);
}
} }
} }
} else { } else {

View File

@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.util.LuceneTestCase;
/**
 * Exercises {@link DefaultPassageFormatter} against two hand-built passages, once with the
 * no-argument constructor and once with custom tags, a custom ellipsis and the final
 * constructor flag set to {@code true}.
 */
public class TestDefaultPassageFormatter extends LuceneTestCase {

  /**
   * Formats the same two passages with both formatter configurations and compares the
   * produced strings with the expected literal output.
   */
  public void testBasic() throws Exception {
    final String text = "Test customization & <div class=\"xy\">&quot;escaping&quot;</div> of this very formatter. Unrelated part. It's not very N/A!";
    final String term = "very";

    // First passage: from the start of the text through the first period,
    // with a single match on the first occurrence of the term.
    Passage first = new Passage();
    first.setStartOffset(0);
    first.setEndOffset(text.indexOf(".") + 1);
    int firstMatchStart = text.indexOf(term);
    first.addMatch(firstMatchStart, firstMatchStart + term.length(), null, 2);

    // Second passage: skips the sentence in between; it begins two characters after the
    // next period and runs to the end of the text, matching the next occurrence of the term.
    int firstEnd = first.getEndOffset();
    Passage second = new Passage();
    second.setStartOffset(text.indexOf(".", firstEnd + 1) + 2);
    second.setEndOffset(text.length());
    int secondMatchStart = text.indexOf(term, firstEnd);
    second.addMatch(secondMatchStart, secondMatchStart + term.length(), null, 2);

    Passage[] passages = {first, second};

    // Default formatter: the expected output carries the source markup through unchanged.
    String expectedDefault =
        "Test customization & <div class=\"xy\">&quot;escaping&quot;</div> of this <b>very</b> formatter."
            + "... It's not <b>very</b> N/A!";
    DefaultPassageFormatter formatter = new DefaultPassageFormatter();
    assertEquals(expectedDefault, formatter.format(passages, text));

    // Customized formatter: <u> tags, a Unicode ellipsis, and the last argument set to true;
    // the expected output has &, <, >, ", ' and / replaced by entities and nothing else.
    String expectedCustom =
        "Test customization &amp; &lt;div class=&quot;xy&quot;&gt;&amp;quot;escaping&amp;quot;"
            + "&lt;&#x2F;div&gt; of this <u>very</u> formatter.\u2026 It&#x27;s not <u>very</u> N&#x2F;A!";
    formatter = new DefaultPassageFormatter("<u>", "</u>", "\u2026 ", true);
    assertEquals(expectedCustom, formatter.format(passages, text));
  }
}

View File

@ -957,7 +957,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
assertEquals(1, topDocs.totalHits.value); assertEquals(1, topDocs.totalHits.value);
String snippets[] = highlighter.highlight("body", query, topDocs); String snippets[] = highlighter.highlight("body", query, topDocs);
assertEquals(1, snippets.length); assertEquals(1, snippets.length);
assertEquals("Just&#32;a&#32;test&#32;<b>highlighting</b>&#32;from&#32;&lt;i&gt;postings&lt;&#x2F;i&gt;&#46;&#32;", snippets[0]); assertEquals("Just a test <b>highlighting</b> from &lt;i&gt;postings&lt;&#x2F;i&gt;. ", snippets[0]);
ir.close(); ir.close();
} }

View File

@ -866,7 +866,7 @@ public class TestUnifiedHighlighterTermIntervals extends LuceneTestCase {
assertEquals(1, topDocs.totalHits.value); assertEquals(1, topDocs.totalHits.value);
String snippets[] = highlighter.highlight("body", query, topDocs); String snippets[] = highlighter.highlight("body", query, topDocs);
assertEquals(1, snippets.length); assertEquals(1, snippets.length);
assertEquals("Just&#32;a&#32;test&#32;<b>highlighting</b>&#32;from&#32;&lt;i&gt;postings&lt;&#x2F;i&gt;&#46;&#32;", snippets[0]); assertEquals("Just a test <b>highlighting</b> from &lt;i&gt;postings&lt;&#x2F;i&gt;. ", snippets[0]);
ir.close(); ir.close();
} }

View File

@ -171,7 +171,7 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 {
assertU(commit()); assertU(commit());
assertQ("html escaped", assertQ("html escaped",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.encoder", "html"), req("q", "text:document", "sort", "id asc", "hl", "true", "hl.encoder", "html"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em>&#32;one&#32;has&#32;a&#32;first&#32;&lt;i&gt;sentence&lt;&#x2F;i&gt;&#46;'"); "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first &lt;i&gt;sentence&lt;&#x2F;i&gt;.'");
} }
public void testWildcard() { public void testWildcard() {

View File

@ -274,7 +274,7 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
assertU(commit()); assertU(commit());
assertQ("html escaped", assertQ("html escaped",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.encoder", "html"), req("q", "text:document", "sort", "id asc", "hl", "true", "hl.encoder", "html"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em>&#32;one&#32;has&#32;a&#32;first&#32;&lt;i&gt;sentence&lt;&#x2F;i&gt;&#46;'"); "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first &lt;i&gt;sentence&lt;&#x2F;i&gt;.'");
} }
public void testRangeQuery() { public void testRangeQuery() {