From 1be5b689640fe4d1bf0ae3fd19c5fe93b20a77ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=A1ndor=20M=C3=A1trav=C3=B6lgyi?= Date: Mon, 23 Dec 2019 17:20:48 -0500 Subject: [PATCH] LUCENE-9091: UnifiedHighlighter HTML escaping should only escape essentials --- lucene/CHANGES.txt | 2 + .../uhighlight/DefaultPassageFormatter.java | 10 +--- .../TestDefaultPassageFormatter.java | 51 +++++++++++++++++++ .../uhighlight/TestUnifiedHighlighter.java | 2 +- .../TestUnifiedHighlighterTermIntervals.java | 2 +- .../TestPostingsSolrHighlighter.java | 2 +- .../highlight/TestUnifiedSolrHighlighter.java | 2 +- 7 files changed, 58 insertions(+), 13 deletions(-) create mode 100644 lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9ce49809570..442aefdbae5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -88,6 +88,8 @@ Improvements * LUCENE-9102: Add maxQueryLength option to DirectSpellchecker. (Andy Webb via Bruno Roustant) +* LUCENE-9091: UnifiedHighlighter HTML escaping should only escape essentials (Nándor Mátravölgyi) + Optimizations --------------------- (No changes) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java index 8d2a424d100..701197aa980 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java @@ -129,15 +129,7 @@ public class DefaultPassageFormatter extends PassageFormatter { dest.append("/"); break; default: - if (ch >= 0x30 && ch <= 0x39 || ch >= 0x41 && ch <= 0x5A || ch >= 0x61 && ch <= 0x7A) { - dest.append(ch); - } else if (ch < 0xff) { - dest.append("&#"); - dest.append((int) ch); - dest.append(";"); - } else { - dest.append(ch); - } + dest.append(ch); } } } else { diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java new file mode 100644 index 00000000000..8389dbe0c97 --- /dev/null +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.uhighlight; + +import org.apache.lucene.util.LuceneTestCase; + +public class TestDefaultPassageFormatter extends LuceneTestCase { + public void testBasic() throws Exception { + String text = "Test customization &
"escaping"
of this very formatter. Unrelated part. It's not very N/A!"; + // fabricate passages with matches to format + Passage[] passages = new Passage[2]; + passages[0] = new Passage(); + passages[0].setStartOffset(0); + passages[0].setEndOffset(text.indexOf(".")+1); + passages[0].addMatch(text.indexOf("very"), text.indexOf("very")+4, null, 2); + passages[1] = new Passage(); + passages[1].setStartOffset(text.indexOf(".", passages[0].getEndOffset()+1) + 2); + passages[1].setEndOffset(text.length()); + passages[1].addMatch( + text.indexOf("very", passages[0].getEndOffset()), + text.indexOf("very", passages[0].getEndOffset())+4, null, 2); + + // test default + DefaultPassageFormatter formatter = new DefaultPassageFormatter(); + assertEquals( + "Test customization &
"escaping"
of this very formatter." + + "... It's not very N/A!", formatter.format(passages, text)); + + // test customization and encoding + formatter = new DefaultPassageFormatter("", "", "\u2026 ", true); + assertEquals( + "Test customization & <div class="xy">&quot;escaping&quot;" + + "</div> of this very formatter.\u2026 It's not very N/A!", + formatter.format(passages, text)); + } +} diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java index 1e20d43f3af..d91d25b27fd 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java @@ -957,7 +957,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase { assertEquals(1, topDocs.totalHits.value); String snippets[] = highlighter.highlight("body", query, topDocs); assertEquals(1, snippets.length); - assertEquals("Just a test highlighting from <i>postings</i>. ", snippets[0]); + assertEquals("Just a test highlighting from <i>postings</i>. ", snippets[0]); ir.close(); } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermIntervals.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermIntervals.java index 298cd08613c..edf02121ffa 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermIntervals.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermIntervals.java @@ -866,7 +866,7 @@ public class TestUnifiedHighlighterTermIntervals extends LuceneTestCase { assertEquals(1, topDocs.totalHits.value); String snippets[] = highlighter.highlight("body", query, topDocs); assertEquals(1, snippets.length); - assertEquals("Just a test highlighting from <i>postings</i>. ", snippets[0]); + assertEquals("Just a test highlighting from <i>postings</i>. ", snippets[0]); ir.close(); } diff --git a/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java b/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java index 4fc402b6d06..02752679d41 100644 --- a/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java +++ b/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java @@ -171,7 +171,7 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 { assertU(commit()); assertQ("html escaped", req("q", "text:document", "sort", "id asc", "hl", "true", "hl.encoder", "html"), - "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='Document one has a first <i>sentence</i>.'"); + "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='Document one has a first <i>sentence</i>.'"); } public void testWildcard() { diff --git a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java index 990262e7a0f..ac3b7eaddcd 100644 --- a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java +++ b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java @@ -274,7 +274,7 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 { assertU(commit()); assertQ("html escaped", req("q", "text:document", "sort", "id asc", "hl", "true", "hl.encoder", "html"), - "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='Document one has a first <i>sentence</i>.'"); + "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='Document one has a first <i>sentence</i>.'"); } public void testRangeQuery() {