LUCENE-5191: Fix Unicode corruption in HTML escaping of Standard Highlighter and Fast Vector Highlighter.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1518839 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2013-08-29 22:01:46 +00:00
parent 6f0040797e
commit dc0dca5172
3 changed files with 14 additions and 15 deletions

View File

@ -152,6 +152,11 @@ Bug Fixes
* LUCENE-5192: IndexWriter could allow adding same field name with different
DocValueTypes under some circumstances. (Shai Erera)
* LUCENE-5191: SimpleHTMLEncoder in Highlighter module broke Unicode
outside BMP because it encoded UTF-16 chars instead of codepoints.
The escaping of codepoints > 127 was removed (not needed for valid HTML)
and missing escaping for ' and / was added. (Uwe Schindler)
API Changes
* LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap.

View File

@ -47,34 +47,28 @@ public class SimpleHTMLEncoder implements Encoder
{
char ch = plainText.charAt(index);
switch (ch)
{
switch (ch) {
case '"':
result.append("&quot;");
break;
case '&':
result.append("&amp;");
break;
case '<':
result.append("&lt;");
break;
case '>':
result.append("&gt;");
break;
case '\'':
result.append("&#x27;");
break;
case '/':
result.append("&#x2F;");
break;
default:
if (ch < 128)
{
result.append(ch);
}
else
{
result.append("&#").append((int)ch).append(";");
}
}
}
return result.toString();

View File

@ -85,7 +85,7 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase {
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
String[] preTags = { "[" };
String[] postTags = { "]" };
assertEquals( "&lt;h1&gt; [a] &lt;/h1&gt;",
assertEquals( "&lt;h1&gt; [a] &lt;&#x2F;h1&gt;",
sfb.createFragment( reader, 0, F, ffl, preTags, postTags, new SimpleHTMLEncoder() ) );
}