SOLR-5983: HTMLStripCharFilter is treating CDATA sections incorrectly

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1588136 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2014-04-17 06:08:17 +00:00
parent 99fea41839
commit 2acbcd08cc
4 changed files with 148 additions and 337 deletions

View File

@ -283,6 +283,9 @@ Bug fixes
* LUCENE-5568: Benchmark module's "default.codec" option didn't work. (David Smiley) * LUCENE-5568: Benchmark module's "default.codec" option didn't work. (David Smiley)
* SOLR-5983: HTMLStripCharFilter is treating CDATA sections incorrectly.
(Dan Funk, Steve Rowe)
Test Framework Test Framework
* LUCENE-5592: Incorrectly reported uncloseable files. (Dawid Weiss) * LUCENE-5592: Incorrectly reported uncloseable files. (Dawid Weiss)

View File

@ -31134,7 +31134,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
cumulativeDiff += inputSegment.length() - outputSegment.length(); cumulativeDiff += inputSegment.length() - outputSegment.length();
// position the correction at (already output length) + (substitution length) // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff); addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
eofReturnValue = outputSegment.nextChar(); eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
break; break;
} }
case BANG: case BANG:
@ -31147,7 +31147,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
case LEFT_ANGLE_BRACKET_SLASH: case LEFT_ANGLE_BRACKET_SLASH:
case LEFT_ANGLE_BRACKET_SPACE: { // Include case LEFT_ANGLE_BRACKET_SPACE: { // Include
outputSegment = inputSegment; outputSegment = inputSegment;
eofReturnValue = outputSegment.nextChar(); eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
break; break;
} }
default: { default: {
@ -31506,7 +31506,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
} }
case 84: break; case 84: break;
case 32: case 32:
{ yybegin(COMMENT); { if (inputSegment.length() > 2) { // Chars between "<!" and "--" - this is not a comment
inputSegment.append(yytext());
} else {
yybegin(COMMENT);
}
} }
case 85: break; case 85: break;
case 33: case 33:
@ -31611,12 +31615,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
} }
case 99: break; case 99: break;
case 47: case 47:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ] { if (inputSegment.length() > 2) { // Chars between "<!" and "[CDATA[" - this is not a CDATA section
cumulativeDiff += inputSegment.length() + yylength(); inputSegment.append(yytext());
// position the correction at (already output length) [ + (substitution length) = 0 ] } else {
addOffCorrectMap(outputCharCount, cumulativeDiff); // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
inputSegment.clear(); cumulativeDiff += inputSegment.length() + yylength();
yybegin(CDATA); // position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
}
} }
case 100: break; case 100: break;
case 48: case 48:

View File

@ -309,7 +309,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
cumulativeDiff += inputSegment.length() - outputSegment.length(); cumulativeDiff += inputSegment.length() - outputSegment.length();
// position the correction at (already output length) + (substitution length) // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff); addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
eofReturnValue = outputSegment.nextChar(); eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
break; break;
} }
case BANG: case BANG:
@ -322,7 +322,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
case LEFT_ANGLE_BRACKET_SLASH: case LEFT_ANGLE_BRACKET_SLASH:
case LEFT_ANGLE_BRACKET_SPACE: { // Include case LEFT_ANGLE_BRACKET_SPACE: { // Include
outputSegment = inputSegment; outputSegment = inputSegment;
eofReturnValue = outputSegment.nextChar(); eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
break; break;
} }
default: { default: {
@ -754,7 +754,13 @@ InlineElment = ( [aAbBiIqQsSuU] |
} }
<BANG> { <BANG> {
"--" { yybegin(COMMENT); } "--" {
if (inputSegment.length() > 2) { // Chars between "<!" and "--" - this is not a comment
inputSegment.append(yytext());
} else {
yybegin(COMMENT);
}
}
">" { ">" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ] // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength(); cumulativeDiff += inputSegment.length() + yylength();
@ -771,12 +777,16 @@ InlineElment = ( [aAbBiIqQsSuU] |
// [21] CDEnd ::= ']]>' // [21] CDEnd ::= ']]>'
// //
"[CDATA[" { "[CDATA[" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ] if (inputSegment.length() > 2) { // Chars between "<!" and "[CDATA[" - this is not a CDATA section
cumulativeDiff += inputSegment.length() + yylength(); inputSegment.append(yytext());
// position the correction at (already output length) [ + (substitution length) = 0 ] } else {
addOffCorrectMap(outputCharCount, cumulativeDiff); // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
inputSegment.clear(); cumulativeDiff += inputSegment.length() + yylength();
yybegin(CDATA); // position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
}
} }
[^] { [^] {
inputSegment.append(zzBuffer[zzStartRead]); inputSegment.append(zzBuffer[zzStartRead]);

View File

@ -18,7 +18,6 @@ package org.apache.lucene.analysis.charfilter;
*/ */
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
@ -26,6 +25,7 @@ import java.io.StringReader;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashSet; import java.util.HashSet;
import java.util.List;
import java.util.Set; import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
@ -53,27 +53,14 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an & //this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
// //
public void test() throws IOException { public void test() throws Exception {
String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " + String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
"another <a href=\"http://lucene.apache.org/\">link</a>. " + "another <a href=\"http://lucene.apache.org/\">link</a>. " +
"This is an entity: &amp; plus a &lt;. Here is an &. <!-- is a comment -->"; "This is an entity: &amp; plus a &lt;. Here is an &. <!-- is a comment -->";
String gold = "\nthis is some text\n here is a link and " + String gold = "\nthis is some text\n here is a link and " +
"another link. " + "another link. " +
"This is an entity: & plus a <. Here is an &. "; "This is an entity: & plus a <. Here is an &. ";
HTMLStripCharFilter reader = new HTMLStripCharFilter(new StringReader(html)); assertHTMLStripsTo(html, gold, null);
StringBuilder builder = new StringBuilder();
int ch = -1;
char [] goldArray = gold.toCharArray();
int position = 0;
while ((ch = reader.read()) != -1){
char theChar = (char) ch;
builder.append(theChar);
assertTrue("\"" + theChar + "\"" + " at position: " + position + " does not equal: " + goldArray[position]
+ " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
position++;
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
} }
//Some sanity checks, but not a full-fledged check //Some sanity checks, but not a full-fledged check
@ -100,61 +87,28 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String gold = "This is a test"; String gold = "This is a test";
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
int ch = 0; int ch = 0;
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString().trim() + "' is not equal to '" + gold + "'",
gold, builder.toString().trim());
}
public void testGamma() throws Exception {
String test = "&Gamma;";
String gold = "\u0393";
Set<String> set = new HashSet<>();
set.add("reserved");
Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){ while ((ch = reader.read()) != -1){
builder.append((char)ch); builder.append((char)ch);
} }
String result = builder.toString(); // Compare trim()'d output to gold
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result); assertEquals("'" + builder.toString().trim() + "' is not equal to '" + gold + "'",
gold, builder.toString().trim());
}
public void testGamma() throws Exception {
assertHTMLStripsTo("&Gamma;", "\u0393", new HashSet<>(Arrays.asList("reserved")));
} }
public void testEntities() throws Exception { public void testEntities() throws Exception {
String test = "&nbsp; &lt;foo&gt; &Uuml;bermensch &#61; &Gamma; bar &#x393;"; String test = "&nbsp; &lt;foo&gt; &Uuml;bermensch &#61; &Gamma; bar &#x393;";
String gold = " <foo> \u00DCbermensch = \u0393 bar \u0393"; String gold = " <foo> \u00DCbermensch = \u0393 bar \u0393";
Set<String> set = new HashSet<>(); assertHTMLStripsTo(test, gold, new HashSet<>(Arrays.asList("reserved")));
set.add("reserved");
Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
String result = builder.toString();
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
} }
public void testMoreEntities() throws Exception { public void testMoreEntities() throws Exception {
String test = "&nbsp; &lt;junk/&gt; &nbsp; &#33; &#64; and &#8217;"; String test = "&nbsp; &lt;junk/&gt; &nbsp; &#33; &#64; and &#8217;";
String gold = " <junk/> ! @ and "; String gold = " <junk/> ! @ and ";
Set<String> set = new HashSet<>(); assertHTMLStripsTo(test, gold, new HashSet<>(Arrays.asList("reserved")));
set.add("reserved");
Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
String result = builder.toString();
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
} }
public void testReserved() throws Exception { public void testReserved() throws Exception {
@ -344,16 +298,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
"\n\n\n\n\n\n\n\n", "\n\n\n\n\n\n\n\n",
}; };
for (int i = 0 ; i < testGold.length ; i += 2) { for (int i = 0 ; i < testGold.length ; i += 2) {
String test = testGold[i]; assertHTMLStripsTo(testGold[i], testGold[i + 1], null);
String gold = testGold[i + 1];
Reader reader = new HTMLStripCharFilter(new StringReader(test));
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
String result = builder.toString();
assertEquals("Test: '" + test + "'", gold, result);
} }
} }
@ -362,7 +307,9 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50); StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50);
testBuilder.append("ah<?> ??????"); testBuilder.append("ah<?> ??????");
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500); appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions Reader reader = new HTMLStripCharFilter
(new BufferedReader(new StringReader(testBuilder.toString()))); //force the use of BufferedReader
assertHTMLStripsTo(reader, testBuilder.toString(), null);
testBuilder.setLength(0); testBuilder.setLength(0);
testBuilder.append("<!--");//comments testBuilder.append("<!--");//comments
@ -370,54 +317,21 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
testBuilder.append("-->foo"); testBuilder.append("-->foo");
String gold = "foo"; String gold = "foo";
Reader reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString())); assertHTMLStripsTo(testBuilder.toString(), gold, null);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
testBuilder.setLength(0); testBuilder.setLength(0);
testBuilder.append("<?"); testBuilder.append("<?");
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500); appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("?>"); testBuilder.append("?>");
gold = ""; gold = "";
reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString())); assertHTMLStripsTo(testBuilder.toString(), gold, null);
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
testBuilder.setLength(0); testBuilder.setLength(0);
testBuilder.append("<b "); testBuilder.append("<b ");
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500); appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("/>"); testBuilder.append("/>");
gold = ""; gold = "";
reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString())); assertHTMLStripsTo(testBuilder.toString(), gold, null);
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
} }
private void appendChars(StringBuilder testBuilder, int numChars) { private void appendChars(StringBuilder testBuilder, int numChars) {
@ -427,39 +341,19 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
} }
} }
private void processBuffer(String test, String assertMsg) throws IOException {
// System.out.println("-------------------processBuffer----------");
Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
}
assertEquals(assertMsg + "::: " + builder.toString() + " is not equal to " + test,
test, builder.toString());
}
public void testComment() throws Exception { public void testComment() throws Exception {
String test = "<!--- three dashes, still a valid comment ---> "; String test = "<!--- three dashes, still a valid comment ---> ";
String gold = " "; String gold = " ";
Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force the use of BufferedReader assertHTMLStripsTo(test, gold, null);
int ch = 0;
StringBuilder builder = new StringBuilder(); test = "<! -- blah > "; // should not be recognized as a comment
try { gold = " ";
while ((ch = reader.read()) != -1){ assertHTMLStripsTo(test, gold, null);
builder.append((char)ch);
} StringBuilder testBuilder = new StringBuilder("<!--");
} finally { appendChars(testBuilder, TestUtil.nextInt(random(), 0, 1000));
// System.out.println("String: " + builder.toString()); gold = "";
} assertHTMLStripsTo(testBuilder.toString(), gold, null);
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
} }
@ -526,83 +420,28 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
+ " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n" + " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
+ " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two"; + " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
String gold = "onetwo"; String gold = "onetwo";
Reader reader = new HTMLStripCharFilter(new StringReader(test)); assertHTMLStripsTo(test, gold, null);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertTrue(builder.toString() + " is not equal to " + gold, builder.toString().equals(gold));
test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two"; test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
gold = "one\ntwo"; gold = "one\ntwo";
reader = new HTMLStripCharFilter(new StringReader(test)); assertHTMLStripsTo(test, gold, null);
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
} }
public void testScriptQuotes() throws Exception { public void testScriptQuotes() throws Exception {
String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two"; String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
String gold = "one\ntwo"; String gold = "one\ntwo";
Reader reader = new HTMLStripCharFilter(new StringReader(test)); assertHTMLStripsTo(test, gold, null);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
test = "hello<script><!-- f('<!--internal--></script>'); --></script>"; test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
gold = "hello\n"; gold = "hello\n";
reader = new HTMLStripCharFilter(new StringReader(test)); assertHTMLStripsTo(test, gold, null);
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
} }
public void testEscapeScript() throws Exception { public void testEscapeScript() throws Exception {
String test = "one<script no-value-attr>callSomeMethod();</script>two"; String test = "one<script no-value-attr>callSomeMethod();</script>two";
String gold = "one<script no-value-attr></script>two"; String gold = "one<script no-value-attr></script>two";
Set<String> escapedTags = new HashSet<>(Arrays.asList("SCRIPT")); Set<String> escapedTags = new HashSet<>(Arrays.asList("SCRIPT"));
Reader reader = new HTMLStripCharFilter assertHTMLStripsTo(test, gold, escapedTags);
(new StringReader(test), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
} }
public void testStyle() throws Exception { public void testStyle() throws Exception {
@ -612,37 +451,14 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
+ "-->\n" + "-->\n"
+ "</style>two"; + "</style>two";
String gold = "one\ntwo"; String gold = "one\ntwo";
Reader reader = new HTMLStripCharFilter(new StringReader(test)); assertHTMLStripsTo(test, gold, null);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
} }
public void testEscapeStyle() throws Exception { public void testEscapeStyle() throws Exception {
String test = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two"; String test = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two";
String gold = "one<style type=\"text/css\"></style>two"; String gold = "one<style type=\"text/css\"></style>two";
Set<String> escapedTags = new HashSet<>(Arrays.asList("STYLE")); Set<String> escapedTags = new HashSet<>(Arrays.asList("STYLE"));
Reader reader = new HTMLStripCharFilter assertHTMLStripsTo(test, gold, escapedTags);
(new StringReader(test), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
} }
public void testBR() throws Exception { public void testBR() throws Exception {
@ -654,135 +470,80 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
"one\ntwo\n", "one\ntwo\n",
}; };
for (int i = 0 ; i < testGold.length ; i += 2) { for (int i = 0 ; i < testGold.length ; i += 2) {
String test = testGold[i]; assertHTMLStripsTo(testGold[i], testGold[i + 1], null);
String gold = testGold[i + 1];
Reader reader = new HTMLStripCharFilter(new StringReader(test));
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
String result = builder.toString();
assertEquals("Test: '" + test + "'", gold, result);
} }
} }
public void testEscapeBR() throws Exception { public void testEscapeBR() throws Exception {
String test = "one<BR class='whatever'>two</\nBR\n>"; String test = "one<BR class='whatever'>two</\nBR\n>";
String gold = "one<BR class='whatever'>two</\nBR\n>"; String gold = "one<BR class='whatever'>two</\nBR\n>";
Set<String> escapedTags = new HashSet<>(Arrays.asList("BR")); Set<String> escapedTags = new HashSet<>(Arrays.asList("BR"));
Reader reader = new HTMLStripCharFilter assertHTMLStripsTo(test, gold, escapedTags);
(new StringReader(test), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
} }
public void testInlineTagsNoSpace() throws Exception { public void testInlineTagsNoSpace() throws Exception {
String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three"; String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
String gold = "onetwo2e.three"; String gold = "onetwo2e.three";
Reader reader = new HTMLStripCharFilter(new StringReader(test)); assertHTMLStripsTo(test, gold, null);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
} }
public void testCDATA() throws Exception { public void testCDATA() throws Exception {
String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two"; int maxNumElems = 100;
String gold = "one<one><two>three<four></four></two></one>two"; String randomHtmlishString1 // Don't create a comment (disallow "<!--") and don't include a closing ">"
Reader reader = new HTMLStripCharFilter(new StringReader(test)); = TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
int ch = 0; String closedAngleBangNonCDATA = "<!" + randomHtmlishString1 +"-[CDATA[&]]>";
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five"; String randomHtmlishString2 // Don't create a comment (disallow "<!--") and don't include a closing ">"
gold = "onetwo<![CDATA[three]]>fourfive"; = TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
reader = new HTMLStripCharFilter(new StringReader(test)); String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString1 +"-[CDATA[";
ch = 0;
builder = new StringBuilder(); String[] testGold = {
try { "one<![CDATA[<one><two>three<four></four></two></one>]]>two",
while ((ch = reader.read()) != -1){ "one<one><two>three<four></four></two></one>two",
builder.append((char)ch);
} "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five",
} finally { "onetwo<![CDATA[three]]>fourfive",
// System.out.println("String: " + builder.toString());
"<! [CDATA[&]]>", "",
"<! [CDATA[&] ] >", "",
"<! [CDATA[&]]", "<! [CDATA[&]]", // unclosed angle bang - all input is output
"<!\u2009[CDATA[&]]>", "",
"<!\u2009[CDATA[&]\u2009]\u2009>", "",
"<!\u2009[CDATA[&]\u2009]\u2009", "<!\u2009[CDATA[&]\u2009]\u2009", // unclosed angle bang - all input is output
closedAngleBangNonCDATA, "",
"<![CDATA[", "",
"<![CDATA[<br>", "<br>",
"<![CDATA[<br>]]", "<br>]]",
"<![CDATA[<br>]]>", "<br>",
"<![CDATA[<br>] ] >", "<br>] ] >",
"<![CDATA[<br>]\u2009]\u2009>", "<br>]\u2009]\u2009>",
"<!\u2009[CDATA[", "<!\u2009[CDATA[",
unclosedAngleBangNonCDATA, unclosedAngleBangNonCDATA
};
for (int i = 0 ; i < testGold.length ; i += 2) {
assertHTMLStripsTo(testGold[i], testGold[i + 1], null);
} }
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'", }
gold, builder.toString());
public void testUnclosedAngleBang() throws Exception {
assertHTMLStripsTo("<![endif]", "<![endif]", null);
} }
public void testUppercaseCharacterEntityVariants() throws Exception { public void testUppercaseCharacterEntityVariants() throws Exception {
String test = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;"; String test = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;";
String gold = " \"-\u00A9>><<\u00AE&"; String gold = " \"-\u00A9>><<\u00AE&";
Reader reader = new HTMLStripCharFilter(new StringReader(test)); assertHTMLStripsTo(test, gold, null);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
} }
public void testMSWordMalformedProcessingInstruction() throws Exception { public void testMSWordMalformedProcessingInstruction() throws Exception {
String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two"; String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
String gold = "onetwo"; String gold = "onetwo";
Reader reader = new HTMLStripCharFilter(new StringReader(test)); assertHTMLStripsTo(test, gold, null);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
} }
public void testSupplementaryCharsInTags() throws Exception { public void testSupplementaryCharsInTags() throws Exception {
String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven"; String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven"; String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
Reader reader = new HTMLStripCharFilter(new StringReader(test)); assertHTMLStripsTo(test, gold, null);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
} }
public void testRandomBrokenHTML() throws Exception { public void testRandomBrokenHTML() throws Exception {
@ -857,4 +618,33 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, " &#57209", new String[] { "\uFFFD" } ); assertAnalyzesTo(analyzer, " &#57209", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#57209<br>", new String[] { "&#57209" } ); assertAnalyzesTo(analyzer, " &#57209<br>", new String[] { "&#57209" } );
} }
/**
 * Convenience overload: wraps {@code input} in a {@link StringReader} and delegates to the
 * {@link Reader}-based {@code assertHTMLStripsTo}.
 *
 * @param input       text to feed through the filter
 * @param gold        expected filter output
 * @param escapedTags passed through unchanged to the Reader-based overload (may be {@code null})
 * @throws Exception whatever the Reader-based overload throws
 */
public static void assertHTMLStripsTo(String input, String gold, Set<String> escapedTags) throws Exception {
  final Reader source = new StringReader(input);
  assertHTMLStripsTo(source, gold, escapedTags);
}
/**
 * Reads {@code input} through an {@link HTMLStripCharFilter} one char at a time and asserts
 * that the accumulated output equals {@code gold}.
 *
 * @param input       source of the text to filter
 * @param gold        expected filter output
 * @param escapedTags when non-null, handed to the two-argument filter constructor; when
 *                    {@code null}, the single-argument constructor is used instead
 * @throws Exception if reading fails; when the output collected so far differs from
 *                   {@code gold}, the failure is rethrown wrapped with a mismatch message
 */
public static void assertHTMLStripsTo(Reader input, String gold, Set<String> escapedTags) throws Exception {
  final HTMLStripCharFilter filter = (escapedTags != null)
      ? new HTMLStripCharFilter(input, escapedTags)
      : new HTMLStripCharFilter(input);
  final StringBuilder stripped = new StringBuilder();
  try {
    for (int c = filter.read(); c != -1; c = filter.read()) {
      stripped.append((char) c);
    }
  } catch (Exception e) {
    final String soFar = stripped.toString();
    // Output already matches gold: the mismatch text would add nothing, so rethrow as-is.
    if (gold.equals(soFar)) {
      throw e;
    }
    // Otherwise prepend what was produced so far, keeping the original exception as the cause.
    throw new Exception
        ("('" + soFar + "' is not equal to '" + gold + "'). " + e.getMessage(), e);
  }
  final String actual = stripped.toString();
  assertEquals("'" + actual + "' is not equal to '" + gold + "'", gold, actual);
}
} }