SOLR-5983: HTMLStripCharFilter is treating CDATA sections incorrectly

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1588136 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2014-04-17 06:08:17 +00:00
parent 99fea41839
commit 2acbcd08cc
4 changed files with 148 additions and 337 deletions

View File

@ -283,6 +283,9 @@ Bug fixes
* LUCENE-5568: Benchmark module's "default.codec" option didn't work. (David Smiley)
* SOLR-5983: HTMLStripCharFilter is treating CDATA sections incorrectly.
(Dan Funk, Steve Rowe)
Test Framework
* LUCENE-5592: Incorrectly reported uncloseable files. (Dawid Weiss)

View File

@ -31134,7 +31134,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
cumulativeDiff += inputSegment.length() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
eofReturnValue = outputSegment.nextChar();
eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
break;
}
case BANG:
@ -31147,7 +31147,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
case LEFT_ANGLE_BRACKET_SLASH:
case LEFT_ANGLE_BRACKET_SPACE: { // Include
outputSegment = inputSegment;
eofReturnValue = outputSegment.nextChar();
eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
break;
}
default: {
@ -31506,7 +31506,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
}
case 84: break;
case 32:
{ yybegin(COMMENT);
{ if (inputSegment.length() > 2) { // Chars between "<!" and "--" - this is not a comment
inputSegment.append(yytext());
} else {
yybegin(COMMENT);
}
}
case 85: break;
case 33:
@ -31611,12 +31615,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
}
case 99: break;
case 47:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
{ if (inputSegment.length() > 2) { // Chars between "<!" and "[CDATA[" - this is not a CDATA section
inputSegment.append(yytext());
} else {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
}
}
case 100: break;
case 48:

View File

@ -309,7 +309,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
cumulativeDiff += inputSegment.length() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
eofReturnValue = outputSegment.nextChar();
eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
break;
}
case BANG:
@ -322,7 +322,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
case LEFT_ANGLE_BRACKET_SLASH:
case LEFT_ANGLE_BRACKET_SPACE: { // Include
outputSegment = inputSegment;
eofReturnValue = outputSegment.nextChar();
eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
break;
}
default: {
@ -754,7 +754,13 @@ InlineElment = ( [aAbBiIqQsSuU] |
}
<BANG> {
"--" { yybegin(COMMENT); }
"--" {
if (inputSegment.length() > 2) { // Chars between "<!" and "--" - this is not a comment
inputSegment.append(yytext());
} else {
yybegin(COMMENT);
}
}
">" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
@ -771,6 +777,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
// [21] CDEnd ::= ']]>'
//
"[CDATA[" {
if (inputSegment.length() > 2) { // Chars between "<!" and "[CDATA[" - this is not a CDATA section
inputSegment.append(yytext());
} else {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
@ -778,6 +787,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
inputSegment.clear();
yybegin(CDATA);
}
}
[^] {
inputSegment.append(zzBuffer[zzStartRead]);
}

View File

@ -18,7 +18,6 @@ package org.apache.lucene.analysis.charfilter;
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
@ -26,6 +25,7 @@ import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
@ -53,27 +53,14 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
//
public void test() throws IOException {
public void test() throws Exception {
String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
"another <a href=\"http://lucene.apache.org/\">link</a>. " +
"This is an entity: &amp; plus a &lt;. Here is an &. <!-- is a comment -->";
String gold = "\nthis is some text\n here is a link and " +
"another link. " +
"This is an entity: & plus a <. Here is an &. ";
HTMLStripCharFilter reader = new HTMLStripCharFilter(new StringReader(html));
StringBuilder builder = new StringBuilder();
int ch = -1;
char [] goldArray = gold.toCharArray();
int position = 0;
while ((ch = reader.read()) != -1){
char theChar = (char) ch;
builder.append(theChar);
assertTrue("\"" + theChar + "\"" + " at position: " + position + " does not equal: " + goldArray[position]
+ " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
position++;
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(html, gold, null);
}
//Some sanity checks, but not a full-fledged check
@ -100,61 +87,28 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String gold = "This is a test";
StringBuilder builder = new StringBuilder();
int ch = 0;
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
// Compare trim()'d output to gold
assertEquals("'" + builder.toString().trim() + "' is not equal to '" + gold + "'",
gold, builder.toString().trim());
}
public void testGamma() throws Exception {
String test = "&Gamma;";
String gold = "\u0393";
Set<String> set = new HashSet<>();
set.add("reserved");
Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
String result = builder.toString();
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
assertHTMLStripsTo("&Gamma;", "\u0393", new HashSet<>(Arrays.asList("reserved")));
}
public void testEntities() throws Exception {
String test = "&nbsp; &lt;foo&gt; &Uuml;bermensch &#61; &Gamma; bar &#x393;";
String gold = " <foo> \u00DCbermensch = \u0393 bar \u0393";
Set<String> set = new HashSet<>();
set.add("reserved");
Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
String result = builder.toString();
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
assertHTMLStripsTo(test, gold, new HashSet<>(Arrays.asList("reserved")));
}
public void testMoreEntities() throws Exception {
String test = "&nbsp; &lt;junk/&gt; &nbsp; &#33; &#64; and &#8217;";
String gold = " <junk/> ! @ and ";
Set<String> set = new HashSet<>();
set.add("reserved");
Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
String result = builder.toString();
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
assertHTMLStripsTo(test, gold, new HashSet<>(Arrays.asList("reserved")));
}
public void testReserved() throws Exception {
@ -344,16 +298,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
"\n\n\n\n\n\n\n\n",
};
for (int i = 0 ; i < testGold.length ; i += 2) {
String test = testGold[i];
String gold = testGold[i + 1];
Reader reader = new HTMLStripCharFilter(new StringReader(test));
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
String result = builder.toString();
assertEquals("Test: '" + test + "'", gold, result);
assertHTMLStripsTo(testGold[i], testGold[i + 1], null);
}
}
@ -362,7 +307,9 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50);
testBuilder.append("ah<?> ??????");
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
Reader reader = new HTMLStripCharFilter
(new BufferedReader(new StringReader(testBuilder.toString()))); //force the use of BufferedReader
assertHTMLStripsTo(reader, testBuilder.toString(), null);
testBuilder.setLength(0);
testBuilder.append("<!--");//comments
@ -370,54 +317,21 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
testBuilder.append("-->foo");
String gold = "foo";
Reader reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(testBuilder.toString(), gold, null);
testBuilder.setLength(0);
testBuilder.append("<?");
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("?>");
gold = "";
reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(testBuilder.toString(), gold, null);
testBuilder.setLength(0);
testBuilder.append("<b ");
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("/>");
gold = "";
reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(testBuilder.toString(), gold, null);
}
private void appendChars(StringBuilder testBuilder, int numChars) {
@ -427,39 +341,19 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
}
private void processBuffer(String test, String assertMsg) throws IOException {
// System.out.println("-------------------processBuffer----------");
Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
}
assertEquals(assertMsg + "::: " + builder.toString() + " is not equal to " + test,
test, builder.toString());
}
public void testComment() throws Exception {
String test = "<!--- three dashes, still a valid comment ---> ";
String gold = " ";
Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(test, gold, null);
test = "<! -- blah > "; // should not be recognized as a comment
gold = " ";
assertHTMLStripsTo(test, gold, null);
StringBuilder testBuilder = new StringBuilder("<!--");
appendChars(testBuilder, TestUtil.nextInt(random(), 0, 1000));
gold = "";
assertHTMLStripsTo(testBuilder.toString(), gold, null);
}
@ -526,83 +420,28 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
+ " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
+ " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
String gold = "onetwo";
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertTrue(builder.toString() + " is not equal to " + gold, builder.toString().equals(gold));
assertHTMLStripsTo(test, gold, null);
test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
gold = "one\ntwo";
reader = new HTMLStripCharFilter(new StringReader(test));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(test, gold, null);
}
public void testScriptQuotes() throws Exception {
String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
String gold = "one\ntwo";
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(test, gold, null);
test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
gold = "hello\n";
reader = new HTMLStripCharFilter(new StringReader(test));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(test, gold, null);
}
public void testEscapeScript() throws Exception {
String test = "one<script no-value-attr>callSomeMethod();</script>two";
String gold = "one<script no-value-attr></script>two";
Set<String> escapedTags = new HashSet<>(Arrays.asList("SCRIPT"));
Reader reader = new HTMLStripCharFilter
(new StringReader(test), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(test, gold, escapedTags);
}
public void testStyle() throws Exception {
@ -612,37 +451,14 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
+ "-->\n"
+ "</style>two";
String gold = "one\ntwo";
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(test, gold, null);
}
public void testEscapeStyle() throws Exception {
String test = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two";
String gold = "one<style type=\"text/css\"></style>two";
Set<String> escapedTags = new HashSet<>(Arrays.asList("STYLE"));
Reader reader = new HTMLStripCharFilter
(new StringReader(test), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(test, gold, escapedTags);
}
public void testBR() throws Exception {
@ -654,135 +470,80 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
"one\ntwo\n",
};
for (int i = 0 ; i < testGold.length ; i += 2) {
String test = testGold[i];
String gold = testGold[i + 1];
Reader reader = new HTMLStripCharFilter(new StringReader(test));
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
String result = builder.toString();
assertEquals("Test: '" + test + "'", gold, result);
assertHTMLStripsTo(testGold[i], testGold[i + 1], null);
}
}
public void testEscapeBR() throws Exception {
String test = "one<BR class='whatever'>two</\nBR\n>";
String gold = "one<BR class='whatever'>two</\nBR\n>";
Set<String> escapedTags = new HashSet<>(Arrays.asList("BR"));
Reader reader = new HTMLStripCharFilter
(new StringReader(test), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(test, gold, escapedTags);
}
public void testInlineTagsNoSpace() throws Exception {
String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
String gold = "onetwo2e.three";
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(test, gold, null);
}
public void testCDATA() throws Exception {
String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
String gold = "one<one><two>three<four></four></two></one>two";
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
int maxNumElems = 100;
String randomHtmlishString1 // Don't create a comment (disallow "<!--") and don't include a closing ">"
= TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
String closedAngleBangNonCDATA = "<!" + randomHtmlishString1 +"-[CDATA[&]]>";
test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
gold = "onetwo<![CDATA[three]]>fourfive";
reader = new HTMLStripCharFilter(new StringReader(test));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
String randomHtmlishString2 // Don't create a comment (disallow "<!--") and don't include a closing ">"
= TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString1 +"-[CDATA[";
String[] testGold = {
"one<![CDATA[<one><two>three<four></four></two></one>]]>two",
"one<one><two>three<four></four></two></one>two",
"one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five",
"onetwo<![CDATA[three]]>fourfive",
"<! [CDATA[&]]>", "",
"<! [CDATA[&] ] >", "",
"<! [CDATA[&]]", "<! [CDATA[&]]", // unclosed angle bang - all input is output
"<!\u2009[CDATA[&]]>", "",
"<!\u2009[CDATA[&]\u2009]\u2009>", "",
"<!\u2009[CDATA[&]\u2009]\u2009", "<!\u2009[CDATA[&]\u2009]\u2009", // unclosed angle bang - all input is output
closedAngleBangNonCDATA, "",
"<![CDATA[", "",
"<![CDATA[<br>", "<br>",
"<![CDATA[<br>]]", "<br>]]",
"<![CDATA[<br>]]>", "<br>",
"<![CDATA[<br>] ] >", "<br>] ] >",
"<![CDATA[<br>]\u2009]\u2009>", "<br>]\u2009]\u2009>",
"<!\u2009[CDATA[", "<!\u2009[CDATA[",
unclosedAngleBangNonCDATA, unclosedAngleBangNonCDATA
};
for (int i = 0 ; i < testGold.length ; i += 2) {
assertHTMLStripsTo(testGold[i], testGold[i + 1], null);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
public void testUnclosedAngleBang() throws Exception {
assertHTMLStripsTo("<![endif]", "<![endif]", null);
}
public void testUppercaseCharacterEntityVariants() throws Exception {
String test = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;";
String gold = " \"-\u00A9>><<\u00AE&";
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(test, gold, null);
}
public void testMSWordMalformedProcessingInstruction() throws Exception {
String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
String gold = "onetwo";
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(test, gold, null);
}
public void testSupplementaryCharsInTags() throws Exception {
String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
assertHTMLStripsTo(test, gold, null);
}
public void testRandomBrokenHTML() throws Exception {
@ -857,4 +618,33 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, " &#57209", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#57209<br>", new String[] { "&#57209" } );
}
public static void assertHTMLStripsTo(String input, String gold, Set<String> escapedTags) throws Exception {
assertHTMLStripsTo(new StringReader(input), gold, escapedTags);
}
public static void assertHTMLStripsTo(Reader input, String gold, Set<String> escapedTags) throws Exception {
HTMLStripCharFilter reader;
if (null == escapedTags) {
reader = new HTMLStripCharFilter(input);
} else {
reader = new HTMLStripCharFilter(input, escapedTags);
}
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1) {
builder.append((char)ch);
}
} catch (Exception e) {
if (gold.equals(builder.toString())) {
throw e;
}
throw new Exception
("('" + builder.toString() + "' is not equal to '" + gold + "'). " + e.getMessage(), e);
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
}