mirror of https://github.com/apache/lucene.git
SOLR-5983: HTMLStripCharFilter is treating CDATA sections incorrectly
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1588136 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
99fea41839
commit
2acbcd08cc
|
@ -283,6 +283,9 @@ Bug fixes
|
||||||
|
|
||||||
* LUCENE-5568: Benchmark module's "default.codec" option didn't work. (David Smiley)
|
* LUCENE-5568: Benchmark module's "default.codec" option didn't work. (David Smiley)
|
||||||
|
|
||||||
|
* SOLR-5983: HTMLStripCharFilter is treating CDATA sections incorrectly.
|
||||||
|
(Dan Funk, Steve Rowe)
|
||||||
|
|
||||||
Test Framework
|
Test Framework
|
||||||
|
|
||||||
* LUCENE-5592: Incorrectly reported uncloseable files. (Dawid Weiss)
|
* LUCENE-5592: Incorrectly reported uncloseable files. (Dawid Weiss)
|
||||||
|
|
|
@ -31134,7 +31134,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
cumulativeDiff += inputSegment.length() - outputSegment.length();
|
cumulativeDiff += inputSegment.length() - outputSegment.length();
|
||||||
// position the correction at (already output length) + (substitution length)
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||||
eofReturnValue = outputSegment.nextChar();
|
eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case BANG:
|
case BANG:
|
||||||
|
@ -31147,7 +31147,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
case LEFT_ANGLE_BRACKET_SLASH:
|
case LEFT_ANGLE_BRACKET_SLASH:
|
||||||
case LEFT_ANGLE_BRACKET_SPACE: { // Include
|
case LEFT_ANGLE_BRACKET_SPACE: { // Include
|
||||||
outputSegment = inputSegment;
|
outputSegment = inputSegment;
|
||||||
eofReturnValue = outputSegment.nextChar();
|
eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default: {
|
default: {
|
||||||
|
@ -31506,7 +31506,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
}
|
}
|
||||||
case 84: break;
|
case 84: break;
|
||||||
case 32:
|
case 32:
|
||||||
{ yybegin(COMMENT);
|
{ if (inputSegment.length() > 2) { // Chars between "<!" and "--" - this is not a comment
|
||||||
|
inputSegment.append(yytext());
|
||||||
|
} else {
|
||||||
|
yybegin(COMMENT);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
case 85: break;
|
case 85: break;
|
||||||
case 33:
|
case 33:
|
||||||
|
@ -31611,12 +31615,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
}
|
}
|
||||||
case 99: break;
|
case 99: break;
|
||||||
case 47:
|
case 47:
|
||||||
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
{ if (inputSegment.length() > 2) { // Chars between "<!" and "[CDATA[" - this is not a CDATA section
|
||||||
cumulativeDiff += inputSegment.length() + yylength();
|
inputSegment.append(yytext());
|
||||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
} else {
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||||
inputSegment.clear();
|
cumulativeDiff += inputSegment.length() + yylength();
|
||||||
yybegin(CDATA);
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
|
inputSegment.clear();
|
||||||
|
yybegin(CDATA);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
case 100: break;
|
case 100: break;
|
||||||
case 48:
|
case 48:
|
||||||
|
|
|
@ -309,7 +309,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
cumulativeDiff += inputSegment.length() - outputSegment.length();
|
cumulativeDiff += inputSegment.length() - outputSegment.length();
|
||||||
// position the correction at (already output length) + (substitution length)
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||||
eofReturnValue = outputSegment.nextChar();
|
eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case BANG:
|
case BANG:
|
||||||
|
@ -322,7 +322,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
case LEFT_ANGLE_BRACKET_SLASH:
|
case LEFT_ANGLE_BRACKET_SLASH:
|
||||||
case LEFT_ANGLE_BRACKET_SPACE: { // Include
|
case LEFT_ANGLE_BRACKET_SPACE: { // Include
|
||||||
outputSegment = inputSegment;
|
outputSegment = inputSegment;
|
||||||
eofReturnValue = outputSegment.nextChar();
|
eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default: {
|
default: {
|
||||||
|
@ -754,7 +754,13 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
}
|
}
|
||||||
|
|
||||||
<BANG> {
|
<BANG> {
|
||||||
"--" { yybegin(COMMENT); }
|
"--" {
|
||||||
|
if (inputSegment.length() > 2) { // Chars between "<!" and "--" - this is not a comment
|
||||||
|
inputSegment.append(yytext());
|
||||||
|
} else {
|
||||||
|
yybegin(COMMENT);
|
||||||
|
}
|
||||||
|
}
|
||||||
">" {
|
">" {
|
||||||
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||||
cumulativeDiff += inputSegment.length() + yylength();
|
cumulativeDiff += inputSegment.length() + yylength();
|
||||||
|
@ -771,12 +777,16 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
// [21] CDEnd ::= ']]>'
|
// [21] CDEnd ::= ']]>'
|
||||||
//
|
//
|
||||||
"[CDATA[" {
|
"[CDATA[" {
|
||||||
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
if (inputSegment.length() > 2) { // Chars between "<!" and "[CDATA[" - this is not a CDATA section
|
||||||
cumulativeDiff += inputSegment.length() + yylength();
|
inputSegment.append(yytext());
|
||||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
} else {
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||||
inputSegment.clear();
|
cumulativeDiff += inputSegment.length() + yylength();
|
||||||
yybegin(CDATA);
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
|
inputSegment.clear();
|
||||||
|
yybegin(CDATA);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
[^] {
|
[^] {
|
||||||
inputSegment.append(zzBuffer[zzStartRead]);
|
inputSegment.append(zzBuffer[zzStartRead]);
|
||||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.charfilter;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
@ -26,6 +25,7 @@ import java.io.StringReader;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
@ -53,27 +53,14 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
|
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
|
||||||
//
|
//
|
||||||
public void test() throws IOException {
|
public void test() throws Exception {
|
||||||
String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
|
String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
|
||||||
"another <a href=\"http://lucene.apache.org/\">link</a>. " +
|
"another <a href=\"http://lucene.apache.org/\">link</a>. " +
|
||||||
"This is an entity: & plus a <. Here is an &. <!-- is a comment -->";
|
"This is an entity: & plus a <. Here is an &. <!-- is a comment -->";
|
||||||
String gold = "\nthis is some text\n here is a link and " +
|
String gold = "\nthis is some text\n here is a link and " +
|
||||||
"another link. " +
|
"another link. " +
|
||||||
"This is an entity: & plus a <. Here is an &. ";
|
"This is an entity: & plus a <. Here is an &. ";
|
||||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(new StringReader(html));
|
assertHTMLStripsTo(html, gold, null);
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
int ch = -1;
|
|
||||||
char [] goldArray = gold.toCharArray();
|
|
||||||
int position = 0;
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
char theChar = (char) ch;
|
|
||||||
builder.append(theChar);
|
|
||||||
assertTrue("\"" + theChar + "\"" + " at position: " + position + " does not equal: " + goldArray[position]
|
|
||||||
+ " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
|
|
||||||
position++;
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//Some sanity checks, but not a full-fledged check
|
//Some sanity checks, but not a full-fledged check
|
||||||
|
@ -100,61 +87,28 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
String gold = "This is a test";
|
String gold = "This is a test";
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString().trim() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString().trim());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void testGamma() throws Exception {
|
|
||||||
String test = "Γ";
|
|
||||||
String gold = "\u0393";
|
|
||||||
Set<String> set = new HashSet<>();
|
|
||||||
set.add("reserved");
|
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
int ch = 0;
|
|
||||||
while ((ch = reader.read()) != -1){
|
while ((ch = reader.read()) != -1){
|
||||||
builder.append((char)ch);
|
builder.append((char)ch);
|
||||||
}
|
}
|
||||||
String result = builder.toString();
|
// Compare trim()'d output to gold
|
||||||
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
|
assertEquals("'" + builder.toString().trim() + "' is not equal to '" + gold + "'",
|
||||||
|
gold, builder.toString().trim());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testGamma() throws Exception {
|
||||||
|
assertHTMLStripsTo("Γ", "\u0393", new HashSet<>(Arrays.asList("reserved")));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testEntities() throws Exception {
|
public void testEntities() throws Exception {
|
||||||
String test = " <foo> Übermensch = Γ bar Γ";
|
String test = " <foo> Übermensch = Γ bar Γ";
|
||||||
String gold = " <foo> \u00DCbermensch = \u0393 bar \u0393";
|
String gold = " <foo> \u00DCbermensch = \u0393 bar \u0393";
|
||||||
Set<String> set = new HashSet<>();
|
assertHTMLStripsTo(test, gold, new HashSet<>(Arrays.asList("reserved")));
|
||||||
set.add("reserved");
|
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
int ch = 0;
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
String result = builder.toString();
|
|
||||||
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testMoreEntities() throws Exception {
|
public void testMoreEntities() throws Exception {
|
||||||
String test = " <junk/> ! @ and ’";
|
String test = " <junk/> ! @ and ’";
|
||||||
String gold = " <junk/> ! @ and ’";
|
String gold = " <junk/> ! @ and ’";
|
||||||
Set<String> set = new HashSet<>();
|
assertHTMLStripsTo(test, gold, new HashSet<>(Arrays.asList("reserved")));
|
||||||
set.add("reserved");
|
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
int ch = 0;
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
String result = builder.toString();
|
|
||||||
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testReserved() throws Exception {
|
public void testReserved() throws Exception {
|
||||||
|
@ -344,16 +298,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
"\n\n\n\n\n\n\n\n",
|
"\n\n\n\n\n\n\n\n",
|
||||||
};
|
};
|
||||||
for (int i = 0 ; i < testGold.length ; i += 2) {
|
for (int i = 0 ; i < testGold.length ; i += 2) {
|
||||||
String test = testGold[i];
|
assertHTMLStripsTo(testGold[i], testGold[i + 1], null);
|
||||||
String gold = testGold[i + 1];
|
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
int ch = 0;
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
String result = builder.toString();
|
|
||||||
assertEquals("Test: '" + test + "'", gold, result);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -362,7 +307,9 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50);
|
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50);
|
||||||
testBuilder.append("ah<?> ??????");
|
testBuilder.append("ah<?> ??????");
|
||||||
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
||||||
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
|
Reader reader = new HTMLStripCharFilter
|
||||||
|
(new BufferedReader(new StringReader(testBuilder.toString()))); //force the use of BufferedReader
|
||||||
|
assertHTMLStripsTo(reader, testBuilder.toString(), null);
|
||||||
|
|
||||||
testBuilder.setLength(0);
|
testBuilder.setLength(0);
|
||||||
testBuilder.append("<!--");//comments
|
testBuilder.append("<!--");//comments
|
||||||
|
@ -370,54 +317,21 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
testBuilder.append("-->foo");
|
testBuilder.append("-->foo");
|
||||||
String gold = "foo";
|
String gold = "foo";
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
|
assertHTMLStripsTo(testBuilder.toString(), gold, null);
|
||||||
int ch = 0;
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
|
|
||||||
testBuilder.setLength(0);
|
testBuilder.setLength(0);
|
||||||
testBuilder.append("<?");
|
testBuilder.append("<?");
|
||||||
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
||||||
testBuilder.append("?>");
|
testBuilder.append("?>");
|
||||||
gold = "";
|
gold = "";
|
||||||
reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
|
assertHTMLStripsTo(testBuilder.toString(), gold, null);
|
||||||
ch = 0;
|
|
||||||
builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
|
|
||||||
testBuilder.setLength(0);
|
testBuilder.setLength(0);
|
||||||
testBuilder.append("<b ");
|
testBuilder.append("<b ");
|
||||||
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
||||||
testBuilder.append("/>");
|
testBuilder.append("/>");
|
||||||
gold = "";
|
gold = "";
|
||||||
reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
|
assertHTMLStripsTo(testBuilder.toString(), gold, null);
|
||||||
ch = 0;
|
|
||||||
builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void appendChars(StringBuilder testBuilder, int numChars) {
|
private void appendChars(StringBuilder testBuilder, int numChars) {
|
||||||
|
@ -427,39 +341,19 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void processBuffer(String test, String assertMsg) throws IOException {
|
|
||||||
// System.out.println("-------------------processBuffer----------");
|
|
||||||
Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
|
|
||||||
int ch = 0;
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
|
|
||||||
}
|
|
||||||
assertEquals(assertMsg + "::: " + builder.toString() + " is not equal to " + test,
|
|
||||||
test, builder.toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testComment() throws Exception {
|
public void testComment() throws Exception {
|
||||||
|
|
||||||
String test = "<!--- three dashes, still a valid comment ---> ";
|
String test = "<!--- three dashes, still a valid comment ---> ";
|
||||||
String gold = " ";
|
String gold = " ";
|
||||||
Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
|
assertHTMLStripsTo(test, gold, null);
|
||||||
int ch = 0;
|
|
||||||
StringBuilder builder = new StringBuilder();
|
test = "<! -- blah > "; // should not be recognized as a comment
|
||||||
try {
|
gold = " ";
|
||||||
while ((ch = reader.read()) != -1){
|
assertHTMLStripsTo(test, gold, null);
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
StringBuilder testBuilder = new StringBuilder("<!--");
|
||||||
} finally {
|
appendChars(testBuilder, TestUtil.nextInt(random(), 0, 1000));
|
||||||
// System.out.println("String: " + builder.toString());
|
gold = "";
|
||||||
}
|
assertHTMLStripsTo(testBuilder.toString(), gold, null);
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -526,83 +420,28 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
+ " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
|
+ " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
|
||||||
+ " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
|
+ " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
|
||||||
String gold = "onetwo";
|
String gold = "onetwo";
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
assertHTMLStripsTo(test, gold, null);
|
||||||
int ch = 0;
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertTrue(builder.toString() + " is not equal to " + gold, builder.toString().equals(gold));
|
|
||||||
|
|
||||||
test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
|
test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
|
||||||
gold = "one\ntwo";
|
gold = "one\ntwo";
|
||||||
reader = new HTMLStripCharFilter(new StringReader(test));
|
assertHTMLStripsTo(test, gold, null);
|
||||||
ch = 0;
|
|
||||||
builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testScriptQuotes() throws Exception {
|
public void testScriptQuotes() throws Exception {
|
||||||
String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
|
String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
|
||||||
String gold = "one\ntwo";
|
String gold = "one\ntwo";
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
assertHTMLStripsTo(test, gold, null);
|
||||||
int ch = 0;
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
|
|
||||||
test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
|
test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
|
||||||
gold = "hello\n";
|
gold = "hello\n";
|
||||||
reader = new HTMLStripCharFilter(new StringReader(test));
|
assertHTMLStripsTo(test, gold, null);
|
||||||
ch = 0;
|
|
||||||
builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testEscapeScript() throws Exception {
|
public void testEscapeScript() throws Exception {
|
||||||
String test = "one<script no-value-attr>callSomeMethod();</script>two";
|
String test = "one<script no-value-attr>callSomeMethod();</script>two";
|
||||||
String gold = "one<script no-value-attr></script>two";
|
String gold = "one<script no-value-attr></script>two";
|
||||||
Set<String> escapedTags = new HashSet<>(Arrays.asList("SCRIPT"));
|
Set<String> escapedTags = new HashSet<>(Arrays.asList("SCRIPT"));
|
||||||
Reader reader = new HTMLStripCharFilter
|
assertHTMLStripsTo(test, gold, escapedTags);
|
||||||
(new StringReader(test), escapedTags);
|
|
||||||
int ch = 0;
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testStyle() throws Exception {
|
public void testStyle() throws Exception {
|
||||||
|
@ -612,37 +451,14 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
+ "-->\n"
|
+ "-->\n"
|
||||||
+ "</style>two";
|
+ "</style>two";
|
||||||
String gold = "one\ntwo";
|
String gold = "one\ntwo";
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
assertHTMLStripsTo(test, gold, null);
|
||||||
int ch = 0;
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testEscapeStyle() throws Exception {
|
public void testEscapeStyle() throws Exception {
|
||||||
String test = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two";
|
String test = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two";
|
||||||
String gold = "one<style type=\"text/css\"></style>two";
|
String gold = "one<style type=\"text/css\"></style>two";
|
||||||
Set<String> escapedTags = new HashSet<>(Arrays.asList("STYLE"));
|
Set<String> escapedTags = new HashSet<>(Arrays.asList("STYLE"));
|
||||||
Reader reader = new HTMLStripCharFilter
|
assertHTMLStripsTo(test, gold, escapedTags);
|
||||||
(new StringReader(test), escapedTags);
|
|
||||||
int ch = 0;
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testBR() throws Exception {
|
public void testBR() throws Exception {
|
||||||
|
@ -654,135 +470,80 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
"one\ntwo\n",
|
"one\ntwo\n",
|
||||||
};
|
};
|
||||||
for (int i = 0 ; i < testGold.length ; i += 2) {
|
for (int i = 0 ; i < testGold.length ; i += 2) {
|
||||||
String test = testGold[i];
|
assertHTMLStripsTo(testGold[i], testGold[i + 1], null);
|
||||||
String gold = testGold[i + 1];
|
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
int ch = 0;
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
String result = builder.toString();
|
|
||||||
assertEquals("Test: '" + test + "'", gold, result);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
public void testEscapeBR() throws Exception {
|
public void testEscapeBR() throws Exception {
|
||||||
String test = "one<BR class='whatever'>two</\nBR\n>";
|
String test = "one<BR class='whatever'>two</\nBR\n>";
|
||||||
String gold = "one<BR class='whatever'>two</\nBR\n>";
|
String gold = "one<BR class='whatever'>two</\nBR\n>";
|
||||||
Set<String> escapedTags = new HashSet<>(Arrays.asList("BR"));
|
Set<String> escapedTags = new HashSet<>(Arrays.asList("BR"));
|
||||||
Reader reader = new HTMLStripCharFilter
|
assertHTMLStripsTo(test, gold, escapedTags);
|
||||||
(new StringReader(test), escapedTags);
|
|
||||||
int ch = 0;
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testInlineTagsNoSpace() throws Exception {
|
public void testInlineTagsNoSpace() throws Exception {
|
||||||
String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
|
String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
|
||||||
String gold = "onetwo2e.three";
|
String gold = "onetwo2e.three";
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
assertHTMLStripsTo(test, gold, null);
|
||||||
int ch = 0;
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testCDATA() throws Exception {
|
public void testCDATA() throws Exception {
|
||||||
String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
|
int maxNumElems = 100;
|
||||||
String gold = "one<one><two>three<four></four></two></one>two";
|
String randomHtmlishString1 // Don't create a comment (disallow "<!--") and don't include a closing ">"
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
= TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
|
||||||
int ch = 0;
|
String closedAngleBangNonCDATA = "<!" + randomHtmlishString1 +"-[CDATA[&]]>";
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
|
|
||||||
test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
|
String randomHtmlishString2 // Don't create a comment (disallow "<!--") and don't include a closing ">"
|
||||||
gold = "onetwo<![CDATA[three]]>fourfive";
|
= TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
|
||||||
reader = new HTMLStripCharFilter(new StringReader(test));
|
String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString1 +"-[CDATA[";
|
||||||
ch = 0;
|
|
||||||
builder = new StringBuilder();
|
String[] testGold = {
|
||||||
try {
|
"one<![CDATA[<one><two>three<four></four></two></one>]]>two",
|
||||||
while ((ch = reader.read()) != -1){
|
"one<one><two>three<four></four></two></one>two",
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
"one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five",
|
||||||
} finally {
|
"onetwo<![CDATA[three]]>fourfive",
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
|
"<! [CDATA[&]]>", "",
|
||||||
|
"<! [CDATA[&] ] >", "",
|
||||||
|
"<! [CDATA[&]]", "<! [CDATA[&]]", // unclosed angle bang - all input is output
|
||||||
|
"<!\u2009[CDATA[&]]>", "",
|
||||||
|
"<!\u2009[CDATA[&]\u2009]\u2009>", "",
|
||||||
|
"<!\u2009[CDATA[&]\u2009]\u2009", "<!\u2009[CDATA[&]\u2009]\u2009", // unclosed angle bang - all input is output
|
||||||
|
closedAngleBangNonCDATA, "",
|
||||||
|
"<![CDATA[", "",
|
||||||
|
"<![CDATA[<br>", "<br>",
|
||||||
|
"<![CDATA[<br>]]", "<br>]]",
|
||||||
|
"<![CDATA[<br>]]>", "<br>",
|
||||||
|
"<![CDATA[<br>] ] >", "<br>] ] >",
|
||||||
|
"<![CDATA[<br>]\u2009]\u2009>", "<br>]\u2009]\u2009>",
|
||||||
|
"<!\u2009[CDATA[", "<!\u2009[CDATA[",
|
||||||
|
unclosedAngleBangNonCDATA, unclosedAngleBangNonCDATA
|
||||||
|
};
|
||||||
|
for (int i = 0 ; i < testGold.length ; i += 2) {
|
||||||
|
assertHTMLStripsTo(testGold[i], testGold[i + 1], null);
|
||||||
}
|
}
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
}
|
||||||
gold, builder.toString());
|
|
||||||
|
public void testUnclosedAngleBang() throws Exception {
|
||||||
|
assertHTMLStripsTo("<![endif]", "<![endif]", null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testUppercaseCharacterEntityVariants() throws Exception {
|
public void testUppercaseCharacterEntityVariants() throws Exception {
|
||||||
String test = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;";
|
String test = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;";
|
||||||
String gold = " \"-\u00A9>><<\u00AE&";
|
String gold = " \"-\u00A9>><<\u00AE&";
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
assertHTMLStripsTo(test, gold, null);
|
||||||
int ch = 0;
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testMSWordMalformedProcessingInstruction() throws Exception {
|
public void testMSWordMalformedProcessingInstruction() throws Exception {
|
||||||
String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
|
String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
|
||||||
String gold = "onetwo";
|
String gold = "onetwo";
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
assertHTMLStripsTo(test, gold, null);
|
||||||
int ch = 0;
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSupplementaryCharsInTags() throws Exception {
|
public void testSupplementaryCharsInTags() throws Exception {
|
||||||
String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
|
String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
|
||||||
String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
|
String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
assertHTMLStripsTo(test, gold, null);
|
||||||
int ch = 0;
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
try {
|
|
||||||
while ((ch = reader.read()) != -1){
|
|
||||||
builder.append((char)ch);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
// System.out.println("String: " + builder.toString());
|
|
||||||
}
|
|
||||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
|
||||||
gold, builder.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testRandomBrokenHTML() throws Exception {
|
public void testRandomBrokenHTML() throws Exception {
|
||||||
|
@ -857,4 +618,33 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
|
assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
|
||||||
assertAnalyzesTo(analyzer, " �<br>", new String[] { "�" } );
|
assertAnalyzesTo(analyzer, " �<br>", new String[] { "�" } );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static void assertHTMLStripsTo(String input, String gold, Set<String> escapedTags) throws Exception {
|
||||||
|
assertHTMLStripsTo(new StringReader(input), gold, escapedTags);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void assertHTMLStripsTo(Reader input, String gold, Set<String> escapedTags) throws Exception {
|
||||||
|
HTMLStripCharFilter reader;
|
||||||
|
if (null == escapedTags) {
|
||||||
|
reader = new HTMLStripCharFilter(input);
|
||||||
|
} else {
|
||||||
|
reader = new HTMLStripCharFilter(input, escapedTags);
|
||||||
|
}
|
||||||
|
int ch = 0;
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
try {
|
||||||
|
while ((ch = reader.read()) != -1) {
|
||||||
|
builder.append((char)ch);
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
if (gold.equals(builder.toString())) {
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
throw new Exception
|
||||||
|
("('" + builder.toString() + "' is not equal to '" + gold + "'). " + e.getMessage(), e);
|
||||||
|
}
|
||||||
|
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||||
|
gold, builder.toString());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue