LUCENE-10520 / #11556 HTMLStripCharFilter bugfix (#11724)

Add generated HTMLStripCharFilter and update tests to reflect correct char filter behavior
This commit is contained in:
elliotzlin 2023-10-01 23:50:04 -07:00 committed by GitHub
parent 6930b57ff5
commit 4cff584a48
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 4591 additions and 31956 deletions

View File

@ -169,7 +169,8 @@ Changes in runtime behavior
Bug Fixes Bug Fixes
--------------------- ---------------------
(No changes)
* GITHUB#11556: HTMLStripCharFilter fails on '>' or '<' characters in attribute values. (Elliot Lin)
Build Build
--------------------- ---------------------

View File

@ -1,6 +1,6 @@
{ {
"gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390", "gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex": "d1aa75b9b37646efe31731394f84a063eb7eed9d", "lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex": "d1aa75b9b37646efe31731394f84a063eb7eed9d",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java": "90cc507377d151340af1631d0f2eca69b6f7de4d", "lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java": "d18a049d73068c271657550c33ec614187df778f",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex": "5c9cf5fd59e8cccdcb2c663be2c02161e0bfc707" "lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex": "5320cb952146a3f9da93b98067a6ffbbb284dc0f"
} }

View File

@ -111,7 +111,8 @@ SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" ) DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" ) ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} ) EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )* OpenTagAttribute = {Name} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} | {OpenTagAttribute} )*
InlineElment = ( [aAbBiIqQsSuU] | InlineElment = ( [aAbBiIqQsSuU] |
[aA][bB][bB][rR] | [aA][bB][bB][rR] |

View File

@ -17,10 +17,12 @@
package org.apache.lucene.analysis.charfilter; package org.apache.lucene.analysis.charfilter;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.io.StringReader; import java.io.StringReader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashSet; import java.util.HashSet;
@ -172,7 +174,7 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
"<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>", "<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>",
"", "",
"The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>", "The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>",
"The <a href=medical\">http://www.advancedmd.com>medical practice software", "The <a href=http://www.advancedmd.com>medical practice software",
"<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...", "<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...",
"Levi.com/BMX 2008 Clip of the Week 29...", "Levi.com/BMX 2008 Clip of the Week 29...",
"<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly", "<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly",
@ -250,13 +252,13 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
"<a href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">", "<a href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">",
"", "",
// "<" before ">" inhibits tag recognition // LUCENE-10520: "<" and ">" in attribute values are valid per the HTML5 spec
"<input type=\"text\" value=\"<search here>\">", "<input type=\"text\" value=\"<search here>\">",
"<input type=\"text\" value=\"\n\">", "",
"<input type=\"text\" value=\"<search here\">", "<input type=\"text\" value=\"<search here\">",
"<input type=\"text\" value=\"\n", "",
"<input type=\"text\" value=\"search here>\">", "<input type=\"text\" value=\"search here>\">",
"\">", "",
// "<" and ">" chars are accepted in on[Event] attribute values // "<" and ">" chars are accepted in on[Event] attribute values
"<input type=\"text\" value=\"&lt;search here&gt;\" onFocus=\"this.value='<search here>'\">", "<input type=\"text\" value=\"&lt;search here&gt;\" onFocus=\"this.value='<search here>'\">",
@ -632,6 +634,22 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
analyzer.close(); analyzer.close();
} }
/**
 * Tests that a tag whose quoted attribute value contains {@code '>'} or {@code '<'} is stripped
 * as a whole, so that only the text content of the document remains.
 *
 * <p>The method name refers to LUCENE-10520, which is the same issue as GITHUB#11556.
 *
 * @throws IOException on IO error
 * @see <a href="https://github.com/apache/lucene/issues/11556">GITHUB#11556</a>
 */
public void testForIssue10520() throws IOException {
  String test =
      "<!DOCTYPE html><html lang=\"en\"><head><title>Test</title></head><body><p class=\"foo>bar\" id=\"baz\">Some text.</p></body></html>";
  StringWriter result = new StringWriter();
  // try-with-resources guarantees the filter is closed even if the transfer throws.
  try (HTMLStripCharFilter filter = new HTMLStripCharFilter(new StringReader(test))) {
    filter.transferTo(result);
  }
  // Surrounding whitespace left behind by the stripped tags is not part of the contract.
  assertEquals("Test\n\n\n\nSome text.", result.toString().trim());
}
public static void assertHTMLStripsTo(String input, String gold, Set<String> escapedTags) public static void assertHTMLStripsTo(String input, String gold, Set<String> escapedTags)
throws Exception { throws Exception {
assertHTMLStripsTo(new StringReader(input), gold, escapedTags); assertHTMLStripsTo(new StringReader(input), gold, escapedTags);