LUCENE-10520 / #11556 HTMLStripCharFilter bugfix (#11724)

Add generated HTMLStripCharFilter and update tests to reflect correct char filter behavior
This commit is contained in:
elliotzlin 2023-10-01 23:50:04 -07:00 committed by GitHub
parent 6930b57ff5
commit 4cff584a48
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 4591 additions and 31956 deletions

View File

@ -169,7 +169,8 @@ Changes in runtime behavior
Bug Fixes
---------------------
(No changes)
* GITHUB#11556: HTMLStripCharFilter fails on '>' or '<' characters in attribute values. (Elliot Lin)
Build
---------------------

View File

@ -1,6 +1,6 @@
{
"gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex": "d1aa75b9b37646efe31731394f84a063eb7eed9d",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java": "90cc507377d151340af1631d0f2eca69b6f7de4d",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex": "5c9cf5fd59e8cccdcb2c663be2c02161e0bfc707"
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java": "d18a049d73068c271657550c33ec614187df778f",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex": "5320cb952146a3f9da93b98067a6ffbbb284dc0f"
}

View File

@ -111,7 +111,8 @@ SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )*
OpenTagAttribute = {Name} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} | {OpenTagAttribute} )*
InlineElment = ( [aAbBiIqQsSuU] |
[aA][bB][bB][rR] |

View File

@ -17,10 +17,12 @@
package org.apache.lucene.analysis.charfilter;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;
@ -172,7 +174,7 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
"<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>",
"",
"The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>",
"The <a href=medical\">http://www.advancedmd.com>medical practice software",
"The <a href=http://www.advancedmd.com>medical practice software",
"<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...",
"Levi.com/BMX 2008 Clip of the Week 29...",
"<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly",
@ -250,13 +252,13 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
"<a href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">",
"",
// "<" before ">" inhibits tag recognition
// LUCENE-10520: "<" and ">" in attribute values are valid per the HTML5 spec
"<input type=\"text\" value=\"<search here>\">",
"<input type=\"text\" value=\"\n\">",
"",
"<input type=\"text\" value=\"<search here\">",
"<input type=\"text\" value=\"\n",
"",
"<input type=\"text\" value=\"search here>\">",
"\">",
"",
// "<" and ">" chars are accepted in on[Event] attribute values
"<input type=\"text\" value=\"&lt;search here&gt;\" onFocus=\"this.value='<search here>'\">",
@ -632,6 +634,22 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
analyzer.close();
}
/**
 * Test that a tag whose attribute value contains {@code '>'} or {@code '<'} is parsed correctly.
 *
 * <p>The input document contains a {@code <p>} tag whose {@code class} attribute value is {@code
 * foo>bar}. The filtered output, once trimmed, is expected to consist only of the title text and
 * the paragraph text, separated by the newlines the filter emits in place of the removed tags.
 *
 * @throws IOException on IO error
 * @see <a href="https://github.com/apache/lucene/issues/11556">GITHUB#11556</a>
 */
public void testForIssue10520() throws IOException {
  String test =
      "<!DOCTYPE html><html lang=\"en\"><head><title>Test</title></head><body><p class=\"foo>bar\" id=\"baz\">Some text.</p></body></html>";
  StringWriter result = new StringWriter();
  // Close the reader and the filter even when the transfer fails.
  try (Reader reader = new StringReader(test);
      HTMLStripCharFilter filter = new HTMLStripCharFilter(reader)) {
    filter.transferTo(result);
  }
  assertEquals("Test\n\n\n\nSome text.", result.toString().trim());
}
public static void assertHTMLStripsTo(String input, String gold, Set<String> escapedTags)
throws Exception {
assertHTMLStripsTo(new StringReader(input), gold, escapedTags);