mirror of https://github.com/apache/lucene.git
Add generated HTMLStripCharFilter and update tests to reflect correct char filter behavior
This commit is contained in:
parent
6930b57ff5
commit
4cff584a48
|
@ -169,7 +169,8 @@ Changes in runtime behavior
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
---------------------
|
---------------------
|
||||||
(No changes)
|
|
||||||
|
* GITHUB#11556: HTMLStripCharFilter fails on '>' or '<' characters in attribute values. (Elliot Lin)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
---------------------
|
---------------------
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
|
"gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
|
||||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex": "d1aa75b9b37646efe31731394f84a063eb7eed9d",
|
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex": "d1aa75b9b37646efe31731394f84a063eb7eed9d",
|
||||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java": "90cc507377d151340af1631d0f2eca69b6f7de4d",
|
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java": "d18a049d73068c271657550c33ec614187df778f",
|
||||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex": "5c9cf5fd59e8cccdcb2c663be2c02161e0bfc707"
|
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex": "5320cb952146a3f9da93b98067a6ffbbb284dc0f"
|
||||||
}
|
}
|
File diff suppressed because it is too large
Load Diff
|
@ -111,7 +111,8 @@ SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
|
||||||
DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
|
DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
|
||||||
ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
|
ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
|
||||||
EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
|
EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
|
||||||
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )*
|
OpenTagAttribute = {Name} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
|
||||||
|
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} | {OpenTagAttribute} )*
|
||||||
|
|
||||||
InlineElment = ( [aAbBiIqQsSuU] |
|
InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
[aA][bB][bB][rR] |
|
[aA][bB][bB][rR] |
|
||||||
|
|
|
@ -17,10 +17,12 @@
|
||||||
package org.apache.lucene.analysis.charfilter;
|
package org.apache.lucene.analysis.charfilter;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
import java.io.StringWriter;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
@ -172,7 +174,7 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
|
||||||
"<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>",
|
"<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>",
|
||||||
"",
|
"",
|
||||||
"The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>",
|
"The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>",
|
||||||
"The <a href=medical\">http://www.advancedmd.com>medical practice software",
|
"The <a href=http://www.advancedmd.com>medical practice software",
|
||||||
"<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...",
|
"<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...",
|
||||||
"Levi.com/BMX 2008 Clip of the Week 29...",
|
"Levi.com/BMX 2008 Clip of the Week 29...",
|
||||||
"<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly",
|
"<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly",
|
||||||
|
@ -250,13 +252,13 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
|
||||||
"<a href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">",
|
"<a href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">",
|
||||||
"",
|
"",
|
||||||
|
|
||||||
// "<" before ">" inhibits tag recognition
|
// LUCENE-10520: "<" and ">" in attribute values are valid per the HTML5 spec
|
||||||
"<input type=\"text\" value=\"<search here>\">",
|
"<input type=\"text\" value=\"<search here>\">",
|
||||||
"<input type=\"text\" value=\"\n\">",
|
"",
|
||||||
"<input type=\"text\" value=\"<search here\">",
|
"<input type=\"text\" value=\"<search here\">",
|
||||||
"<input type=\"text\" value=\"\n",
|
"",
|
||||||
"<input type=\"text\" value=\"search here>\">",
|
"<input type=\"text\" value=\"search here>\">",
|
||||||
"\">",
|
"",
|
||||||
|
|
||||||
// "<" and ">" chars are accepted in on[Event] attribute values
|
// "<" and ">" chars are accepted in on[Event] attribute values
|
||||||
"<input type=\"text\" value=\"<search here>\" onFocus=\"this.value='<search here>'\">",
|
"<input type=\"text\" value=\"<search here>\" onFocus=\"this.value='<search here>'\">",
|
||||||
|
@ -632,6 +634,22 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
|
||||||
analyzer.close();
|
analyzer.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests that attribute values containing {@code '>'} or {@code '<'} characters are parsed correctly.
|
||||||
|
*
|
||||||
|
* @see <a href="https://github.com/apache/lucene/issues/11556">GITHUB#11556</a>
|
||||||
|
* @throws IOException on IO error
|
||||||
|
*/
|
||||||
|
public void testForIssue10520() throws IOException {
|
||||||
|
String test =
|
||||||
|
"<!DOCTYPE html><html lang=\"en\"><head><title>Test</title></head><body><p class=\"foo>bar\" id=\"baz\">Some text.</p></body></html>";
|
||||||
|
Reader reader = new StringReader(test);
|
||||||
|
HTMLStripCharFilter filter = new HTMLStripCharFilter(reader);
|
||||||
|
StringWriter result = new StringWriter();
|
||||||
|
filter.transferTo(result);
|
||||||
|
assertEquals("Test\n\n\n\nSome text.", result.toString().trim());
|
||||||
|
}
|
||||||
|
|
||||||
public static void assertHTMLStripsTo(String input, String gold, Set<String> escapedTags)
|
public static void assertHTMLStripsTo(String input, String gold, Set<String> escapedTags)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
assertHTMLStripsTo(new StringReader(input), gold, escapedTags);
|
assertHTMLStripsTo(new StringReader(input), gold, escapedTags);
|
||||||
|
|
Loading…
Reference in New Issue