Fix handling of "<" and ">" characters that appear inside <script> elements.

Fixes bug #16952


git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150138 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christoph Goller 2003-11-23 18:37:32 +00:00
parent 4e9ed24307
commit 41ab0e084c
4 changed files with 843 additions and 565 deletions

View File

@ -16,7 +16,6 @@ public class HTMLParser implements HTMLParserConstants {
boolean inTitle = false;
boolean inMetaTag = false;
boolean inStyle = false;
boolean inScript = false;
boolean afterTag = false;
boolean afterSpace = false;
String eol = System.getProperty("line.separator");
@ -119,8 +118,6 @@ InterruptedException {
}
void addText(String text) throws IOException {
if (inScript)
return;
if (inStyle)
return;
if (inMetaTag)
@ -147,8 +144,6 @@ InterruptedException {
}
void addSpace() throws IOException {
if (inScript)
return;
if (!afterSpace) {
if (inTitle)
title.append(" ");
@ -167,6 +162,7 @@ InterruptedException {
label_1:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ScriptStart:
case TagName:
case DeclName:
case Comment1:
@ -195,6 +191,10 @@ InterruptedException {
CommentTag();
afterTag = true;
break;
case ScriptStart:
ScriptTag();
afterTag = true;
break;
case Word:
t = jj_consume_token(Word);
addText(t.image); afterTag = false;
@ -232,11 +232,7 @@ InterruptedException {
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
if (inScript) { // keep track if in <SCRIPT>
inScript = !tagName.equalsIgnoreCase("</script");
} else {
inScript = tagName.equalsIgnoreCase("<script");
}
label_2:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
@ -414,6 +410,23 @@ null)
}
}
/**
 * Consumes one script element from the token stream: a single ScriptStart
 * token, then every ScriptText token that follows, then a single ScriptEnd
 * token. Nothing is done with the consumed tokens here.
 *
 * @throws ParseException declared by the grammar; presumably raised by
 *         jj_consume_token when the next token is not of the requested kind
 */
final public void ScriptTag() throws ParseException {
  jj_consume_token(ScriptStart);
  // Keep consuming for as long as the next token kind is ScriptText;
  // jj_ntk() is only called when the cached kind (jj_ntk) is unset (-1).
  while (((jj_ntk == -1) ? jj_ntk() : jj_ntk) == ScriptText) {
    jj_consume_token(ScriptText);
  }
  // Stamp choice point 13 with the current generation; generateParseException()
  // compares jj_la1 entries against jj_gen when collecting expected tokens.
  jj_la1[13] = jj_gen;
  jj_consume_token(ScriptEnd);
}
final private boolean jj_2_1(int xla) {
jj_la = xla; jj_lastpos = jj_scanpos = token;
try { return !jj_3_1(); }
@ -428,18 +441,18 @@ null)
finally { jj_save(1, xla); }
}
final private boolean jj_3_1() {
if (jj_scan_token(ArgQuote1)) return true;
if (jj_scan_token(CloseQuote1)) return true;
return false;
}
final private boolean jj_3_2() {
if (jj_scan_token(ArgQuote2)) return true;
if (jj_scan_token(CloseQuote2)) return true;
return false;
}
final private boolean jj_3_1() {
if (jj_scan_token(ArgQuote1)) return true;
if (jj_scan_token(CloseQuote1)) return true;
return false;
}
public HTMLParserTokenManager token_source;
SimpleCharStream jj_input_stream;
public Token token, jj_nt;
@ -449,13 +462,13 @@ null)
public boolean lookingAhead = false;
private boolean jj_semLA;
private int jj_gen;
final private int[] jj_la1 = new int[13];
final private int[] jj_la1 = new int[14];
static private int[] jj_la1_0;
static {
jj_la1_0();
}
private static void jj_la1_0() {
jj_la1_0 = new int[] {0xb3e,0xb3e,0x1000,0x38000,0x2000,0x8000,0x10000,0x20000,0x3b000,0x3b000,0x800000,0x2000000,0x18,};
jj_la1_0 = new int[] {0x167e,0x167e,0x8000,0x1c0000,0x10000,0x40000,0x80000,0x100000,0x1d8000,0x1d8000,0x4000000,0x10000000,0x30,0x2000,};
}
final private JJCalls[] jj_2_rtns = new JJCalls[2];
private boolean jj_rescan = false;
@ -467,7 +480,7 @@ null)
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 13; i++) jj_la1[i] = -1;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
@ -477,7 +490,7 @@ null)
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 13; i++) jj_la1[i] = -1;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
@ -487,7 +500,7 @@ null)
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 13; i++) jj_la1[i] = -1;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
@ -497,7 +510,7 @@ null)
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 13; i++) jj_la1[i] = -1;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
@ -506,7 +519,7 @@ null)
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 13; i++) jj_la1[i] = -1;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
@ -515,7 +528,7 @@ null)
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 13; i++) jj_la1[i] = -1;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
@ -626,15 +639,15 @@ null)
public ParseException generateParseException() {
jj_expentries.removeAllElements();
boolean[] la1tokens = new boolean[27];
for (int i = 0; i < 27; i++) {
boolean[] la1tokens = new boolean[30];
for (int i = 0; i < 30; i++) {
la1tokens[i] = false;
}
if (jj_kind >= 0) {
la1tokens[jj_kind] = true;
jj_kind = -1;
}
for (int i = 0; i < 13; i++) {
for (int i = 0; i < 14; i++) {
if (jj_la1[i] == jj_gen) {
for (int j = 0; j < 32; j++) {
if ((jj_la1_0[i] & (1<<j)) != 0) {
@ -643,7 +656,7 @@ null)
}
}
}
for (int i = 0; i < 27; i++) {
for (int i = 0; i < 30; i++) {
if (la1tokens[i]) {
jj_expentry = new int[1];
jj_expentry[0] = i;

View File

@ -80,7 +80,6 @@ public class HTMLParser {
boolean inTitle = false;
boolean inMetaTag = false;
boolean inStyle = false;
boolean inScript = false;
boolean afterTag = false;
boolean afterSpace = false;
String eol = System.getProperty("line.separator");
@ -183,8 +182,6 @@ InterruptedException {
}
void addText(String text) throws IOException {
if (inScript)
return;
if (inStyle)
return;
if (inMetaTag)
@ -211,8 +208,6 @@ InterruptedException {
}
void addSpace() throws IOException {
if (inScript)
return;
if (!afterSpace) {
if (inTitle)
title.append(" ");
@ -248,6 +243,7 @@ void HTMLDocument() throws IOException :
( Tag() { afterTag = true; }
| t=Decl() { afterTag = true; }
| CommentTag() { afterTag = true; }
| ScriptTag() { afterTag = true; }
| t=<Word> { addText(t.image); afterTag = false; }
| t=<Entity> { addText(Entities.decode(t.image)); afterTag = false; }
| t=<Punct> { addText(t.image); afterTag = false; }
@ -273,11 +269,6 @@ void Tag() throws IOException :
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
if (inScript) { // keep track if in <SCRIPT>
inScript = !tagName.equalsIgnoreCase("</script");
} else {
inScript = tagName.equalsIgnoreCase("<script");
}
}
(t1=<ArgName>
(<ArgEquals>
@ -339,10 +330,17 @@ void CommentTag() :
(<Comment2> ( <CommentText2> )* <CommentEnd2>)
}
// Matches a complete script element: one <ScriptStart> token, zero or more
// <ScriptText> tokens, and one <ScriptEnd> token. The production declares no
// locals and has no semantic actions, so the matched tokens are only consumed.
void ScriptTag() :
{}
{
<ScriptStart> ( <ScriptText> )* <ScriptEnd>
}
TOKEN :
{
< TagName: "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
< ScriptStart: "<script" > : WithinScript
| < TagName: "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < DeclName: "<" "!" ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < Comment1: "<!--" > : WithinComment1
@ -361,6 +359,11 @@ TOKEN :
| < Punct: ~[] > // Keep this last. It is a catch-all.
}
// Tokens defined for the WithinScript lexical state.
<WithinScript> TOKEN:
{
// ScriptText: a run of one or more characters other than '<' and '>',
// or a single '<', or a single '>'.
< ScriptText: (~["<",">"])+ | "<" | ">" >
// ScriptEnd: "</script", then any characters other than '<' and '>', then '>'.
// Matching it switches the lexer back to the DEFAULT state.
// NOTE(review): the "</script" literal is lower-case; whether "</SCRIPT>" also
// matches depends on an IGNORE_CASE setting that is not visible here - confirm.
| < ScriptEnd: "</script" (~["<",">"])* ">" > : DEFAULT
}
<WithinTag> TOKEN:
{

View File

@ -4,42 +4,47 @@ package org.apache.lucene.demo.html;
public interface HTMLParserConstants {
int EOF = 0;
int TagName = 1;
int DeclName = 2;
int Comment1 = 3;
int Comment2 = 4;
int Word = 5;
int LET = 6;
int NUM = 7;
int Entity = 8;
int Space = 9;
int SP = 10;
int Punct = 11;
int ArgName = 12;
int ArgEquals = 13;
int TagEnd = 14;
int ArgValue = 15;
int ArgQuote1 = 16;
int ArgQuote2 = 17;
int Quote1Text = 19;
int CloseQuote1 = 20;
int Quote2Text = 21;
int CloseQuote2 = 22;
int CommentText1 = 23;
int CommentEnd1 = 24;
int CommentText2 = 25;
int CommentEnd2 = 26;
int ScriptStart = 1;
int TagName = 2;
int DeclName = 3;
int Comment1 = 4;
int Comment2 = 5;
int Word = 6;
int LET = 7;
int NUM = 8;
int Entity = 9;
int Space = 10;
int SP = 11;
int Punct = 12;
int ScriptText = 13;
int ScriptEnd = 14;
int ArgName = 15;
int ArgEquals = 16;
int TagEnd = 17;
int ArgValue = 18;
int ArgQuote1 = 19;
int ArgQuote2 = 20;
int Quote1Text = 22;
int CloseQuote1 = 23;
int Quote2Text = 24;
int CloseQuote2 = 25;
int CommentText1 = 26;
int CommentEnd1 = 27;
int CommentText2 = 28;
int CommentEnd2 = 29;
int DEFAULT = 0;
int WithinTag = 1;
int AfterEquals = 2;
int WithinQuote1 = 3;
int WithinQuote2 = 4;
int WithinComment1 = 5;
int WithinComment2 = 6;
int WithinScript = 1;
int WithinTag = 2;
int AfterEquals = 3;
int WithinQuote1 = 4;
int WithinQuote2 = 5;
int WithinComment1 = 6;
int WithinComment2 = 7;
String[] tokenImage = {
"<EOF>",
"\"<script\"",
"<TagName>",
"<DeclName>",
"\"<!--\"",
@ -51,13 +56,15 @@ public interface HTMLParserConstants {
"<Space>",
"<SP>",
"<Punct>",
"<ScriptText>",
"<ScriptEnd>",
"<ArgName>",
"\"=\"",
"<TagEnd>",
"<ArgValue>",
"\"\\\'\"",
"\"\\\"\"",
"<token of kind 18>",
"<token of kind 21>",
"<Quote1Text>",
"<CloseQuote1>",
"<Quote2Text>",