Add space for certain html tags. This is a fix for bug 19253.

I committed a slightly modified version of Daniel's patch.


git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150137 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christoph Goller 2003-11-23 17:24:32 +00:00
parent dc4da65595
commit 4e9ed24307
3 changed files with 115 additions and 18 deletions

View File

@ -224,14 +224,18 @@ InterruptedException {
Token t1, t2;
boolean inImg = false;
t1 = jj_consume_token(TagName);
inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
inMetaTag = t1.image.equalsIgnoreCase("<META"); // keep track if in <META>
inStyle = t1.image.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
String tagName = t1.image.toLowerCase();
if(Tags.WS_ELEMS.contains(tagName) ) {
addSpace();
}
inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
if (inScript) { // keep track if in <SCRIPT>
inScript = !t1.image.equalsIgnoreCase("</script");
inScript = !tagName.equalsIgnoreCase("</script");
} else {
inScript = t1.image.equalsIgnoreCase("<script");
inScript = tagName.equalsIgnoreCase("<script");
}
label_2:
while (true) {
@ -424,18 +428,18 @@ null)
finally { jj_save(1, xla); }
}
final private boolean jj_3_2() {
if (jj_scan_token(ArgQuote2)) return true;
if (jj_scan_token(CloseQuote2)) return true;
return false;
}
final private boolean jj_3_1() {
if (jj_scan_token(ArgQuote1)) return true;
if (jj_scan_token(CloseQuote1)) return true;
return false;
}
final private boolean jj_3_2() {
if (jj_scan_token(ArgQuote2)) return true;
if (jj_scan_token(CloseQuote2)) return true;
return false;
}
public HTMLParserTokenManager token_source;
SimpleCharStream jj_input_stream;
public Token token, jj_nt;

View File

@ -265,14 +265,18 @@ void Tag() throws IOException :
}
{
t1=<TagName> {
inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
inMetaTag = t1.image.equalsIgnoreCase("<META"); // keep track if in <META>
inStyle = t1.image.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
String tagName = t1.image.toLowerCase();
if(Tags.WS_ELEMS.contains(tagName) ) {
addSpace();
}
inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
if (inScript) { // keep track if in <SCRIPT>
inScript = !t1.image.equalsIgnoreCase("</script");
inScript = !tagName.equalsIgnoreCase("</script");
} else {
inScript = t1.image.equalsIgnoreCase("<script");
inScript = tagName.equalsIgnoreCase("<script");
}
}
(t1=<ArgName>

View File

@ -0,0 +1,89 @@
package org.apache.lucene.demo.html;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2003 The Apache Software Foundation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
public final class Tags {
/**
* contains all tags for which whitespaces have to be inserted for proper tokenization
*/
public final static Set WS_ELEMS = Collections.synchronizedSet(new HashSet());
static{
WS_ELEMS.add("<p");
WS_ELEMS.add("<div");
WS_ELEMS.add("<hr");
WS_ELEMS.add("<hr/"); // note that "<hr />" does not need to be listed explicitly
WS_ELEMS.add("<br");
WS_ELEMS.add("<br/");
WS_ELEMS.add("<td");
WS_ELEMS.add("<li");
WS_ELEMS.add("<p");
WS_ELEMS.add("<q");
WS_ELEMS.add("<blockquote");
WS_ELEMS.add("<dt");
WS_ELEMS.add("<h1");
WS_ELEMS.add("<h2");
WS_ELEMS.add("<h3");
WS_ELEMS.add("<h4");
WS_ELEMS.add("<h5");
WS_ELEMS.add("<h6");
}
}