mirror of
https://github.com/apache/lucene.git
synced 2025-02-20 17:07:09 +00:00
Add space for certain HTML tags. This is a fix for bug 19253.
I committed a slightly modified version of Daniel's patch.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150137 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
dc4da65595
commit
4e9ed24307
@ -224,14 +224,18 @@ InterruptedException {
|
||||
Token t1, t2;
|
||||
boolean inImg = false;
|
||||
t1 = jj_consume_token(TagName);
|
||||
inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
|
||||
inMetaTag = t1.image.equalsIgnoreCase("<META"); // keep track if in <META>
|
||||
inStyle = t1.image.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
|
||||
inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
|
||||
String tagName = t1.image.toLowerCase();
|
||||
if(Tags.WS_ELEMS.contains(tagName) ) {
|
||||
addSpace();
|
||||
}
|
||||
inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
|
||||
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
|
||||
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
|
||||
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
|
||||
if (inScript) { // keep track if in <SCRIPT>
|
||||
inScript = !t1.image.equalsIgnoreCase("</script");
|
||||
inScript = !tagName.equalsIgnoreCase("</script");
|
||||
} else {
|
||||
inScript = t1.image.equalsIgnoreCase("<script");
|
||||
inScript = tagName.equalsIgnoreCase("<script");
|
||||
}
|
||||
label_2:
|
||||
while (true) {
|
||||
@ -424,18 +428,18 @@ null)
|
||||
finally { jj_save(1, xla); }
|
||||
}
|
||||
|
||||
final private boolean jj_3_2() {
|
||||
if (jj_scan_token(ArgQuote2)) return true;
|
||||
if (jj_scan_token(CloseQuote2)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
final private boolean jj_3_1() {
|
||||
if (jj_scan_token(ArgQuote1)) return true;
|
||||
if (jj_scan_token(CloseQuote1)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
final private boolean jj_3_2() {
|
||||
if (jj_scan_token(ArgQuote2)) return true;
|
||||
if (jj_scan_token(CloseQuote2)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public HTMLParserTokenManager token_source;
|
||||
SimpleCharStream jj_input_stream;
|
||||
public Token token, jj_nt;
|
||||
|
@ -265,14 +265,18 @@ void Tag() throws IOException :
|
||||
}
|
||||
{
|
||||
t1=<TagName> {
|
||||
inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
|
||||
inMetaTag = t1.image.equalsIgnoreCase("<META"); // keep track if in <META>
|
||||
inStyle = t1.image.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
|
||||
inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
|
||||
String tagName = t1.image.toLowerCase();
|
||||
if(Tags.WS_ELEMS.contains(tagName) ) {
|
||||
addSpace();
|
||||
}
|
||||
inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
|
||||
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
|
||||
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
|
||||
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
|
||||
if (inScript) { // keep track if in <SCRIPT>
|
||||
inScript = !t1.image.equalsIgnoreCase("</script");
|
||||
inScript = !tagName.equalsIgnoreCase("</script");
|
||||
} else {
|
||||
inScript = t1.image.equalsIgnoreCase("<script");
|
||||
inScript = tagName.equalsIgnoreCase("<script");
|
||||
}
|
||||
}
|
||||
(t1=<ArgName>
|
||||
|
89
src/demo/org/apache/lucene/demo/html/Tags.java
Normal file
89
src/demo/org/apache/lucene/demo/html/Tags.java
Normal file
@ -0,0 +1,89 @@
|
||||
package org.apache.lucene.demo.html;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2003 The Apache Software Foundation. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/**
 * Holds the set of HTML tag prefixes that require a separating space.
 *
 * Each entry is a lower-case tag token including the leading
 * {@code '<'} (for example {@code "<br"}); lookups are therefore expected
 * to be made with a lower-cased tag name.
 */
public final class Tags {

  /**
   * contains all tags for which whitespaces have to be inserted for proper tokenization
   */
  public final static Set WS_ELEMS = Collections.synchronizedSet(new HashSet());

  static {
    // Table of tag prefixes registered at class-initialisation time.
    // note that "<hr />" does not need to be listed explicitly
    final String[] wsTags = {
      "<p", "<div",
      "<hr", "<hr/",
      "<br", "<br/",
      "<td", "<li", "<q", "<blockquote", "<dt",
      "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"
    };
    for (int i = 0; i < wsTags.length; i++) {
      WS_ELEMS.add(wsTags[i]);
    }
  }

}
|
Loading…
x
Reference in New Issue
Block a user