SOLR-1343: Added HTMLStripCharFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@802263 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Koji Sekiguchi 2009-08-07 23:05:05 +00:00
parent 05f000d36a
commit 86dd3a6cda
8 changed files with 1442 additions and 1348 deletions

View File

@ -258,6 +258,10 @@ New Features
RequestHandler or SearchComponent to know when a newSearcher or firstSearcher event happened. QuerySenderListener is the only implementation RequestHandler or SearchComponent to know when a newSearcher or firstSearcher event happened. QuerySenderListener is the only implementation
in Solr that implements this, but outside implementations may wish to. See the AbstractSolrEventListener for a helper method. (gsingers) in Solr that implements this, but outside implementations may wish to. See the AbstractSolrEventListener for a helper method. (gsingers)
67. SOLR-1343: Added HTMLStripCharFilter and marked HTMLStripReader, HTMLStripWhitespaceTokenizerFactory and
HTMLStripStandardTokenizerFactory deprecated. To strip HTML tags, HTMLStripCharFilter can be used
with an arbitrary Tokenizer. (koji)
Optimizations Optimizations
---------------------- ----------------------

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,29 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.CharStream;
/**
 * Factory that wraps a {@link CharStream} in an {@link HTMLStripCharFilter}.
 */
public class HTMLStripCharFilterFactory extends BaseCharFilterFactory {

  /**
   * Creates a new {@link HTMLStripCharFilter} over the supplied stream.
   *
   * @param input the character stream to wrap
   * @return a newly constructed filter reading from {@code input}
   */
  public HTMLStripCharFilter create(CharStream input) {
    final HTMLStripCharFilter stripper = new HTMLStripCharFilter(input);
    return stripper;
  }
}

File diff suppressed because it is too large Load Diff

View File

@ -24,7 +24,9 @@ import java.io.Reader;
/** /**
* @version $Id$ * @version $Id$
* @deprecated Use {@link HTMLStripCharFilterFactory} and {@link StandardTokenizerFactory}
*/ */
@Deprecated
public class HTMLStripStandardTokenizerFactory extends BaseTokenizerFactory { public class HTMLStripStandardTokenizerFactory extends BaseTokenizerFactory {
public TokenStream create(Reader input) { public TokenStream create(Reader input) {
return new StandardTokenizer(new HTMLStripReader(input)); return new StandardTokenizer(new HTMLStripReader(input));

View File

@ -24,7 +24,9 @@ import java.io.Reader;
/** /**
* @version $Id$ * @version $Id$
* @deprecated Use {@link HTMLStripCharFilterFactory} and {@link WhitespaceTokenizerFactory}
*/ */
@Deprecated
public class HTMLStripWhitespaceTokenizerFactory extends BaseTokenizerFactory { public class HTMLStripWhitespaceTokenizerFactory extends BaseTokenizerFactory {
public TokenStream create(Reader input) { public TokenStream create(Reader input) {
return new WhitespaceTokenizer(new HTMLStripReader(input)); return new WhitespaceTokenizer(new HTMLStripReader(input));

View File

@ -1,11 +1,12 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
/** /**
* Copyright 2004 The Apache Software Foundation * Licensed to the Apache Software Foundation (ASF) under one or more
* * contributor license agreements. See the NOTICE file distributed with
* Licensed under the Apache License, Version 2.0 (the "License"); * this work for additional information regarding copyright ownership.
* you may not use this file except in compliance with the License. * The ASF licenses this file to You under the Apache License, Version 2.0
* You may obtain a copy of the License at * (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
@ -16,22 +17,23 @@ package org.apache.solr.analysis;
* limitations under the License. * limitations under the License.
*/ */
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.CharReader;
import junit.framework.TestCase; import junit.framework.TestCase;
import java.io.StringReader; public class HTMLStripCharFilterTest extends TestCase {
import java.io.IOException;
import java.io.FileReader;
import java.io.File;
import java.io.Reader;
import java.io.BufferedReader;
import java.util.Set;
import java.util.HashSet;
import java.util.Collections;
public class HTMLStripReaderTest extends TestCase {
public HTMLStripReaderTest(String s) { public HTMLStripCharFilterTest(String s) {
super(s); super(s);
} }
@ -50,7 +52,7 @@ public class HTMLStripReaderTest extends TestCase {
String gold = " this is some text here is a link and " + String gold = " this is some text here is a link and " +
"another link . " + "another link . " +
"This is an entity: & plus a < . Here is an &. "; "This is an entity: & plus a < . Here is an &. ";
HTMLStripReader reader = new HTMLStripReader(new StringReader(html)); HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
int ch = -1; int ch = -1;
char [] goldArray = gold.toCharArray(); char [] goldArray = gold.toCharArray();
@ -68,7 +70,7 @@ public class HTMLStripReaderTest extends TestCase {
//Some sanity checks, but not a full-fledged check //Some sanity checks, but not a full-fledged check
public void testHTML() throws Exception { public void testHTML() throws Exception {
HTMLStripReader reader = new HTMLStripReader(new FileReader(new File("htmlStripReaderTest.html"))); HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new FileReader(new File("htmlStripReaderTest.html"))));
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
int ch = -1; int ch = -1;
while ((ch = reader.read()) != -1){ while ((ch = reader.read()) != -1){
@ -88,7 +90,7 @@ public class HTMLStripReaderTest extends TestCase {
String gold = "\u0393 "; String gold = "\u0393 ";
Set<String> set = new HashSet<String>(); Set<String> set = new HashSet<String>();
set.add("reserved"); set.add("reserved");
Reader reader = new HTMLStripReader(new StringReader(test), set); Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
int ch = 0; int ch = 0;
while ((ch = reader.read()) != -1){ while ((ch = reader.read()) != -1){
@ -105,7 +107,7 @@ public class HTMLStripReaderTest extends TestCase {
String gold = " < foo> = \u0393 bar \u0393 "; String gold = " < foo> = \u0393 bar \u0393 ";
Set<String> set = new HashSet<String>(); Set<String> set = new HashSet<String>();
set.add("reserved"); set.add("reserved");
Reader reader = new HTMLStripReader(new StringReader(test), set); Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
int ch = 0; int ch = 0;
while ((ch = reader.read()) != -1){ while ((ch = reader.read()) != -1){
@ -122,7 +124,7 @@ public class HTMLStripReaderTest extends TestCase {
String gold = " < junk/> ! @ and "; String gold = " < junk/> ! @ and ";
Set<String> set = new HashSet<String>(); Set<String> set = new HashSet<String>();
set.add("reserved"); set.add("reserved");
Reader reader = new HTMLStripReader(new StringReader(test), set); Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
int ch = 0; int ch = 0;
while ((ch = reader.read()) != -1){ while ((ch = reader.read()) != -1){
@ -138,7 +140,7 @@ public class HTMLStripReaderTest extends TestCase {
String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>"; String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
Set<String> set = new HashSet<String>(); Set<String> set = new HashSet<String>();
set.add("reserved"); set.add("reserved");
Reader reader = new HTMLStripReader(new StringReader(test), set); Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
int ch = 0; int ch = 0;
while ((ch = reader.read()) != -1){ while ((ch = reader.read()) != -1){
@ -195,7 +197,7 @@ public class HTMLStripReaderTest extends TestCase {
"<ttt>[aaaa://aaaaa.aaaaaaaa.aaa/aaaaaaa/aaaaaa/ aaaaaaa aaaaaa] aaaaaaaa aaaaaaaaaaaa aa aaaaaaaaaa, aaaaa aaaaaaaaa aaa [[aa aaaaaaa]] [[aaaa]]; aaaaaaaaaaa aaaaaaaa aaa [[aaa aa]] [[aaaa]]</ttt>\n" + "<ttt>[aaaa://aaaaa.aaaaaaaa.aaa/aaaaaaa/aaaaaa/ aaaaaaa aaaaaa] aaaaaaaa aaaaaaaaaaaa aa aaaaaaaaaa, aaaaa aaaaaaaaa aaa [[aa aaaaaaa]] [[aaaa]]; aaaaaaaaaaa aaaaaaaa aaa [[aaa aa]] [[aaaa]]</ttt>\n" +
"\n" + "\n" +
"[[aaaaa aaaaaaaaaa]] aa ''[[aaa aaaaaaaaaaaa aa aaaaaa]]'' (aaaa) aaaa aaa aaaa [[zzzzzzz]] aa aaaaaaaa"; "[[aaaaa aaaaaaaaaa]] aa ''[[aaa aaaaaaaaaaaa aa aaaaaa]]'' (aaaa) aaaa aaa aaaa [[zzzzzzz]] aa aaaaaaaa";
Reader reader = new HTMLStripReader(new StringReader(test)); Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
Reader noStrip = new StringReader(test); Reader noStrip = new StringReader(test);
int ch = 0; int ch = 0;
int ch2 = 0; int ch2 = 0;
@ -210,27 +212,27 @@ public class HTMLStripReaderTest extends TestCase {
} }
public void testBufferOverflow() throws Exception { public void testBufferOverflow() throws Exception {
StringBuilder testBuilder = new StringBuilder(HTMLStripReader.DEFAULT_READ_AHEAD + 50); StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
testBuilder.append("ah<?> "); testBuilder.append("ah<?> ");
appendChars(testBuilder, HTMLStripReader.DEFAULT_READ_AHEAD + 500); appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
testBuilder.setLength(0); testBuilder.setLength(0);
testBuilder.append("<!--");//comments testBuilder.append("<!--");//comments
appendChars(testBuilder, 3*HTMLStripReader.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
testBuilder.append("-->foo"); testBuilder.append("-->foo");
processBuffer(testBuilder.toString(), "Failed w/ comment"); processBuffer(testBuilder.toString(), "Failed w/ comment");
testBuilder.setLength(0); testBuilder.setLength(0);
testBuilder.append("<?"); testBuilder.append("<?");
appendChars(testBuilder, HTMLStripReader.DEFAULT_READ_AHEAD + 500); appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
testBuilder.append("?>"); testBuilder.append("?>");
processBuffer(testBuilder.toString(), "Failed with proc. instr."); processBuffer(testBuilder.toString(), "Failed with proc. instr.");
testBuilder.setLength(0); testBuilder.setLength(0);
testBuilder.append("<b "); testBuilder.append("<b ");
appendChars(testBuilder, HTMLStripReader.DEFAULT_READ_AHEAD + 500); appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
testBuilder.append("/>"); testBuilder.append("/>");
processBuffer(testBuilder.toString(), "Failed on tag"); processBuffer(testBuilder.toString(), "Failed on tag");
@ -239,14 +241,14 @@ public class HTMLStripReaderTest extends TestCase {
private void appendChars(StringBuilder testBuilder, int numChars) { private void appendChars(StringBuilder testBuilder, int numChars) {
int i1 = numChars / 2; int i1 = numChars / 2;
for (int i = 0; i < i1; i++){ for (int i = 0; i < i1; i++){
testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes HTMLStripReader think it is a processing instruction testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes HTMLStripCharFilter think it is a processing instruction
} }
} }
private void processBuffer(String test, String assertMsg) throws IOException { private void processBuffer(String test, String assertMsg) throws IOException {
System.out.println("-------------------processBuffer----------"); System.out.println("-------------------processBuffer----------");
Reader reader = new HTMLStripReader(new BufferedReader(new StringReader(test)));//force the use of BufferedReader Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
int ch = 0; int ch = 0;
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
try { try {
@ -263,7 +265,7 @@ public class HTMLStripReaderTest extends TestCase {
String test = "<!--- three dashes, still a valid comment ---> "; String test = "<!--- three dashes, still a valid comment ---> ";
String gold = " "; String gold = " ";
Reader reader = new HTMLStripReader(new BufferedReader(new StringReader(test)));//force the use of BufferedReader Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
int ch = 0; int ch = 0;
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
try { try {
@ -276,4 +278,4 @@ public class HTMLStripReaderTest extends TestCase {
assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true); assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
} }
} }

View File

@ -174,7 +174,6 @@
<%! <%!
private static void doAnalyzer(JspWriter out, SchemaField field, String val, boolean queryAnalyser, boolean verbose, Set<Tok> match) throws Exception { private static void doAnalyzer(JspWriter out, SchemaField field, String val, boolean queryAnalyser, boolean verbose, Set<Tok> match) throws Exception {
CharStream reader = CharReader.get(new StringReader(val));
FieldType ft = field.getType(); FieldType ft = field.getType();
Analyzer analyzer = queryAnalyser ? Analyzer analyzer = queryAnalyser ?
@ -186,18 +185,18 @@
TokenFilterFactory[] filtfacs = tchain.getTokenFilterFactories(); TokenFilterFactory[] filtfacs = tchain.getTokenFilterFactories();
if( cfiltfacs != null ){ if( cfiltfacs != null ){
String source = val;
for(CharFilterFactory cfiltfac : cfiltfacs ){ for(CharFilterFactory cfiltfac : cfiltfacs ){
CharStream reader = CharReader.get(new StringReader(source));
reader = cfiltfac.create(reader); reader = cfiltfac.create(reader);
if(verbose){ if(verbose){
writeHeader(out, cfiltfac.getClass(), cfiltfac.getArgs()); writeHeader(out, cfiltfac.getClass(), cfiltfac.getArgs());
writeCharStream(out, reader); source = writeCharStream(out, reader);
} }
} }
} }
// StringReader should support reset() TokenStream tstream = tfac.create(tchain.charStream(new StringReader(val)));
reader.reset();
TokenStream tstream = tfac.create(reader);
List<Token> tokens = getTokens(tstream); List<Token> tokens = getTokens(tstream);
if (verbose) { if (verbose) {
writeHeader(out, tfac.getClass(), tfac.getArgs()); writeHeader(out, tfac.getClass(), tfac.getArgs());
@ -223,7 +222,7 @@
} }
} else { } else {
TokenStream tstream = analyzer.tokenStream(field.getName(),reader); TokenStream tstream = analyzer.tokenStream(field.getName(),new StringReader(val));
List<Token> tokens = getTokens(tstream); List<Token> tokens = getTokens(tstream);
if (verbose) { if (verbose) {
writeHeader(out, analyzer.getClass(), new HashMap<String,String>()); writeHeader(out, analyzer.getClass(), new HashMap<String,String>());
@ -468,7 +467,7 @@
out.println("</table>"); out.println("</table>");
} }
static void writeCharStream(JspWriter out, CharStream input) throws IOException { static String writeCharStream(JspWriter out, CharStream input) throws IOException {
out.println("<table width=\"auto\" class=\"analysis\" border=\"1\">"); out.println("<table width=\"auto\" class=\"analysis\" border=\"1\">");
out.println("<tr>"); out.println("<tr>");
@ -476,8 +475,6 @@
XML.escapeCharData("text",out); XML.escapeCharData("text",out);
out.println("</th>"); out.println("</th>");
// StringReader should support reset()
input.reset();
final int BUFFER_SIZE = 1024; final int BUFFER_SIZE = 1024;
char[] buf = new char[BUFFER_SIZE]; char[] buf = new char[BUFFER_SIZE];
int len = 0; int len = 0;
@ -492,6 +489,7 @@
out.println("</tr>"); out.println("</tr>");
out.println("</table>"); out.println("</table>");
return sb.toString();
} }
%> %>