mirror of https://github.com/apache/lucene.git
SOLR-1343: Added HTMLStripCharFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@802263 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
05f000d36a
commit
86dd3a6cda
|
@ -258,6 +258,10 @@ New Features
|
||||||
RequestHandler or SearchComponent to know when a newSearcher or firstSearcher event happened. QuerySenderListener is the only implementation
|
RequestHandler or SearchComponent to know when a newSearcher or firstSearcher event happened. QuerySenderListener is the only implementation
|
||||||
in Solr that implements this, but outside implementations may wish to. See the AbstractSolrEventListener for a helper method. (gsingers)
|
in Solr that implements this, but outside implementations may wish to. See the AbstractSolrEventListener for a helper method. (gsingers)
|
||||||
|
|
||||||
|
67. SOLR-1343: Added HTMLStripCharFilter and marked HTMLStripReader, HTMLStripWhitespaceTokenizerFactory and
|
||||||
|
HTMLStripStandardTokenizerFactory deprecated. To strip HTML tags, HTMLStripCharFilter can be used
|
||||||
|
with an arbitrary Tokenizer. (koji)
|
||||||
|
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,29 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CharStream;
|
||||||
|
|
||||||
|
/**
 * Factory that produces {@link HTMLStripCharFilter} instances.
 *
 * <p>Per the accompanying release note, HTMLStripCharFilter is used to strip
 * HTML tags and can be combined with an arbitrary Tokenizer; it supersedes the
 * deprecated HTMLStripReader-based tokenizer factories.
 */
public class HTMLStripCharFilterFactory extends BaseCharFilterFactory {
|
||||||
|
|
||||||
|
/**
 * Wraps the given character stream in a new {@link HTMLStripCharFilter}.
 *
 * @param input the character stream to be filtered
 * @return a new HTMLStripCharFilter constructed over {@code input}
 */
public HTMLStripCharFilter create(CharStream input) {
|
||||||
|
return new HTMLStripCharFilter(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
File diff suppressed because it is too large
Load Diff
|
@ -24,7 +24,9 @@ import java.io.Reader;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
|
* @deprecated Use {@link HTMLStripCharFilterFactory} and {@link StandardTokenizerFactory}
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public class HTMLStripStandardTokenizerFactory extends BaseTokenizerFactory {
|
public class HTMLStripStandardTokenizerFactory extends BaseTokenizerFactory {
|
||||||
public TokenStream create(Reader input) {
|
public TokenStream create(Reader input) {
|
||||||
return new StandardTokenizer(new HTMLStripReader(input));
|
return new StandardTokenizer(new HTMLStripReader(input));
|
||||||
|
|
|
@ -24,7 +24,9 @@ import java.io.Reader;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
|
* @deprecated Use {@link HTMLStripCharFilterFactory} and {@link WhitespaceTokenizerFactory}
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public class HTMLStripWhitespaceTokenizerFactory extends BaseTokenizerFactory {
|
public class HTMLStripWhitespaceTokenizerFactory extends BaseTokenizerFactory {
|
||||||
public TokenStream create(Reader input) {
|
public TokenStream create(Reader input) {
|
||||||
return new WhitespaceTokenizer(new HTMLStripReader(input));
|
return new WhitespaceTokenizer(new HTMLStripReader(input));
|
||||||
|
|
66
src/test/org/apache/solr/analysis/HTMLStripReaderTest.java → src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java
Executable file → Normal file
66
src/test/org/apache/solr/analysis/HTMLStripReaderTest.java → src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java
Executable file → Normal file
|
@ -1,11 +1,12 @@
|
||||||
package org.apache.solr.analysis;
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Copyright 2004 The Apache Software Foundation
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
*
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
* this work for additional information regarding copyright ownership.
|
||||||
* you may not use this file except in compliance with the License.
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
* You may obtain a copy of the License at
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
*
|
*
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
*
|
*
|
||||||
|
@ -16,22 +17,23 @@ package org.apache.solr.analysis;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CharReader;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
import java.io.StringReader;
|
public class HTMLStripCharFilterTest extends TestCase {
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.FileReader;
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.Reader;
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Collections;
|
|
||||||
|
|
||||||
public class HTMLStripReaderTest extends TestCase {
|
|
||||||
|
|
||||||
|
|
||||||
public HTMLStripReaderTest(String s) {
|
public HTMLStripCharFilterTest(String s) {
|
||||||
super(s);
|
super(s);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -50,7 +52,7 @@ public class HTMLStripReaderTest extends TestCase {
|
||||||
String gold = " this is some text here is a link and " +
|
String gold = " this is some text here is a link and " +
|
||||||
"another link . " +
|
"another link . " +
|
||||||
"This is an entity: & plus a < . Here is an &. ";
|
"This is an entity: & plus a < . Here is an &. ";
|
||||||
HTMLStripReader reader = new HTMLStripReader(new StringReader(html));
|
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
int ch = -1;
|
int ch = -1;
|
||||||
char [] goldArray = gold.toCharArray();
|
char [] goldArray = gold.toCharArray();
|
||||||
|
@ -68,7 +70,7 @@ public class HTMLStripReaderTest extends TestCase {
|
||||||
//Some sanity checks, but not a full-fledged check
|
//Some sanity checks, but not a full-fledged check
|
||||||
public void testHTML() throws Exception {
|
public void testHTML() throws Exception {
|
||||||
|
|
||||||
HTMLStripReader reader = new HTMLStripReader(new FileReader(new File("htmlStripReaderTest.html")));
|
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new FileReader(new File("htmlStripReaderTest.html"))));
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
int ch = -1;
|
int ch = -1;
|
||||||
while ((ch = reader.read()) != -1){
|
while ((ch = reader.read()) != -1){
|
||||||
|
@ -88,7 +90,7 @@ public class HTMLStripReaderTest extends TestCase {
|
||||||
String gold = "\u0393 ";
|
String gold = "\u0393 ";
|
||||||
Set<String> set = new HashSet<String>();
|
Set<String> set = new HashSet<String>();
|
||||||
set.add("reserved");
|
set.add("reserved");
|
||||||
Reader reader = new HTMLStripReader(new StringReader(test), set);
|
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
while ((ch = reader.read()) != -1){
|
while ((ch = reader.read()) != -1){
|
||||||
|
@ -105,7 +107,7 @@ public class HTMLStripReaderTest extends TestCase {
|
||||||
String gold = " < foo> = \u0393 bar \u0393 ";
|
String gold = " < foo> = \u0393 bar \u0393 ";
|
||||||
Set<String> set = new HashSet<String>();
|
Set<String> set = new HashSet<String>();
|
||||||
set.add("reserved");
|
set.add("reserved");
|
||||||
Reader reader = new HTMLStripReader(new StringReader(test), set);
|
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
while ((ch = reader.read()) != -1){
|
while ((ch = reader.read()) != -1){
|
||||||
|
@ -122,7 +124,7 @@ public class HTMLStripReaderTest extends TestCase {
|
||||||
String gold = " < junk/> ! @ and ’ ";
|
String gold = " < junk/> ! @ and ’ ";
|
||||||
Set<String> set = new HashSet<String>();
|
Set<String> set = new HashSet<String>();
|
||||||
set.add("reserved");
|
set.add("reserved");
|
||||||
Reader reader = new HTMLStripReader(new StringReader(test), set);
|
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
while ((ch = reader.read()) != -1){
|
while ((ch = reader.read()) != -1){
|
||||||
|
@ -138,7 +140,7 @@ public class HTMLStripReaderTest extends TestCase {
|
||||||
String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
|
String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
|
||||||
Set<String> set = new HashSet<String>();
|
Set<String> set = new HashSet<String>();
|
||||||
set.add("reserved");
|
set.add("reserved");
|
||||||
Reader reader = new HTMLStripReader(new StringReader(test), set);
|
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
while ((ch = reader.read()) != -1){
|
while ((ch = reader.read()) != -1){
|
||||||
|
@ -195,7 +197,7 @@ public class HTMLStripReaderTest extends TestCase {
|
||||||
"<ttt>[aaaa://aaaaa.aaaaaaaa.aaa/aaaaaaa/aaaaaa/ aaaaaaa aaaaaa] aaaaaaaa aaaaaaaaaaaa aa aaaaaaaaaa, aaaaa aaaaaaaaa aaa [[aa aaaaaaa]] [[aaaa]]; aaaaaaaaaaa aaaaaaaa aaa [[aaa aa]] [[aaaa]]</ttt>\n" +
|
"<ttt>[aaaa://aaaaa.aaaaaaaa.aaa/aaaaaaa/aaaaaa/ aaaaaaa aaaaaa] aaaaaaaa aaaaaaaaaaaa aa aaaaaaaaaa, aaaaa aaaaaaaaa aaa [[aa aaaaaaa]] [[aaaa]]; aaaaaaaaaaa aaaaaaaa aaa [[aaa aa]] [[aaaa]]</ttt>\n" +
|
||||||
"\n" +
|
"\n" +
|
||||||
"[[aaaaa aaaaaaaaaa]] aa ''[[aaa aaaaaaaaaaaa aa aaaaaa]]'' (aaaa) aaaa aaa aaaa [[zzzzzzz]] aa aaaaaaaa";
|
"[[aaaaa aaaaaaaaaa]] aa ''[[aaa aaaaaaaaaaaa aa aaaaaa]]'' (aaaa) aaaa aaa aaaa [[zzzzzzz]] aa aaaaaaaa";
|
||||||
Reader reader = new HTMLStripReader(new StringReader(test));
|
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||||
Reader noStrip = new StringReader(test);
|
Reader noStrip = new StringReader(test);
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
int ch2 = 0;
|
int ch2 = 0;
|
||||||
|
@ -210,27 +212,27 @@ public class HTMLStripReaderTest extends TestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testBufferOverflow() throws Exception {
|
public void testBufferOverflow() throws Exception {
|
||||||
StringBuilder testBuilder = new StringBuilder(HTMLStripReader.DEFAULT_READ_AHEAD + 50);
|
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
|
||||||
testBuilder.append("ah<?> ");
|
testBuilder.append("ah<?> ");
|
||||||
appendChars(testBuilder, HTMLStripReader.DEFAULT_READ_AHEAD + 500);
|
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
|
||||||
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
|
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
|
||||||
|
|
||||||
testBuilder.setLength(0);
|
testBuilder.setLength(0);
|
||||||
testBuilder.append("<!--");//comments
|
testBuilder.append("<!--");//comments
|
||||||
appendChars(testBuilder, 3*HTMLStripReader.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
|
appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
|
||||||
|
|
||||||
testBuilder.append("-->foo");
|
testBuilder.append("-->foo");
|
||||||
processBuffer(testBuilder.toString(), "Failed w/ comment");
|
processBuffer(testBuilder.toString(), "Failed w/ comment");
|
||||||
|
|
||||||
testBuilder.setLength(0);
|
testBuilder.setLength(0);
|
||||||
testBuilder.append("<?");
|
testBuilder.append("<?");
|
||||||
appendChars(testBuilder, HTMLStripReader.DEFAULT_READ_AHEAD + 500);
|
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
|
||||||
testBuilder.append("?>");
|
testBuilder.append("?>");
|
||||||
processBuffer(testBuilder.toString(), "Failed with proc. instr.");
|
processBuffer(testBuilder.toString(), "Failed with proc. instr.");
|
||||||
|
|
||||||
testBuilder.setLength(0);
|
testBuilder.setLength(0);
|
||||||
testBuilder.append("<b ");
|
testBuilder.append("<b ");
|
||||||
appendChars(testBuilder, HTMLStripReader.DEFAULT_READ_AHEAD + 500);
|
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
|
||||||
testBuilder.append("/>");
|
testBuilder.append("/>");
|
||||||
processBuffer(testBuilder.toString(), "Failed on tag");
|
processBuffer(testBuilder.toString(), "Failed on tag");
|
||||||
|
|
||||||
|
@ -239,14 +241,14 @@ public class HTMLStripReaderTest extends TestCase {
|
||||||
private void appendChars(StringBuilder testBuilder, int numChars) {
|
private void appendChars(StringBuilder testBuilder, int numChars) {
|
||||||
int i1 = numChars / 2;
|
int i1 = numChars / 2;
|
||||||
for (int i = 0; i < i1; i++){
|
for (int i = 0; i < i1; i++){
|
||||||
testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes HTMLStripReader think it is a processing instruction
|
testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes HTMLStripCharFilter think it is a processing instruction
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void processBuffer(String test, String assertMsg) throws IOException {
|
private void processBuffer(String test, String assertMsg) throws IOException {
|
||||||
System.out.println("-------------------processBuffer----------");
|
System.out.println("-------------------processBuffer----------");
|
||||||
Reader reader = new HTMLStripReader(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
|
Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -263,7 +265,7 @@ public class HTMLStripReaderTest extends TestCase {
|
||||||
|
|
||||||
String test = "<!--- three dashes, still a valid comment ---> ";
|
String test = "<!--- three dashes, still a valid comment ---> ";
|
||||||
String gold = " ";
|
String gold = " ";
|
||||||
Reader reader = new HTMLStripReader(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
|
Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
|
@ -174,7 +174,6 @@
|
||||||
|
|
||||||
<%!
|
<%!
|
||||||
private static void doAnalyzer(JspWriter out, SchemaField field, String val, boolean queryAnalyser, boolean verbose, Set<Tok> match) throws Exception {
|
private static void doAnalyzer(JspWriter out, SchemaField field, String val, boolean queryAnalyser, boolean verbose, Set<Tok> match) throws Exception {
|
||||||
CharStream reader = CharReader.get(new StringReader(val));
|
|
||||||
|
|
||||||
FieldType ft = field.getType();
|
FieldType ft = field.getType();
|
||||||
Analyzer analyzer = queryAnalyser ?
|
Analyzer analyzer = queryAnalyser ?
|
||||||
|
@ -186,18 +185,18 @@
|
||||||
TokenFilterFactory[] filtfacs = tchain.getTokenFilterFactories();
|
TokenFilterFactory[] filtfacs = tchain.getTokenFilterFactories();
|
||||||
|
|
||||||
if( cfiltfacs != null ){
|
if( cfiltfacs != null ){
|
||||||
|
String source = val;
|
||||||
for(CharFilterFactory cfiltfac : cfiltfacs ){
|
for(CharFilterFactory cfiltfac : cfiltfacs ){
|
||||||
|
CharStream reader = CharReader.get(new StringReader(source));
|
||||||
reader = cfiltfac.create(reader);
|
reader = cfiltfac.create(reader);
|
||||||
if(verbose){
|
if(verbose){
|
||||||
writeHeader(out, cfiltfac.getClass(), cfiltfac.getArgs());
|
writeHeader(out, cfiltfac.getClass(), cfiltfac.getArgs());
|
||||||
writeCharStream(out, reader);
|
source = writeCharStream(out, reader);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// StringReader should support reset()
|
TokenStream tstream = tfac.create(tchain.charStream(new StringReader(val)));
|
||||||
reader.reset();
|
|
||||||
TokenStream tstream = tfac.create(reader);
|
|
||||||
List<Token> tokens = getTokens(tstream);
|
List<Token> tokens = getTokens(tstream);
|
||||||
if (verbose) {
|
if (verbose) {
|
||||||
writeHeader(out, tfac.getClass(), tfac.getArgs());
|
writeHeader(out, tfac.getClass(), tfac.getArgs());
|
||||||
|
@ -223,7 +222,7 @@
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
TokenStream tstream = analyzer.tokenStream(field.getName(),reader);
|
TokenStream tstream = analyzer.tokenStream(field.getName(),new StringReader(val));
|
||||||
List<Token> tokens = getTokens(tstream);
|
List<Token> tokens = getTokens(tstream);
|
||||||
if (verbose) {
|
if (verbose) {
|
||||||
writeHeader(out, analyzer.getClass(), new HashMap<String,String>());
|
writeHeader(out, analyzer.getClass(), new HashMap<String,String>());
|
||||||
|
@ -468,7 +467,7 @@
|
||||||
out.println("</table>");
|
out.println("</table>");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void writeCharStream(JspWriter out, CharStream input) throws IOException {
|
static String writeCharStream(JspWriter out, CharStream input) throws IOException {
|
||||||
out.println("<table width=\"auto\" class=\"analysis\" border=\"1\">");
|
out.println("<table width=\"auto\" class=\"analysis\" border=\"1\">");
|
||||||
out.println("<tr>");
|
out.println("<tr>");
|
||||||
|
|
||||||
|
@ -476,8 +475,6 @@
|
||||||
XML.escapeCharData("text",out);
|
XML.escapeCharData("text",out);
|
||||||
out.println("</th>");
|
out.println("</th>");
|
||||||
|
|
||||||
// StringReader should support reset()
|
|
||||||
input.reset();
|
|
||||||
final int BUFFER_SIZE = 1024;
|
final int BUFFER_SIZE = 1024;
|
||||||
char[] buf = new char[BUFFER_SIZE];
|
char[] buf = new char[BUFFER_SIZE];
|
||||||
int len = 0;
|
int len = 0;
|
||||||
|
@ -492,6 +489,7 @@
|
||||||
|
|
||||||
out.println("</tr>");
|
out.println("</tr>");
|
||||||
out.println("</table>");
|
out.println("</table>");
|
||||||
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
%>
|
%>
|
||||||
|
|
Loading…
Reference in New Issue