LUCENE-2413: consolidate HTMLStripCharFilter into contrib/analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940768 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-05-04 08:33:52 +00:00
parent 1d1331fa03
commit a9ef636cb1
7 changed files with 11 additions and 8 deletions

View File

@ -159,6 +159,8 @@ New features
New features from Solr now available to Lucene users include:
- o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms
and phrases.
- o.a.l.analysis.charfilter.HTMLStripCharFilter: CharFilter that strips HTML
constructs.
(... in progress)
Build

View File

@ -1,4 +1,4 @@
package org.apache.solr.analysis;
package org.apache.lucene.analysis.charfilter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -30,8 +30,6 @@ import org.apache.lucene.analysis.CharStream;
/**
* A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
*
* @version $Id$
*/
public class HTMLStripCharFilter extends BaseCharFilter {
private int readAheadLimit = DEFAULT_READ_AHEAD;

View File

@ -1,4 +1,4 @@
package org.apache.solr.analysis;
package org.apache.lucene.analysis.charfilter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -21,6 +21,8 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashSet;
@ -69,8 +71,8 @@ public class HTMLStripCharFilterTest extends TestCase {
//Some sanity checks, but not a full-fledged check
public void testHTML() throws Exception {
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new FileReader(new File("htmlStripReaderTest.html"))));
InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html");
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
StringBuilder builder = new StringBuilder();
int ch = -1;
while ((ch = reader.read()) != -1){

View File

@ -16,7 +16,7 @@
*/
package org.apache.solr.handler.dataimport;
import org.apache.solr.analysis.HTMLStripCharFilter;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.CharReader;
import java.io.IOException;

View File

@ -19,6 +19,7 @@ package org.apache.solr.analysis;
*/
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
public class HTMLStripCharFilterFactory extends BaseCharFilterFactory {

View File

@ -311,7 +311,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
NamedList indexPart = textType.get("index");
assertNotNull("expecting an index token analysis for field type 'charfilthtmlmap'", indexPart);
assertEquals(" whátëvêr ", indexPart.get("org.apache.solr.analysis.HTMLStripCharFilter"));
assertEquals(" whátëvêr ", indexPart.get("org.apache.lucene.analysis.charfilter.HTMLStripCharFilter"));
assertEquals(" whatever ", indexPart.get("org.apache.lucene.analysis.charfilter.MappingCharFilter"));
List<NamedList> tokenList = (List<NamedList>)indexPart.get("org.apache.lucene.analysis.WhitespaceTokenizer");