mirror of https://github.com/apache/lucene.git
LUCENE-2413: consolidate HTMLStripCharFilter into contrib/analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940768 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 1d1331fa03
commit a9ef636cb1
@@ -159,6 +159,8 @@ New features
 New features from Solr now available to Lucene users include:
 - o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms
   and phrases.
+- o.a.l.analysis.charfilter.HTMLStripCharFilter: CharFilter that strips HTML
+  constructs.
 (... in progress)

 Build
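
For context on the changelog entry above, here is a minimal sketch of how a Lucene user could consume the relocated filter. It mirrors the usage visible in the test change further down in this commit; the class name HTMLStripExample and the sample input string are illustrative only, not part of the commit.

import java.io.StringReader;

import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

public class HTMLStripExample {
  public static void main(String[] args) throws Exception {
    // Wrap any Reader; HTML constructs are stripped as characters are read.
    HTMLStripCharFilter filter = new HTMLStripCharFilter(
        CharReader.get(new StringReader("<p>hello <b>world</b></p>")));

    // Drain the filtered characters, as the test below does.
    StringBuilder out = new StringBuilder();
    int ch;
    while ((ch = filter.read()) != -1) {
      out.append((char) ch);
    }
    System.out.println(out);  // the remaining text with the markup stripped
  }
}
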
@@ -1,4 +1,4 @@
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.charfilter;

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -30,8 +30,6 @@ import org.apache.lucene.analysis.CharStream;

 /**
  * A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
- *
- * @version $Id$
  */
 public class HTMLStripCharFilter extends BaseCharFilter {
   private int readAheadLimit = DEFAULT_READ_AHEAD;
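
The Solr-era class also exposed constructors taking a set of tags to leave un-stripped and a custom read-ahead limit (the readAheadLimit field above). Assuming those constructors carry over unchanged with the package move (they are not shown in this hunk), usage would look roughly like the sketch below; the "pre" tag and the 8192 limit are arbitrary illustrative values.

import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

public class EscapedTagsExample {
  public static void main(String[] args) throws Exception {
    // Tags in this set are passed through rather than stripped.
    Set<String> escapedTags = new HashSet<String>();
    escapedTags.add("pre");

    // Assumed constructor: HTMLStripCharFilter(CharStream, Set<String>, int readAheadLimit);
    // 8192 stands in for DEFAULT_READ_AHEAD.
    HTMLStripCharFilter filter = new HTMLStripCharFilter(
        CharReader.get(new StringReader("<div><pre>a &lt; b</pre></div>")),
        escapedTags, 8192);

    int ch;
    while ((ch = filter.read()) != -1) {
      System.out.print((char) ch);
    }
  }
}
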
@@ -1,4 +1,4 @@
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.charfilter;

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,6 +21,8 @@ import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.HashSet;
@@ -69,8 +71,8 @@ public class HTMLStripCharFilterTest extends TestCase {

   //Some sanity checks, but not a full-fledged check
   public void testHTML() throws Exception {
-
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new FileReader(new File("htmlStripReaderTest.html"))));
+    InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html");
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
     StringBuilder builder = new StringBuilder();
     int ch = -1;
     while ((ch = reader.read()) != -1){
@@ -16,7 +16,7 @@
  */
 package org.apache.solr.handler.dataimport;

-import org.apache.solr.analysis.HTMLStripCharFilter;
+import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
 import org.apache.lucene.analysis.CharReader;

 import java.io.IOException;
@@ -19,6 +19,7 @@ package org.apache.solr.analysis;
  */

 import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

 public class HTMLStripCharFilterFactory extends BaseCharFilterFactory {

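
Since only the import changes in this hunk, the Solr factory presumably keeps delegating to the now Lucene-hosted class. A rough sketch of that delegation, assuming the create(CharStream) contract of BaseCharFilterFactory; this is not the verbatim file contents.

package org.apache.solr.analysis;

import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

public class HTMLStripCharFilterFactory extends BaseCharFilterFactory {

  // Wrap the incoming CharStream with the relocated HTML-stripping filter.
  public HTMLStripCharFilter create(CharStream input) {
    return new HTMLStripCharFilter(input);
  }
}
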
@@ -311,7 +311,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
     NamedList indexPart = textType.get("index");
     assertNotNull("expecting an index token analysis for field type 'charfilthtmlmap'", indexPart);

-    assertEquals(" whátëvêr ", indexPart.get("org.apache.solr.analysis.HTMLStripCharFilter"));
+    assertEquals(" whátëvêr ", indexPart.get("org.apache.lucene.analysis.charfilter.HTMLStripCharFilter"));
     assertEquals(" whatever ", indexPart.get("org.apache.lucene.analysis.charfilter.MappingCharFilter"));

     List<NamedList> tokenList = (List<NamedList>)indexPart.get("org.apache.lucene.analysis.WhitespaceTokenizer");