LUCENE-2413: Consolidate KeepWords,HyphenatedWords,Trim filters to contrib/analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940962 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2010-05-04 17:07:28 +00:00
parent 1fdaf68d0d
commit 27502aa045
12 changed files with 127 additions and 110 deletions

View File

@@ -57,6 +57,12 @@ New features
   into subwords and performs optional transformations on subword groups.
 - o.a.l.analysis.miscellaneous.RemoveDuplicatesTokenFilter: TokenFilter which
   filters out Tokens at the same position and Term text as the previous token.
+- o.a.l.analysis.miscellaneous.TrimFilter: Trims leading and trailing whitespace
+  from Tokens in the stream.
+- o.a.l.analysis.miscellaneous.KeepWordFilter: A TokenFilter that only keeps tokens
+  with text contained in the required words (inverse of StopFilter).
+- o.a.l.analysis.miscellaneous.HyphenatedWordsFilter: A TokenFilter that puts
+  hyphenated words broken into two lines back together.
 - o.a.l.analysis.pattern: Package for pattern-based analysis, containing a
   CharFilter, Tokenizer, and TokenFilter for transforming text with regexes.
   (... in progress)
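A minimal usage sketch chaining the three consolidated filters behind a tokenizer, using only the constructors visible in the diffs below; the class name, the sample word set, and Version.LUCENE_CURRENT are illustrative assumptions, not part of this commit:

    import java.io.StringReader;
    import java.util.HashSet;
    import java.util.Set;

    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
    import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
    import org.apache.lucene.analysis.miscellaneous.TrimFilter;
    import org.apache.lucene.util.Version;

    public class MiscellaneousFiltersSketch {
      public static TokenStream chain(String text) {
        Set<String> keep = new HashSet<String>();
        keep.add("ecological"); // hypothetical keep-list

        // Split on whitespace, rejoin words broken across lines by a trailing
        // hyphen, trim token text, then keep only the listed words, ignoring case.
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(text));
        ts = new HyphenatedWordsFilter(ts);
        ts = new TrimFilter(ts, false);
        ts = new KeepWordFilter(ts, new CharArraySet(keep, true));
        return ts;
      }
    }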

View File

@@ -1,4 +1,4 @@
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more

View File

@@ -15,7 +15,7 @@
  * limitations under the License.
  */
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -29,14 +29,13 @@ import java.util.Set;
  * A TokenFilter that only keeps tokens with text contained in the
  * required words. This filter behaves like the inverse of StopFilter.
  *
- * @version $Id$
  * @since solr 1.3
  */
 public final class KeepWordFilter extends TokenFilter {
   private final CharArraySet words;
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  /** @deprecated Use {@link #KeepWordFilter(TokenStream, Set, boolean)} instead */
+  /** @deprecated Use {@link #KeepWordFilter(TokenStream, CharArraySet)} instead */
   @Deprecated
   public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
     this(in, new CharArraySet(words, ignoreCase));
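The deprecation above points callers at the CharArraySet constructor. A migration sketch (the keepOnly helper and its arguments are assumed, not part of this commit): build the set once and share it, instead of letting each filter instance convert the Set on construction:

    // Assumed helper illustrating the migration; not from this commit.
    static TokenStream keepOnly(TokenStream input, Set<String> words) {
      // Old (deprecated): new KeepWordFilter(input, words, true) rebuilt a
      // CharArraySet from the Set every time a filter was constructed.
      // New: construct the CharArraySet once and pass it in directly.
      CharArraySet set = new CharArraySet(words, /* ignoreCase */ true);
      return new KeepWordFilter(input, set);
    }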

View File

@@ -15,7 +15,7 @@
  * limitations under the License.
  */
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -26,8 +26,6 @@ import java.io.IOException;
 /**
  * Trims leading and trailing whitespace from Tokens in the stream.
- *
- * @version $Id:$
  */
 public final class TrimFilter extends TokenFilter {
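A quick sketch of the constructor's boolean flag, mirroring the TrimFilter tests further down; KeywordTokenizer is used because it emits the entire input as one token, so surrounding whitespace survives tokenization (snippet and its imports assumed, not from this commit):

    // updateOffsets=false: token text is trimmed, start/end offsets are left alone.
    TokenStream ts = new TrimFilter(
        new KeywordTokenizer(new StringReader(" trim me ")), false);
    // the stream now yields a single token with text "trim me"

    // updateOffsets=true would instead move the offsets inward so they cover
    // only the retained characters, as TestTrimFilter below asserts.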

View File

@@ -15,23 +15,23 @@
  * limitations under the License.
  */
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 import java.io.StringReader;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 /**
  * HyphenatedWordsFilter test
  */
-public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
+public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
   public void testHyphenatedWords() throws Exception {
     String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
     // first test
-    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
-    HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
-    ts = factory.create(ts);
+    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    ts = new HyphenatedWordsFilter(ts);
     assertTokenStreamContents(ts,
         new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" });
   }
@@ -42,9 +42,8 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
   public void testHyphenAtEnd() throws Exception {
     String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
     // first test
-    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
-    HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
-    ts = factory.create(ts);
+    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    ts = new HyphenatedWordsFilter(ts);
     assertTokenStreamContents(ts,
         new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
   }

View File

@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Set;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+/** Test {@link KeepWordFilter} */
+public class TestKeepWordFilter extends BaseTokenStreamTestCase {
+  public void testStopAndGo() throws Exception
+  {
+    Set<String> words = new HashSet<String>();
+    words.add( "aaa" );
+    words.add( "bbb" );
+    String input = "aaa BBB ccc ddd EEE";
+    // Test Stopwords
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    stream = new KeepWordFilter(stream, words, true);
+    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
+    // Now force case
+    stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    stream = new KeepWordFilter(stream, words, false);
+    assertTokenStreamContents(stream, new String[] { "aaa" });
+  }
+}

View File

@@ -15,13 +15,12 @@
  * limitations under the License.
  */
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 import java.io.IOException;
 import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
@@ -34,7 +33,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 /**
  * @version $Id:$
  */
-public class TestTrimFilter extends BaseTokenTestCase {
+public class TestTrimFilter extends BaseTokenStreamTestCase {
   public void testTrim() throws Exception {
     char[] a = " a ".toCharArray();
@@ -42,15 +41,13 @@ public class TestTrimFilter extends BaseTokenTestCase {
     char[] ccc = "cCc".toCharArray();
     char[] whitespace = " ".toCharArray();
     char[] empty = "".toCharArray();
-    TrimFilterFactory factory = new TrimFilterFactory();
-    Map<String,String> args = new HashMap<String,String>();
-    args.put("updateOffsets", "false");
-    factory.init(args);
-    TokenStream ts = factory.create(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
+    TokenStream ts = new IterTokenStream(new Token(a, 0, a.length, 1, 5),
         new Token(b, 0, b.length, 6, 10),
         new Token(ccc, 0, ccc.length, 11, 15),
         new Token(whitespace, 0, whitespace.length, 16, 20),
-        new Token(empty, 0, empty.length, 21, 21)));
+        new Token(empty, 0, empty.length, 21, 21));
+    ts = new TrimFilter(ts, false);
     assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});
@@ -58,15 +55,12 @@ public class TestTrimFilter extends BaseTokenTestCase {
     b = "b ".toCharArray();
     ccc = " c ".toCharArray();
     whitespace = " ".toCharArray();
-    factory = new TrimFilterFactory();
-    args = new HashMap<String,String>();
-    args.put("updateOffsets", "true");
-    factory.init(args);
-    ts = factory.create(new IterTokenStream(
+    ts = new IterTokenStream(
         new Token(a, 0, a.length, 0, 2),
         new Token(b, 0, b.length, 0, 2),
         new Token(ccc, 0, ccc.length, 0, 3),
-        new Token(whitespace, 0, whitespace.length, 0, 3)));
+        new Token(whitespace, 0, whitespace.length, 0, 3));
+    ts = new TrimFilter(ts, true);
     assertTokenStreamContents(ts,
         new String[] { "a", "b", "c", "" },

View File

@@ -18,6 +18,7 @@ package org.apache.solr.analysis;
  */
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
 import org.apache.solr.analysis.BaseTokenFilterFactory;
/**

View File

@@ -21,6 +21,7 @@ import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
 import java.util.Set;
 import java.io.IOException;

View File

@@ -20,6 +20,7 @@ package org.apache.solr.analysis;
 import java.util.Map;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.TrimFilter;
 import org.apache.solr.common.SolrException;
/**

View File

@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.analysis;
-import java.io.StringReader;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.core.SolrResourceLoader;
-/**
- * @version $Id$
- */
-public class TestKeepWordFilter extends BaseTokenTestCase {
-  public void testStopAndGo() throws Exception
-  {
-    Set<String> words = new HashSet<String>();
-    words.add( "aaa" );
-    words.add( "bbb" );
-    String input = "aaa BBB ccc ddd EEE";
-    Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
-    ResourceLoader loader = new SolrResourceLoader(null, null);
-    // Test Stopwords
-    KeepWordFilterFactory factory = new KeepWordFilterFactory();
-    args.put( "ignoreCase", "true" );
-    factory.init( args );
-    factory.inform( loader );
-    factory.setWords( words );
-    assertTrue(factory.isIgnoreCase());
-    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
-    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
-    // Test Stopwords (ignoreCase via the setter instead)
-    factory = new KeepWordFilterFactory();
-    args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
-    factory.init( args );
-    factory.inform( loader );
-    factory.setIgnoreCase(true);
-    factory.setWords( words );
-    assertTrue(factory.isIgnoreCase());
-    stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
-    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
-    // Now force case
-    factory = new KeepWordFilterFactory();
-    args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
-    args.put( "ignoreCase", "false" );
-    factory.init( args );
-    factory.inform( loader );
-    factory.setWords( words );
-    assertFalse(factory.isIgnoreCase());
-    stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
-    assertTokenStreamContents(stream, new String[] { "aaa" });
-  }
-}

View File

@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+/**
+ * Simple tests to ensure this factory is working
+ */
+public class TestTrimFilterFactory extends BaseTokenTestCase {
+  public void testTrimming() throws Exception {
+    TrimFilterFactory factory = new TrimFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("updateOffsets", "false");
+    factory.init(args);
+    TokenStream ts = factory.create(new KeywordTokenizer(new StringReader("trim me    ")));
+    assertTokenStreamContents(ts, new String[] { "trim me" });
+  }
+}