mirror of https://github.com/apache/lucene.git
LUCENE-2413: Consolidate KeepWords,HyphenatedWords,Trim filters to contrib/analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940962 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1fdaf68d0d
commit
27502aa045
|
@ -57,6 +57,12 @@ New features
|
|||
into subwords and performs optional transformations on subword groups.
|
||||
- o.a.l.analysis.miscellaneous.RemoveDuplicatesTokenFilter: TokenFilter which
|
||||
filters out Tokens at the same position and Term text as the previous token.
|
||||
- o.a.l.analysis.miscellaneous.TrimFilter: Trims leading and trailing whitespace
|
||||
from Tokens in the stream.
|
||||
- o.a.l.analysis.miscellaneous.KeepWordFilter: A TokenFilter that only keeps tokens
|
||||
with text contained in the required words (inverse of StopFilter).
|
||||
- o.a.l.analysis.miscellaneous.HyphenatedWordsFilter: A TokenFilter that puts
|
||||
hyphenated words broken into two lines back together.
|
||||
- o.a.l.analysis.pattern: Package for pattern-based analysis, containing a
|
||||
CharFilter, Tokenizer, and TokenFilter for transforming text with regexes.
|
||||
(... in progress)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
@ -15,7 +15,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -29,14 +29,13 @@ import java.util.Set;
|
|||
* A TokenFilter that only keeps tokens with text contained in the
|
||||
* required words. This filter behaves like the inverse of StopFilter.
|
||||
*
|
||||
* @version $Id$
|
||||
* @since solr 1.3
|
||||
*/
|
||||
public final class KeepWordFilter extends TokenFilter {
|
||||
private final CharArraySet words;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
|
||||
/** @deprecated Use {@link #KeepWordFilter(TokenStream, Set, boolean)} instead */
|
||||
/** @deprecated Use {@link #KeepWordFilter(TokenStream, CharArraySet)} instead */
|
||||
@Deprecated
|
||||
public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
|
||||
this(in, new CharArraySet(words, ignoreCase));
|
|
@ -15,7 +15,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -26,8 +26,6 @@ import java.io.IOException;
|
|||
|
||||
/**
|
||||
* Trims leading and trailing whitespace from Tokens in the stream.
|
||||
*
|
||||
* @version $Id:$
|
||||
*/
|
||||
public final class TrimFilter extends TokenFilter {
|
||||
|
|
@ -15,23 +15,23 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* HyphenatedWordsFilter test
|
||||
*/
|
||||
public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
|
||||
public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
|
||||
public void testHyphenatedWords() throws Exception {
|
||||
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
|
||||
// first test
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
||||
HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
|
||||
ts = factory.create(ts);
|
||||
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
ts = new HyphenatedWordsFilter(ts);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" });
|
||||
}
|
||||
|
@ -42,9 +42,8 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
|
|||
public void testHyphenAtEnd() throws Exception {
|
||||
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
|
||||
// first test
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
||||
HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
|
||||
ts = factory.create(ts);
|
||||
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
ts = new HyphenatedWordsFilter(ts);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/** Test {@link KeepWordFilter} */
|
||||
public class TestKeepWordFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testStopAndGo() throws Exception
|
||||
{
|
||||
Set<String> words = new HashSet<String>();
|
||||
words.add( "aaa" );
|
||||
words.add( "bbb" );
|
||||
|
||||
String input = "aaa BBB ccc ddd EEE";
|
||||
|
||||
// Test Stopwords
|
||||
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
stream = new KeepWordFilter(stream, words, true);
|
||||
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
|
||||
|
||||
// Now force case
|
||||
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
stream = new KeepWordFilter(stream, words, false);
|
||||
assertTokenStreamContents(stream, new String[] { "aaa" });
|
||||
}
|
||||
}
|
|
@ -15,13 +15,12 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
|
@ -34,7 +33,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
|||
/**
|
||||
* @version $Id:$
|
||||
*/
|
||||
public class TestTrimFilter extends BaseTokenTestCase {
|
||||
public class TestTrimFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testTrim() throws Exception {
|
||||
char[] a = " a ".toCharArray();
|
||||
|
@ -42,15 +41,13 @@ public class TestTrimFilter extends BaseTokenTestCase {
|
|||
char[] ccc = "cCc".toCharArray();
|
||||
char[] whitespace = " ".toCharArray();
|
||||
char[] empty = "".toCharArray();
|
||||
TrimFilterFactory factory = new TrimFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("updateOffsets", "false");
|
||||
factory.init(args);
|
||||
TokenStream ts = factory.create(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
|
||||
|
||||
TokenStream ts = new IterTokenStream(new Token(a, 0, a.length, 1, 5),
|
||||
new Token(b, 0, b.length, 6, 10),
|
||||
new Token(ccc, 0, ccc.length, 11, 15),
|
||||
new Token(whitespace, 0, whitespace.length, 16, 20),
|
||||
new Token(empty, 0, empty.length, 21, 21)));
|
||||
new Token(empty, 0, empty.length, 21, 21));
|
||||
ts = new TrimFilter(ts, false);
|
||||
|
||||
assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});
|
||||
|
||||
|
@ -58,15 +55,12 @@ public class TestTrimFilter extends BaseTokenTestCase {
|
|||
b = "b ".toCharArray();
|
||||
ccc = " c ".toCharArray();
|
||||
whitespace = " ".toCharArray();
|
||||
factory = new TrimFilterFactory();
|
||||
args = new HashMap<String,String>();
|
||||
args.put("updateOffsets", "true");
|
||||
factory.init(args);
|
||||
ts = factory.create(new IterTokenStream(
|
||||
ts = new IterTokenStream(
|
||||
new Token(a, 0, a.length, 0, 2),
|
||||
new Token(b, 0, b.length, 0, 2),
|
||||
new Token(ccc, 0, ccc.length, 0, 3),
|
||||
new Token(whitespace, 0, whitespace.length, 0, 3)));
|
||||
new Token(whitespace, 0, whitespace.length, 0, 3));
|
||||
ts = new TrimFilter(ts, true);
|
||||
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "a", "b", "c", "" },
|
|
@ -18,6 +18,7 @@ package org.apache.solr.analysis;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
|
||||
import org.apache.solr.analysis.BaseTokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.solr.common.ResourceLoader;
|
|||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
|
||||
|
||||
import java.util.Set;
|
||||
import java.io.IOException;
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.solr.analysis;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
|
||||
import org.apache.solr.common.SolrException;
|
||||
|
||||
/**
|
||||
|
|
|
@ -1,79 +0,0 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
|
||||
|
||||
/**
|
||||
* @version $Id$
|
||||
*/
|
||||
public class TestKeepWordFilter extends BaseTokenTestCase {
|
||||
|
||||
public void testStopAndGo() throws Exception
|
||||
{
|
||||
Set<String> words = new HashSet<String>();
|
||||
words.add( "aaa" );
|
||||
words.add( "bbb" );
|
||||
|
||||
String input = "aaa BBB ccc ddd EEE";
|
||||
Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
|
||||
ResourceLoader loader = new SolrResourceLoader(null, null);
|
||||
|
||||
// Test Stopwords
|
||||
KeepWordFilterFactory factory = new KeepWordFilterFactory();
|
||||
args.put( "ignoreCase", "true" );
|
||||
factory.init( args );
|
||||
factory.inform( loader );
|
||||
factory.setWords( words );
|
||||
assertTrue(factory.isIgnoreCase());
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
|
||||
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
|
||||
|
||||
// Test Stopwords (ignoreCase via the setter instead)
|
||||
factory = new KeepWordFilterFactory();
|
||||
args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
|
||||
factory.init( args );
|
||||
factory.inform( loader );
|
||||
factory.setIgnoreCase(true);
|
||||
factory.setWords( words );
|
||||
assertTrue(factory.isIgnoreCase());
|
||||
stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
|
||||
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
|
||||
|
||||
// Now force case
|
||||
factory = new KeepWordFilterFactory();
|
||||
args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
|
||||
args.put( "ignoreCase", "false" );
|
||||
factory.init( args );
|
||||
factory.inform( loader );
|
||||
factory.setWords( words );
|
||||
assertFalse(factory.isIgnoreCase());
|
||||
stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
|
||||
assertTokenStreamContents(stream, new String[] { "aaa" });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure this factory is working
|
||||
*/
|
||||
public class TestTrimFilterFactory extends BaseTokenTestCase {
|
||||
public void testTrimming() throws Exception {
|
||||
TrimFilterFactory factory = new TrimFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("updateOffsets", "false");
|
||||
factory.init(args);
|
||||
TokenStream ts = factory.create(new KeywordTokenizer(new StringReader("trim me ")));
|
||||
assertTokenStreamContents(ts, new String[] { "trim me" });
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue