diff --git a/CHANGES.txt b/CHANGES.txt index 15ef2047ca2..63a2da4c3d2 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -32,7 +32,11 @@ New Features 1. SOLR-69: Adding MoreLikeThisHandler to search for similar documents using lucene contrib/queries MoreLikeThis. MoreLikeThis is also avaliable from the StandardRequestHandler using ?mlt=true. (bdelacretaz, ryan) - + + 2. SOLR-253: Adding KeepWordFilter and KeepWordFilterFactory. A TokenFilter + that keeps tokens with text in the registered keeplist. This behaves like + the inverse of StopFilter. (ryan) + Changes in runtime behavior Optimizations diff --git a/src/java/org/apache/solr/analysis/KeepWordFilter.java b/src/java/org/apache/solr/analysis/KeepWordFilter.java new file mode 100644 index 00000000000..df17de37196 --- /dev/null +++ b/src/java/org/apache/solr/analysis/KeepWordFilter.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.analysis; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; + +import java.io.IOException; +import java.util.Set; + +/** + * A TokenFilter that only keeps tokens with text contained in the + * required words. This filter behaves like the inverse of StopFilter. + * + * @author ryan + * @version $Id$ + * @since solr 1.3 + */ +public final class KeepWordFilter extends TokenFilter { + final Set words; + final boolean ignoreCase; + + public KeepWordFilter(TokenStream in, Set words, boolean ignoreCase ) { + super(in); + this.words=words; + this.ignoreCase=ignoreCase; + } + + @Override + public final Token next() throws IOException { + for (Token token=input.next(); token!=null; token=input.next()) { + String txt = ignoreCase + ? token.termText().toLowerCase() + : token.termText(); + + if( words.contains( txt ) ) { + return token; + } + } + return null; + } +} diff --git a/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java b/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java new file mode 100644 index 00000000000..d740261ddac --- /dev/null +++ b/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.analysis; + +import org.apache.solr.core.Config; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; + +import java.util.Map; +import java.util.List; +import java.util.Set; +import java.io.IOException; + +/** + * @author ryan + * @version $Id$ + * @since solr 1.3 + */ +public class KeepWordFilterFactory extends BaseTokenFilterFactory { + + private Set words; + private boolean ignoreCase; + + @Override + @SuppressWarnings("unchecked") + public void init(Map args) { + super.init(args); + String wordFile = args.get("words"); + ignoreCase = getBoolean("ignoreCase",false); + + if (wordFile != null) { + try { + List wlist = Config.getLines(wordFile); + words = StopFilter.makeStopSet( + (String[])wlist.toArray(new String[0]), ignoreCase); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + /** + * Set the keep word list. + * NOTE: if ignoreCase==true, the words are expected to be lowercase + */ + public void setWords(Set words) { + this.words = words; + } + + public void setIgnoreCase(boolean ignoreCase) { + this.ignoreCase = ignoreCase; + } + + public TokenStream create(TokenStream input) { + return new KeepWordFilter(input,words,ignoreCase); + } +} diff --git a/src/test/org/apache/solr/analysis/BaseTokenTestCase.java b/src/test/org/apache/solr/analysis/BaseTokenTestCase.java index 497d0922a5e..da639092f3d 100644 --- a/src/test/org/apache/solr/analysis/BaseTokenTestCase.java +++ b/src/test/org/apache/solr/analysis/BaseTokenTestCase.java @@ -150,7 +150,6 @@ public abstract class BaseTokenTestCase extends TestCase // These may be useful beyond test cases... //------------------------------------------------------------------------ - // This could probably be put in a utility class static List getTokens(TokenStream tstream) throws IOException { List tokens = new ArrayList(); while (true) { @@ -161,12 +160,14 @@ public abstract class BaseTokenTestCase extends TestCase return tokens; } - // This could probably be put in a utility class public static class IterTokenStream extends TokenStream { Iterator toks; public IterTokenStream(Token... toks) { this.toks = Arrays.asList(toks).iterator(); } + public IterTokenStream(Iterable toks) { + this.toks = toks.iterator(); + } public IterTokenStream(Iterator toks) { this.toks = toks; } diff --git a/src/test/org/apache/solr/analysis/TestKeepWordFilter.java b/src/test/org/apache/solr/analysis/TestKeepWordFilter.java new file mode 100644 index 00000000000..bb3007952b7 --- /dev/null +++ b/src/test/org/apache/solr/analysis/TestKeepWordFilter.java @@ -0,0 +1,62 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.analysis; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Token; + + +/** + * @version $Id:$ + */ +public class TestKeepWordFilter extends BaseTokenTestCase { + + public void testStopAndGo() throws Exception + { + Set words = new HashSet(); + words.add( "aaa" ); + words.add( "bbb" ); + + List input = tokens( "aaa BBB ccc ddd EEE" ); + Map args = new HashMap(); + + + // Test Stopwords + KeepWordFilterFactory factory = new KeepWordFilterFactory(); + args.put( "ignoreCase", "true" ); + factory.init( args ); + factory.setWords( words ); + + List expect = tokens( "aaa BBB" ); + List real = getTokens(factory.create( new IterTokenStream(input) )); + assertTokEqual( expect, real ); + + // Now force case + args.put( "ignoreCase", "false" ); + factory.init( args ); + + expect = tokens( "aaa" ); + real = getTokens(factory.create( new IterTokenStream(input) )); + assertTokEqual( expect, real ); + } +}