mirror of https://github.com/apache/lucene.git
SOLR-253 -- adding KeepWordFilter, the inverse of StopFilter.
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@544328 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
407ca95deb
commit
cc053495de
|
@ -32,7 +32,11 @@ New Features
|
|||
1. SOLR-69: Adding MoreLikeThisHandler to search for similar documents using
|
||||
lucene contrib/queries MoreLikeThis. MoreLikeThis is also avaliable from
|
||||
the StandardRequestHandler using ?mlt=true. (bdelacretaz, ryan)
|
||||
|
||||
|
||||
2. SOLR-253: Adding KeepWordFilter and KeepWordFilterFactory. A TokenFilter
|
||||
that keeps tokens with text in the registered keeplist. This behaves like
|
||||
the inverse of StopFilter. (ryan)
|
||||
|
||||
Changes in runtime behavior
|
||||
|
||||
Optimizations
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* A TokenFilter that only keeps tokens with text contained in the
|
||||
* required words. This filter behaves like the inverse of StopFilter.
|
||||
*
|
||||
* @author ryan
|
||||
* @version $Id$
|
||||
* @since solr 1.3
|
||||
*/
|
||||
public final class KeepWordFilter extends TokenFilter {
|
||||
final Set<String> words;
|
||||
final boolean ignoreCase;
|
||||
|
||||
public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
|
||||
super(in);
|
||||
this.words=words;
|
||||
this.ignoreCase=ignoreCase;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Token next() throws IOException {
|
||||
for (Token token=input.next(); token!=null; token=input.next()) {
|
||||
String txt = ignoreCase
|
||||
? token.termText().toLowerCase()
|
||||
: token.termText();
|
||||
|
||||
if( words.contains( txt ) ) {
|
||||
return token;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,73 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.solr.core.Config;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* @author ryan
|
||||
* @version $Id$
|
||||
* @since solr 1.3
|
||||
*/
|
||||
public class KeepWordFilterFactory extends BaseTokenFilterFactory {
|
||||
|
||||
private Set<String> words;
|
||||
private boolean ignoreCase;
|
||||
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
String wordFile = args.get("words");
|
||||
ignoreCase = getBoolean("ignoreCase",false);
|
||||
|
||||
if (wordFile != null) {
|
||||
try {
|
||||
List<String> wlist = Config.getLines(wordFile);
|
||||
words = StopFilter.makeStopSet(
|
||||
(String[])wlist.toArray(new String[0]), ignoreCase);
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the keep word list.
|
||||
* NOTE: if ignoreCase==true, the words are expected to be lowercase
|
||||
*/
|
||||
public void setWords(Set<String> words) {
|
||||
this.words = words;
|
||||
}
|
||||
|
||||
public void setIgnoreCase(boolean ignoreCase) {
|
||||
this.ignoreCase = ignoreCase;
|
||||
}
|
||||
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new KeepWordFilter(input,words,ignoreCase);
|
||||
}
|
||||
}
|
|
@ -150,7 +150,6 @@ public abstract class BaseTokenTestCase extends TestCase
|
|||
// These may be useful beyond test cases...
|
||||
//------------------------------------------------------------------------
|
||||
|
||||
// This could probably be put in a utility class
|
||||
static List<Token> getTokens(TokenStream tstream) throws IOException {
|
||||
List<Token> tokens = new ArrayList<Token>();
|
||||
while (true) {
|
||||
|
@ -161,12 +160,14 @@ public abstract class BaseTokenTestCase extends TestCase
|
|||
return tokens;
|
||||
}
|
||||
|
||||
// This could probably be put in a utility class
|
||||
public static class IterTokenStream extends TokenStream {
|
||||
Iterator<Token> toks;
|
||||
public IterTokenStream(Token... toks) {
|
||||
this.toks = Arrays.asList(toks).iterator();
|
||||
}
|
||||
public IterTokenStream(Iterable<Token> toks) {
|
||||
this.toks = toks.iterator();
|
||||
}
|
||||
public IterTokenStream(Iterator<Token> toks) {
|
||||
this.toks = toks;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
|
||||
/**
|
||||
* @version $Id:$
|
||||
*/
|
||||
public class TestKeepWordFilter extends BaseTokenTestCase {
|
||||
|
||||
public void testStopAndGo() throws Exception
|
||||
{
|
||||
Set<String> words = new HashSet<String>();
|
||||
words.add( "aaa" );
|
||||
words.add( "bbb" );
|
||||
|
||||
List<Token> input = tokens( "aaa BBB ccc ddd EEE" );
|
||||
Map<String,String> args = new HashMap<String, String>();
|
||||
|
||||
|
||||
// Test Stopwords
|
||||
KeepWordFilterFactory factory = new KeepWordFilterFactory();
|
||||
args.put( "ignoreCase", "true" );
|
||||
factory.init( args );
|
||||
factory.setWords( words );
|
||||
|
||||
List<Token> expect = tokens( "aaa BBB" );
|
||||
List<Token> real = getTokens(factory.create( new IterTokenStream(input) ));
|
||||
assertTokEqual( expect, real );
|
||||
|
||||
// Now force case
|
||||
args.put( "ignoreCase", "false" );
|
||||
factory.init( args );
|
||||
|
||||
expect = tokens( "aaa" );
|
||||
real = getTokens(factory.create( new IterTokenStream(input) ));
|
||||
assertTokEqual( expect, real );
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue