LUCENE-3671: Add TypeTokenFilter that filters tokens based on their TypeAttribute

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234396 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2012-01-21 19:02:44 +00:00
parent 9b86beea98
commit af9b4d816f
3 changed files with 142 additions and 0 deletions

View File

@ -790,6 +790,9 @@ New Features
input mapping to it) for FSTs that have strictly monotonic long input mapping to it) for FSTs that have strictly monotonic long
outputs (such as an ord). (Mike McCandless) outputs (such as an ord). (Mike McCandless)
* LUCENE-3121: Add TypeTokenFilter that filters tokens based on
their TypeAttribute. (Tommaso Teofili via Uwe Schindler)
Bug fixes Bug fixes
* LUCENE-3595: Fixed FieldCacheRangeFilter and FieldCacheTermsFilter * LUCENE-3595: Fixed FieldCacheRangeFilter and FieldCacheTermsFilter

View File

@ -0,0 +1,47 @@
package org.apache.lucene.analysis.core;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import java.io.IOException;
import java.util.Set;
/**
* Removes tokens whose types appear in a set of blocked types from a token stream.
*/
public final class TypeTokenFilter extends FilteringTokenFilter {
private final Set<String> stopTypes;
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) {
super(enablePositionIncrements, input);
this.stopTypes = stopTypes;
}
/**
* Returns the next input Token whose typeAttribute.type() is not a stop type.
*/
@Override
protected boolean accept() throws IOException {
return !stopTypes.contains(typeAttribute.type());
}
}

View File

@ -0,0 +1,92 @@
package org.apache.lucene.analysis.core;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.English;
import java.io.IOException;
import java.io.StringReader;
import java.util.Set;
public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
public void testTypeFilter() throws IOException {
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
Set<String> stopTypes = asSet("<NUM>");
TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes);
assertTokenStreamContents(stream, new String[]{"is", "palindrome", "while", "is", "not"});
}
/**
* Test Position increments applied by TypeTokenFilter with and without enabling this option.
*/
public void testStopPositons() throws IOException {
StringBuilder sb = new StringBuilder();
for (int i = 10; i < 20; i++) {
if (i % 3 != 0) {
sb.append(i).append(" ");
} else {
String w = English.intToEnglish(i).trim();
sb.append(w).append(" ");
}
}
log(sb.toString());
String stopTypes[] = new String[]{"<NUM>"};
Set<String> stopSet = asSet(stopTypes);
// with increments
StringReader reader = new StringReader(sb.toString());
TypeTokenFilter typeTokenFilter = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
testPositons(typeTokenFilter);
// without increments
reader = new StringReader(sb.toString());
typeTokenFilter = new TypeTokenFilter(false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
testPositons(typeTokenFilter);
}
private void testPositons(TypeTokenFilter stpf) throws IOException {
TypeAttribute typeAtt = stpf.getAttribute(TypeAttribute.class);
CharTermAttribute termAttribute = stpf.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
stpf.reset();
boolean enablePositionIncrements = stpf.getEnablePositionIncrements();
while (stpf.incrementToken()) {
log("Token: " + termAttribute.toString() + ": " + typeAtt.type() + " - " + posIncrAtt.getPositionIncrement());
assertEquals("if position increment is enabled the positionIncrementAttribute value should be 3, otherwise 1",
posIncrAtt.getPositionIncrement(), enablePositionIncrements ? 3 : 1);
}
stpf.end();
stpf.close();
}
// print debug info depending on VERBOSE
private static void log(String s) {
if (VERBOSE) {
System.out.println(s);
}
}
}