mirror of https://github.com/apache/lucene.git
LUCENE-9574 Add DropIfFlaggedFilterFactory (#1979)
This commit is contained in:
parent
b43c389386
commit
ab5671d367
|
@ -190,6 +190,13 @@ Other
|
|||
|
||||
* LUCENE-9544: add regenerate gradle script for nori dictionary (Namgyu Kim)
|
||||
|
||||
======================= Lucene 8.8.0 =======================
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
||||
* LUCENE-9574: Add a token filter that drops tokens matching all of the specified flags.
|
||||
|
||||
======================= Lucene 8.7.0 =======================
|
||||
|
||||
API Changes
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import org.apache.lucene.analysis.FilteringTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
|
||||
/**
|
||||
* Allows Tokens with a given combination of flags to be dropped. If all flags specified are present
|
||||
* the token is dropped, otherwise it is retained.
|
||||
*
|
||||
* @see DropIfFlaggedFilterFactory
|
||||
* @since 8.8.0
|
||||
*/
|
||||
public final class DropIfFlaggedFilter extends FilteringTokenFilter {
|
||||
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
|
||||
private final int dropFlags;
|
||||
|
||||
/**
|
||||
* Construct a token stream filtering the given input.
|
||||
*
|
||||
* @param input the source stream
|
||||
* @param dropFlags a combination of flags that indicates that the token should be dropped.
|
||||
*/
|
||||
public DropIfFlaggedFilter(TokenStream input, int dropFlags) {
|
||||
super(input);
|
||||
this.dropFlags = dropFlags;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean accept() {
|
||||
return (flagsAtt.getFlags() & dropFlags) != dropFlags;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,71 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Provides a filter that will drop tokens matching a set of flags. This might be used if you had
|
||||
* both custom filters that identify tokens to be removed, but need to run before other filters that
|
||||
* want to see the token that will eventually be dropped. Alternately you might have separate flag setting
|
||||
* filters and then remove tokens that match a particular combination of those filters.<br>
|
||||
* <br>
|
||||
* In Solr this might be configured such as
|
||||
* <pre class="prettyprint">
|
||||
* <analyzer type="index">
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* <-- other filters -->
|
||||
* <filter class="solr.DropIfFlaggedFilterFactory" dropFlags="9"/>
|
||||
* </analyzer>
|
||||
* </pre>
|
||||
* The above would drop any token that had the first and fourth bit set.
|
||||
*
|
||||
* @since 8.8.0
|
||||
* @lucene.spi {@value #NAME}
|
||||
*/
|
||||
public final class DropIfFlaggedFilterFactory extends TokenFilterFactory {
|
||||
/**
|
||||
* SPI name
|
||||
*/
|
||||
public static final String NAME = "dropIfFlagged";
|
||||
|
||||
private final int dropFlags;
|
||||
|
||||
/**
|
||||
* Initialize this factory via a set of key-value pairs.
|
||||
*/
|
||||
public DropIfFlaggedFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
dropFlags = getInt(args,"dropFlags", 2);
|
||||
|
||||
}
|
||||
|
||||
/** Default ctor for compatibility with SPI */
|
||||
public DropIfFlaggedFilterFactory() {
|
||||
throw defaultCtorException();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new DropIfFlaggedFilter(input, dropFlags);
|
||||
}
|
||||
}
|
|
@ -70,6 +70,7 @@ org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
|
|||
org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.DropIfFlaggedFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CannedTokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Test that this filter removes tokens that match a particular set of flags.
|
||||
*/
|
||||
public class TestDropIfFlaggedFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
/**
|
||||
* Test the straight forward cases. When all flags match the token should be dropped
|
||||
*/
|
||||
public void testDropped() throws Exception {
|
||||
|
||||
Token token = new Token("foo", 0, 2);
|
||||
Token token2 = new Token("bar", 4, 6);
|
||||
Token token3 = new Token("baz", 8, 10);
|
||||
Token token4 = new Token("bam", 12, 14);
|
||||
|
||||
token.setFlags(0); // 000 no flags match
|
||||
token2.setFlags(1);// 001 one flag matches
|
||||
token3.setFlags(2);// 010 no flags match
|
||||
token4.setFlags(7);// 111 both flags match (drop)
|
||||
|
||||
TokenStream ts = new CannedTokenStream(token, token2, token3, token4);
|
||||
ts = new DropIfFlaggedFilter(ts, 5); // 101
|
||||
|
||||
assertTokenStreamContents(ts, new String[]{
|
||||
"foo", "bar", "baz"}, new int[]{0, 4, 8}, new int[]{2, 6, 10}, new int[]{1, 1, 1});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test where the first and last token are dropped.
|
||||
*/
|
||||
public void testDroppedFirst() throws Exception {
|
||||
|
||||
Token token = new Token("foo", 0, 2);
|
||||
Token token2 = new Token("bar", 4, 6);
|
||||
Token token3 = new Token("baz", 8, 10);
|
||||
Token token4 = new Token("bam", 12, 14);
|
||||
|
||||
token.setFlags(4); // 100 flag matches (drop)
|
||||
token2.setFlags(1);// 001 no flags match
|
||||
token3.setFlags(2);// 010 no flags match
|
||||
token4.setFlags(7);// 111 flag matches (drop)
|
||||
|
||||
TokenStream ts = new CannedTokenStream(token, token2, token3, token4);
|
||||
ts = new DropIfFlaggedFilter(ts, 4) ;
|
||||
|
||||
assertTokenStreamContents(ts, new String[]{
|
||||
"bar", "baz"}, new int[]{ 4, 8}, new int[]{6, 10}, new int[]{2, 1});
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
|
||||
import org.apache.lucene.analysis.CannedTokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* This test just ensures the factory works, detailed tests in {@link TestDropIfFlaggedFilter}
|
||||
*/
|
||||
public class TestDropIfFlaggedFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
|
||||
private static final Token[] TOKENS = { token("foo",1,0,2), token("bar",3, 4,6) };
|
||||
|
||||
public void testFactory() throws Exception {
|
||||
TokenStream stream = new CannedTokenStream(TOKENS);
|
||||
TokenFilterFactory tokenFilterFactory = tokenFilterFactory("dropIfFlagged", "flags", "2");
|
||||
stream = tokenFilterFactory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "foo" }, null, null, new String[] { "word",}, new int[] { 1 });
|
||||
}
|
||||
|
||||
private static Token token(String term, int flags, int soff, int eoff) {
|
||||
Token token = new Token();
|
||||
token.setEmpty();
|
||||
token.append(term);
|
||||
token.setFlags(flags);
|
||||
token.setOffset(soff,eoff);
|
||||
return token;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue