LUCENE-9574 Add DropIfFlaggedFilterFactory (#1979)

This commit is contained in:
Gus Heck 2020-10-14 13:26:35 -04:00 committed by GitHub
parent b43c389386
commit ab5671d367
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 251 additions and 0 deletions

View File

@ -190,6 +190,13 @@ Other
* LUCENE-9544: add regenerate gradle script for nori dictionary (Namgyu Kim)
======================= Lucene 8.8.0 =======================
New Features
---------------------
* LUCENE-9574: Add a token filter to drop tokens that match all specified flags. (Gus Heck)
======================= Lucene 8.7.0 =======================
API Changes

View File

@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
/**
 * Token filter that removes every token whose flags contain <em>all</em> bits of a configured
 * mask. A token that is missing at least one of the mask bits is passed through unchanged.
 *
 * <p>Note that a mask of {@code 0} has no bits that could be missing, so it matches (and therefore
 * drops) every token.
 *
 * @see DropIfFlaggedFilterFactory
 * @since 8.8.0
 */
public final class DropIfFlaggedFilter extends FilteringTokenFilter {

  /** Bit mask; a token is dropped only when each of these bits is set in its flags. */
  private final int dropFlags;

  /** Flags of the token currently being examined. */
  private final FlagsAttribute flagsAttribute = addAttribute(FlagsAttribute.class);

  /**
   * Creates a filter over the given stream.
   *
   * @param input the source stream
   * @param dropFlags the combination of flag bits that marks a token for removal
   */
  public DropIfFlaggedFilter(TokenStream input, int dropFlags) {
    super(input);
    this.dropFlags = dropFlags;
  }

  /**
   * Decides whether the current token is kept.
   *
   * @return {@code true} if at least one bit of the mask is absent from the token's flags,
   *     {@code false} if the token carries every bit of the mask
   */
  @Override
  protected boolean accept() {
    // Bits that the mask requires but the current token does not have.
    final int missingBits = dropFlags & ~flagsAttribute.getFlags();
    return missingBits != 0;
  }
}

View File

@ -0,0 +1,71 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
/**
 * Factory for {@link DropIfFlaggedFilter}, which drops tokens whose flags match a given bit mask.
 * This is useful when a custom filter identifies tokens that should be removed, but other filters
 * further down the chain still need to see those tokens before they are finally dropped.
 * Alternatively, several independent flag-setting filters can each mark tokens, and this filter
 * then removes the tokens that carry a particular combination of those flags.<br>
 * <br>
 * In Solr this might be configured such as
 * <pre class="prettyprint">
 * &lt;analyzer type="index"&gt;
 *   &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
 *   &lt;!-- other filters --&gt;
 *   &lt;filter class="solr.DropIfFlaggedFilterFactory" dropFlags="9"/&gt;
 * &lt;/analyzer&gt;
 * </pre>
 * The above would drop any token that had both the first and the fourth bit set. When
 * {@code dropFlags} is not supplied, a mask of {@code 2} is used.
 *
 * @since 8.8.0
 * @lucene.spi {@value #NAME}
 */
public final class DropIfFlaggedFilterFactory extends TokenFilterFactory {

  /**
   * SPI name
   */
  public static final String NAME = "dropIfFlagged";

  /** Name of the configuration argument holding the flag mask. */
  private static final String DROP_FLAGS_ARG = "dropFlags";

  /** Mask applied when the argument is absent. */
  private static final int DEFAULT_DROP_FLAGS = 2;

  private final int dropFlags;

  /**
   * Initialize this factory via a set of key-value pairs.
   *
   * @param args configuration arguments; {@code dropFlags} is read as an integer bit mask
   */
  public DropIfFlaggedFilterFactory(Map<String, String> args) {
    super(args);
    // NOTE(review): no check for leftover/unrecognized arguments is made here - confirm whether
    // that is intended.
    this.dropFlags = getInt(args, DROP_FLAGS_ARG, DEFAULT_DROP_FLAGS);
  }

  /** Default ctor for compatibility with SPI */
  public DropIfFlaggedFilterFactory() {
    throw defaultCtorException();
  }

  /**
   * Wraps the given stream in a {@link DropIfFlaggedFilter} configured with this factory's mask.
   *
   * @param input the stream to filter
   * @return the filtering stream
   */
  @Override
  public TokenStream create(TokenStream input) {
    final DropIfFlaggedFilter filter = new DropIfFlaggedFilter(input, dropFlags);
    return filter;
  }
}

View File

@ -70,6 +70,7 @@ org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilterFactory
org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.DropIfFlaggedFilterFactory
org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilterFactory
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory

View File

@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
 * Verifies that {@link DropIfFlaggedFilter} removes exactly those tokens whose flags contain every
 * bit of the configured mask.
 */
public class TestDropIfFlaggedFilter extends BaseTokenStreamTestCase {

  /**
   * A two-bit mask: only the token carrying both bits is removed, partial matches are retained.
   */
  public void testDropped() throws Exception {
    TokenStream stream = new CannedTokenStream(
        flagged("foo", 0, 2, 0),    // 000 shares no bit with the mask
        flagged("bar", 4, 6, 1),    // 001 has only one of the two mask bits
        flagged("baz", 8, 10, 2),   // 010 shares no bit with the mask
        flagged("bam", 12, 14, 7)); // 111 has both mask bits (dropped)
    stream = new DropIfFlaggedFilter(stream, 5); // mask 101

    String[] expectedTerms = {"foo", "bar", "baz"};
    int[] expectedStartOffsets = {0, 4, 8};
    int[] expectedEndOffsets = {2, 6, 10};
    int[] expectedPosIncrements = {1, 1, 1};
    assertTokenStreamContents(
        stream, expectedTerms, expectedStartOffsets, expectedEndOffsets, expectedPosIncrements);
  }

  /**
   * A single-bit mask where both the first and the last token are removed; the first surviving
   * token must report a position increment of 2 to account for the dropped leading token.
   */
  public void testDroppedFirst() throws Exception {
    TokenStream stream = new CannedTokenStream(
        flagged("foo", 0, 2, 4),    // 100 has the mask bit (dropped)
        flagged("bar", 4, 6, 1),    // 001 lacks the mask bit
        flagged("baz", 8, 10, 2),   // 010 lacks the mask bit
        flagged("bam", 12, 14, 7)); // 111 has the mask bit (dropped)
    stream = new DropIfFlaggedFilter(stream, 4); // mask 100

    String[] expectedTerms = {"bar", "baz"};
    int[] expectedStartOffsets = {4, 8};
    int[] expectedEndOffsets = {6, 10};
    int[] expectedPosIncrements = {2, 1};
    assertTokenStreamContents(
        stream, expectedTerms, expectedStartOffsets, expectedEndOffsets, expectedPosIncrements);
  }

  /** Builds a token with the given term text, offsets and flags. */
  private static Token flagged(String term, int startOffset, int endOffset, int flags) {
    Token result = new Token(term, startOffset, endOffset);
    result.setFlags(flags);
    return result;
  }
}

View File

@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
/**
 * This test just ensures the factory works, detailed tests in {@link TestDropIfFlaggedFilter}
 */
public class TestDropIfFlaggedFilterFactory extends BaseTokenStreamFactoryTestCase {

  /**
   * Configures the factory through its {@code dropFlags} argument and checks that the token
   * carrying the mask bit is removed.
   */
  public void testFactory() throws Exception {
    TokenStream stream = new CannedTokenStream(sampleTokens());
    // The argument key must be "dropFlags"; any other key would be ignored by the factory and the
    // default mask would silently be used instead.
    TokenFilterFactory tokenFilterFactory = tokenFilterFactory("dropIfFlagged", "dropFlags", "2");
    stream = tokenFilterFactory.create(stream);
    assertTokenStreamContents(
        stream, new String[] {"foo"}, null, null, new String[] {"word"}, new int[] {1});
  }

  /**
   * Uses a mask that differs from the factory default so the test fails if the configured value
   * is not actually read: neither sample token has bit 100 set, so both must be retained (the
   * default mask of 2 would have dropped "bar").
   */
  public void testNonDefaultDropFlags() throws Exception {
    TokenStream stream = new CannedTokenStream(sampleTokens());
    TokenFilterFactory tokenFilterFactory = tokenFilterFactory("dropIfFlagged", "dropFlags", "4");
    stream = tokenFilterFactory.create(stream);
    assertTokenStreamContents(
        stream,
        new String[] {"foo", "bar"},
        null,
        null,
        new String[] {"word", "word"},
        new int[] {1, 1});
  }

  /** Returns fresh input tokens: "foo" with flags 001 and "bar" with flags 011. */
  private static Token[] sampleTokens() {
    return new Token[] {token("foo", 1, 0, 2), token("bar", 3, 4, 6)};
  }

  /** Builds a token with the given term text, flags and start/end offsets. */
  private static Token token(String term, int flags, int soff, int eoff) {
    Token token = new Token();
    token.setEmpty();
    token.append(term);
    token.setFlags(flags);
    token.setOffset(soff, eoff);
    return token;
  }
}