mirror of https://github.com/apache/lucene.git
LUCENE-8273: Rename TermExclusionFilter -> ProtectedTermFilter. Allow ProtectedTermFilterFactory to be used outside of CustomAnalyzer, including in Solr, by allowing wrapped filters and their parameters to be specified on construction. Add tests for ProtectedTermFilterFactory in lucene/common/analysis/ and in solr/core/. Add Solr ref guide documentation for ProtectedTermFilterFactory. Improve javadocs for CustomAnalyzer, ConditionalTokenFilter, and ProtectedTermFilter.
Commit d91273ddf0 (parent f506bc9cb7)

@@ -183,8 +183,8 @@ New Features
   filters based on the attributes of the current token. This generalises the keyword
   token logic currently used for stemmers and WDF. It is integrated into
   CustomAnalyzer by using the `when` and `whenTerm` builder methods, and a new
-  TermExclusionConditionalFilter is added as an example. (Alan Woodward,
-  Robert Muir, David Smiley, Steve Rowe, Mike Sokolov)
+  ProtectedTermFilter is added as an example. (Alan Woodward, Robert Muir,
+  David Smiley, Steve Rowe, Mike Sokolov)
 
 * LUCENE-8310: Ensure IndexFileDeleter accounts for pending deletes. Today we fail
   creating the IndexWriter when the directory has a pending delete. Yet, this
@@ -87,7 +87,7 @@ import static org.apache.lucene.analysis.util.AnalysisSPILoader.newFactoryClassI
  *   .whenTerm(t -> t.length() > 10)
  *     .addTokenFilter("reversestring")
  *   .endwhen()
- *   .build()
+ *   .build();
  * </pre>
  */
 public final class CustomAnalyzer extends Analyzer {
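For context, the builder chain shown in this javadoc looks like the following when assembled into a compilable unit. This is an illustrative sketch only: the surrounding class, method, and the "standard" tokenizer choice are assumptions, not part of the diff.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

public class ConditionalBuilderSketch {
  public static Analyzer build() throws IOException {
    // Tokens longer than 10 characters are reversed; shorter tokens pass through unchanged.
    return CustomAnalyzer.builder()
        .withTokenizer("standard")
        .whenTerm(t -> t.length() > 10)
          .addTokenFilter("reversestring")
        .endwhen()
        .build();
  }
}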
@@ -29,10 +29,10 @@ import org.apache.lucene.util.AttributeSource;
 /**
  * Allows skipping TokenFilters based on the current set of attributes.
  *
- * To use, implement the {@link #shouldFilter()} method. If it returns {@code false},
- * then calling {@link #incrementToken()} will use the wrapped TokenFilter to
- * make changes to the tokenstream. If it returns {@code true}, then the wrapped
- * filter will be skipped
+ * To use, implement the {@link #shouldFilter()} method. If it returns {@code true},
+ * then calling {@link #incrementToken()} will use the wrapped TokenFilter(s) to
+ * make changes to the tokenstream. If it returns {@code false}, then the wrapped
+ * filter(s) will be skipped.
  */
 public abstract class ConditionalTokenFilter extends TokenFilter {
 
@@ -102,9 +102,9 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
   private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
 
   /**
-   * Create a new BypassingTokenFilter
+   * Create a new ConditionalTokenFilter
    * @param input the input TokenStream
-   * @param inputFactory a factory function to create a new instance of the TokenFilter to wrap
+   * @param inputFactory a factory function to create the wrapped filter(s)
    */
   protected ConditionalTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory) {
     super(input);
@@ -112,7 +112,7 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
   }
 
   /**
-   * Whether or not to execute the wrapped TokenFilter for the current token
+   * Whether or not to execute the wrapped TokenFilter(s) for the current token
    */
   protected abstract boolean shouldFilter() throws IOException;
 
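A minimal sketch of a subclass, to illustrate the corrected contract (returning true runs the wrapped filter(s), returning false bypasses them). The class name and the token-type condition are hypothetical, not part of this commit.

import java.io.IOException;
import java.util.function.Function;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

// Hypothetical subclass: the wrapped filter(s) only run on tokens whose type is "word".
public final class WordTypeOnlyFilter extends ConditionalTokenFilter {
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  public WordTypeOnlyFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory) {
    super(input, inputFactory);
  }

  @Override
  protected boolean shouldFilter() throws IOException {
    // true = run the wrapped filter(s) on this token; false = let it through untouched.
    return "word".equals(typeAtt.type());
  }
}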
@@ -25,28 +25,29 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
  * A ConditionalTokenFilter that only applies its wrapped filters to tokens that
- * are not contained in an exclusion set.
+ * are not contained in a protected set.
  */
-public class TermExclusionFilter extends ConditionalTokenFilter {
+public class ProtectedTermFilter extends ConditionalTokenFilter {
 
-  private final CharArraySet excludeTerms;
+  private final CharArraySet protectedTerms;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
   /**
-   * Creates a new TermExclusionFilter
-   * @param excludeTerms the set of terms to skip the wrapped filters for
+   * Creates a new ProtectedTermFilter
+   * @param protectedTerms the set of terms to skip the wrapped filters for
    * @param input the input TokenStream
-   * @param inputFactory a factory function to create the wrapped filters
+   * @param inputFactory a factory function to create the wrapped filter(s)
    */
-  public TermExclusionFilter(final CharArraySet excludeTerms, TokenStream input, Function<TokenStream, TokenStream> inputFactory) {
+  public ProtectedTermFilter(final CharArraySet protectedTerms, TokenStream input, Function<TokenStream, TokenStream> inputFactory) {
     super(input, inputFactory);
-    this.excludeTerms = excludeTerms;
+    this.protectedTerms = protectedTerms;
   }
 
   @Override
   protected boolean shouldFilter() {
-    return excludeTerms.contains(termAtt.buffer(), 0, termAtt.length()) == false;
+    boolean b = protectedTerms.contains(termAtt.buffer(), 0, termAtt.length());
+    return b == false;
   }
 
 }
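Direct usage mirrors the test code later in this commit. A sketch; the helper class and method are hypothetical:

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.ProtectedTermFilter;

public class ProtectedTermSketch {
  public static TokenStream lowercaseExceptProtected(TokenStream input) {
    CharArraySet protectedTerms = new CharArraySet(1, true); // true = ignore case
    protectedTerms.add("bob");
    // "Bob" is left intact; every other token passes through LowerCaseFilter.
    return new ProtectedTermFilter(protectedTerms, input, LowerCaseFilter::new);
  }
}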
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.function.Predicate; // javadocs
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for a {@link ProtectedTermFilter}
+ *
+ * <p>CustomAnalyzer example:
+ * <pre class="prettyprint">
+ * Analyzer ana = CustomAnalyzer.builder()
+ *   .withTokenizer("standard")
+ *   .when("protectedterm", "ignoreCase", "true", "protected", "protectedTerms.txt")
+ *     .addTokenFilter("truncate", "prefixLength", "4")
+ *     .addTokenFilter("lowercase")
+ *   .endwhen()
+ *   .build();
+ * </pre>
+ *
+ * <p>Solr example, in which conditional filters are specified via the <code>wrappedFilters</code>
+ * parameter - a comma-separated list of case-insensitive TokenFilter SPI names - and conditional
+ * filter args are specified via <code>filterName.argName</code> parameters:
+ * <pre class="prettyprint">
+ * <fieldType name="reverse_lower_with_exceptions" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.ProtectedTermFilterFactory" ignoreCase="true" protected="protectedTerms.txt"
+ *             wrappedFilters="truncate,lowercase" truncate.prefixLength="4" />
+ *   </analyzer>
+ * </fieldType></pre>
+ *
+ * <p>When using the <code>wrappedFilters</code> parameter, each filter name must be unique, so if you
+ * need to specify the same filter more than once, you must add case-insensitive unique '-id' suffixes
+ * (note that the '-id' suffix is stripped prior to SPI lookup), e.g.:
+ * <pre class="prettyprint">
+ * <fieldType name="double_synonym_with_exceptions" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.ProtectedTermFilterFactory" ignoreCase="true" protected="protectedTerms.txt"
+ *             wrappedFilters="synonymgraph-A,synonymgraph-B"
+ *             synonymgraph-A.synonyms="synonyms-1.txt"
+ *             synonymgraph-B.synonyms="synonyms-2.txt"/>
+ *   </analyzer>
+ * </fieldType></pre>
+ *
+ * <p>See related {@link org.apache.lucene.analysis.custom.CustomAnalyzer.Builder#whenTerm(Predicate)}
+ */
+public class ProtectedTermFilterFactory extends ConditionalTokenFilterFactory implements ResourceLoaderAware {
+
+  public static final String PROTECTED_TERMS = "protected";
+  public static final char FILTER_ARG_SEPARATOR = '.';
+  public static final char FILTER_NAME_ID_SEPARATOR = '-';
+
+  private final String termFiles;
+  private final boolean ignoreCase;
+  private final String wrappedFilters;
+
+  private CharArraySet protectedTerms;
+
+  public ProtectedTermFilterFactory(Map<String, String> args) {
+    super(args);
+    termFiles = require(args, PROTECTED_TERMS);
+    ignoreCase = getBoolean(args, "ignoreCase", false);
+    wrappedFilters = get(args, "wrappedFilters");
+    if (wrappedFilters != null) {
+      handleWrappedFilterArgs(args);
+    }
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  private void handleWrappedFilterArgs(Map<String, String> args) {
+    LinkedHashMap<String, Map<String, String>> wrappedFilterArgs = new LinkedHashMap<>();
+    splitAt(',', wrappedFilters).forEach(filterName -> {       // Format: SPIname[-id]
+      filterName = filterName.trim().toLowerCase(Locale.ROOT); // Treat case-insensitively
+      if (wrappedFilterArgs.containsKey(filterName)) {
+        throw new IllegalArgumentException("wrappedFilters contains duplicate '"
+            + filterName + "'. Add unique '-id' suffixes (stripped prior to SPI lookup).");
+      }
+      wrappedFilterArgs.put(filterName, new HashMap<>());
+    });
+    for (Iterator<Map.Entry<String, String>> iterator = args.entrySet().iterator(); iterator.hasNext(); ) {
+      Map.Entry<String, String> entry = iterator.next();
+      String filterArgKey = entry.getKey();
+      String argValue = entry.getValue();
+      List<String> splitKey = splitAt(FILTER_ARG_SEPARATOR, filterArgKey); // Format: filterName.argKey
+      if (splitKey.size() == 2) {                        // Skip if no separator
+        String filterName = splitKey.get(0).toLowerCase(Locale.ROOT);
+        if (wrappedFilterArgs.containsKey(filterName)) { // Skip if not in the "wrappedFilters" arg
+          Map<String, String> filterArgs = wrappedFilterArgs.computeIfAbsent(filterName, k -> new HashMap<>());
+          String argKey = splitKey.get(1);
+          filterArgs.put(argKey, argValue);              // argKey is guaranteed unique, don't need to check for duplicates
+          iterator.remove();
+        }
+      }
+    }
+    if (args.isEmpty()) {
+      populateInnerFilters(wrappedFilterArgs);
+    }
+  }
+
+  private void populateInnerFilters(LinkedHashMap<String, Map<String, String>> wrappedFilterArgs) {
+    List<TokenFilterFactory> innerFilters = new ArrayList<>();
+    wrappedFilterArgs.forEach((filterName, filterArgs) -> {
+      int idSuffixPos = filterName.indexOf(FILTER_NAME_ID_SEPARATOR); // Format: SPIname[-id]
+      if (idSuffixPos != -1) { // Strip '-id' suffix, if any, prior to SPI lookup
+        filterName = filterName.substring(0, idSuffixPos);
+      }
+      innerFilters.add(TokenFilterFactory.forName(filterName, filterArgs));
+    });
+    setInnerFilters(innerFilters);
+  }
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
+
+  public CharArraySet getProtectedTerms() {
+    return protectedTerms;
+  }
+
+  @Override
+  protected ConditionalTokenFilter create(TokenStream input, Function<TokenStream, TokenStream> inner) {
+    return new ProtectedTermFilter(protectedTerms, input, inner);
+  }
+
+  @Override
+  public void doInform(ResourceLoader loader) throws IOException {
+    protectedTerms = getWordSet(loader, termFiles, ignoreCase);
+  }
+}
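Outside of a Solr schema, the factory can also be constructed directly from an args map, much as the Solr test later in this commit does. A sketch under assumptions: the arg values are illustrative, the helper class is hypothetical, and a ResourceLoader that can resolve protectedTerms.txt is assumed to be supplied by the caller.

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ProtectedTermFilterFactory;
import org.apache.lucene.analysis.util.ResourceLoader;

public class FactoryUsageSketch {
  public static TokenStream wrap(TokenStream input, ResourceLoader loader) throws Exception {
    Map<String, String> args = new HashMap<>();
    args.put("protected", "protectedTerms.txt");      // consumed by require()
    args.put("ignoreCase", "true");
    args.put("wrappedFilters", "truncate,lowercase"); // SPI names, treated case-insensitively
    args.put("truncate.prefixLength", "4");           // routed to the wrapped "truncate" filter
    ProtectedTermFilterFactory factory = new ProtectedTermFilterFactory(args);
    factory.inform(loader);                           // loads the protected-term set
    return factory.create(input);
  }
}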
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.miscellaneous;
-
-import java.io.IOException;
-import java.util.Map;
-import java.util.function.Function;
-
-import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.ResourceLoader;
-
-/**
- * Factory for a {@link TermExclusionFilter}
- */
-public class TermExclusionFilterFactory extends ConditionalTokenFilterFactory {
-
-  public static final String EXCLUDED_TOKENS = "protected";
-
-  private final String wordFiles;
-  private final boolean ignoreCase;
-
-  private CharArraySet excludeTerms;
-
-  public TermExclusionFilterFactory(Map<String, String> args) {
-    super(args);
-    wordFiles = get(args, EXCLUDED_TOKENS);
-    ignoreCase = getBoolean(args, "ignoreCase", false);
-    if (!args.isEmpty()) {
-      throw new IllegalArgumentException("Unknown parameters: " + args);
-    }
-  }
-
-  @Override
-  protected ConditionalTokenFilter create(TokenStream input, Function<TokenStream, TokenStream> inner) {
-    return new TermExclusionFilter(excludeTerms, input, inner);
-  }
-
-  @Override
-  public void doInform(ResourceLoader loader) throws IOException {
-    excludeTerms = getWordSet(loader, wordFiles, ignoreCase);
-  }
-}
@@ -290,12 +290,24 @@ public abstract class AbstractAnalysisFactory {
    * @return a list of file names with the escaping backslashed removed
    */
   protected final List<String> splitFileNames(String fileNames) {
-    if (fileNames == null)
-      return Collections.<String>emptyList();
+    return splitAt(',', fileNames);
+  }
+
+  /**
+   * Splits a list separated by zero or more given separator characters.
+   * List items can contain separator characters escaped by backslash '\'.
+   * Whitespace is NOT trimmed from the returned list items.
+   *
+   * @param list the string containing the split list items
+   * @return a list of items with the escaping backslashes removed
+   */
+  protected final List<String> splitAt(char separator, String list) {
+    if (list == null)
+      return Collections.emptyList();
 
     List<String> result = new ArrayList<>();
-    for (String file : fileNames.split("(?<!\\\\),")) {
-      result.add(file.replaceAll("\\\\(?=,)", ""));
+    for (String item : list.split("(?<!\\\\)[" + separator + "]")) {
+      result.add(item.replaceAll("\\\\(?=[" + separator + "])", ""));
     }
 
     return result;
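To make the escaping behavior of the new splitAt() concrete, here is a standalone re-implementation of the same two regexes. This is a sketch for illustration only; the real method is a protected member of AbstractAnalysisFactory, so the demo class below is hypothetical.

import java.util.ArrayList;
import java.util.List;

public class SplitAtDemo {
  // Same logic as the new splitAt(): split on unescaped separators,
  // then drop the escaping backslash before any remaining separator.
  static List<String> splitAt(char separator, String list) {
    List<String> result = new ArrayList<>();
    for (String item : list.split("(?<!\\\\)[" + separator + "]")) {
      result.add(item.replaceAll("\\\\(?=[" + separator + "])", ""));
    }
    return result;
  }

  public static void main(String[] args) {
    // Escaped separators stay inside the item; the escaping backslash is removed.
    System.out.println(splitAt(',', "a,b\\,c,d")); // [a, b,c, d]
    // Whitespace is NOT trimmed from the returned items.
    System.out.println(splitAt(',', "one, two"));  // [one,  two]
  }
}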
@@ -78,7 +78,7 @@ org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilterFactory
 org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
 org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
 org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
-org.apache.lucene.analysis.miscellaneous.TermExclusionFilterFactory
+org.apache.lucene.analysis.miscellaneous.ProtectedTermFilterFactory
 org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
 org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
 org.apache.lucene.analysis.miscellaneous.TypeAsSynonymFilterFactory
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+foo
+bar
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+golden
+compote
@@ -520,7 +520,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
     CustomAnalyzer analyzer = CustomAnalyzer.builder()
         .withTokenizer("whitespace")
         .addTokenFilter("lowercase")
-        .when("termexclusion", "protected", "org/apache/lucene/analysis/custom/teststop.txt")
+        .when("protectedterm", "protected", "org/apache/lucene/analysis/custom/teststop.txt")
         .addTokenFilter("reversestring")
         .endwhen()
         .build();
@@ -248,14 +248,14 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
 
   public void testReadaheadWithFiltering() throws IOException {
 
-    CharArraySet exclusions = new CharArraySet(2, true);
-    exclusions.add("three");
+    CharArraySet protectedTerms = new CharArraySet(2, true);
+    protectedTerms.add("three");
 
     Analyzer analyzer = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer source = new ClassicTokenizer();
-        TokenStream sink = new TermExclusionFilter(exclusions, source, in -> new ShingleFilter(in, 2));
+        TokenStream sink = new ProtectedTermFilter(protectedTerms, source, in -> new ShingleFilter(in, 2));
         return new TokenStreamComponents(source, sink);
       }
     };
@@ -287,15 +287,15 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
 
   public void testFilteredTokenFilters() throws IOException {
 
-    CharArraySet exclusions = new CharArraySet(2, true);
-    exclusions.add("foobar");
+    CharArraySet protectedTerms = new CharArraySet(2, true);
+    protectedTerms.add("foobar");
 
     TokenStream ts = whitespaceMockTokenizer("wuthering foobar abc");
-    ts = new TermExclusionFilter(exclusions, ts, in -> new LengthFilter(in, 1, 4));
+    ts = new ProtectedTermFilter(protectedTerms, ts, in -> new LengthFilter(in, 1, 4));
     assertTokenStreamContents(ts, new String[]{ "foobar", "abc" });
 
     ts = whitespaceMockTokenizer("foobar abc");
-    ts = new TermExclusionFilter(exclusions, ts, in -> new LengthFilter(in, 1, 4));
+    ts = new ProtectedTermFilter(protectedTerms, ts, in -> new LengthFilter(in, 1, 4));
     assertTokenStreamContents(ts, new String[]{ "foobar", "abc" });
 
   }
@@ -26,9 +26,9 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 
-public class TestTermExclusionFilter extends BaseTokenStreamTestCase {
+public class TestProtectedTermFilter extends BaseTokenStreamTestCase {
 
-  public void testExcludeTerms() throws IOException {
+  public void testBasic() throws IOException {
 
     CannedTokenStream cts = new CannedTokenStream(
         new Token("Alice", 1, 0, 5),
@@ -37,10 +37,10 @@ public class TestTermExclusionFilter extends BaseTokenStreamTestCase {
         new Token("David", 1, 16, 21)
     );
 
-    CharArraySet exclusions = new CharArraySet(5, true);
-    exclusions.add("bob");
+    CharArraySet protectedTerms = new CharArraySet(5, true);
+    protectedTerms.add("bob");
 
-    TokenStream ts = new TermExclusionFilter(exclusions, cts, LowerCaseFilter::new);
+    TokenStream ts = new ProtectedTermFilter(protectedTerms, cts, LowerCaseFilter::new);
     assertTokenStreamContents(ts, new String[]{ "alice", "Bob", "clara", "david" });
 
   }
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+/**
+ * Simple tests to ensure the protected term filter factory is working.
+ */
+public class TestProtectedTermFilterFactory extends BaseTokenStreamFactoryTestCase {
+  public void testInform() throws Exception {
+    ProtectedTermFilterFactory factory = (ProtectedTermFilterFactory)tokenFilterFactory("ProtectedTerm",
+        "protected", "protected-1.txt", "ignoreCase", "true", "wrappedFilters", "lowercase");
+    CharArraySet protectedTerms = factory.getProtectedTerms();
+    assertTrue("protectedTerms is null and it shouldn't be", protectedTerms != null);
+    assertTrue("protectedTerms Size: " + protectedTerms.size() + " is not: " + 2, protectedTerms.size() == 2);
+    assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
+
+    factory = (ProtectedTermFilterFactory)tokenFilterFactory("ProtectedTerm",
+        "protected", "protected-1.txt, protected-2.txt", "ignoreCase", "true", "wrappedFilters", "lowercase");
+    protectedTerms = factory.getProtectedTerms();
+    assertTrue("protectedTerms is null and it shouldn't be", protectedTerms != null);
+    assertTrue("protectedTerms Size: " + protectedTerms.size() + " is not: " + 4, protectedTerms.size() == 4);
+    assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
+
+    // defaults
+    factory = (ProtectedTermFilterFactory)tokenFilterFactory("ProtectedTerm",
+        "protected", "protected-1.txt");
+    assertEquals(false, factory.isIgnoreCase());
+  }
+
+  public void testBasic() throws Exception {
+    String str = "Foo Clara Bar David";
+    TokenStream stream = whitespaceMockTokenizer(str);
+    stream = tokenFilterFactory("ProtectedTerm", "ignoreCase", "true",
+        "protected", "protected-1.txt", "wrappedFilters", "lowercase").create(stream);
+    assertTokenStreamContents(stream, new String[]{"Foo", "clara", "Bar", "david"});
+  }
+
+  public void testMultipleWrappedFiltersWithParams() throws Exception {
+    String str = "Foo Clara Bar David";
+    TokenStream stream = whitespaceMockTokenizer(str);
+    stream = tokenFilterFactory("ProtectedTerm", "ignoreCase", "true",
+        "protected", "protected-1.txt", "wrappedFilters", "lowercase, truncate",
+        "truncate.prefixLength", "2").create(stream);
+    assertTokenStreamContents(stream, new String[]{"Foo", "cl", "Bar", "da"});
+  }
+
+  public void testMultipleSameNamedFiltersWithParams() throws Exception {
+    String str = "Foo Clara Bar David";
+    TokenStream stream = whitespaceMockTokenizer(str);
+    stream = tokenFilterFactory("ProtectedTerm", "ignoreCase", "true",
+        "protected", "protected-1.txt", "wrappedFilters", "truncate-A, reversestring, truncate-B",
+        "truncate-A.prefixLength", "3", "truncate-B.prefixLength", "2").create(stream);
+    assertTokenStreamContents(stream, new String[]{"Foo", "al", "Bar", "va"});
+
+    // same-named wrapped filters, one with an ID and another without
+    stream = whitespaceMockTokenizer(str);
+    stream = tokenFilterFactory("ProtectedTerm", "ignoreCase", "true",
+        "protected", "protected-1.txt", "wrappedFilters", "truncate, reversestring, truncate-A",
+        "truncate.prefixLength", "3", "truncate-A.prefixLength", "2").create(stream);
+    assertTokenStreamContents(stream, new String[]{"Foo", "al", "Bar", "va"});
+
+    // Case-insensitive wrapped "filter-id"
+    stream = whitespaceMockTokenizer(str);
+    stream = tokenFilterFactory("ProtectedTerm", "ignoreCase", "true",
+        "protected", "protected-1.txt", "wrappedFilters", "TRUNCATE-a, reversestring, truncate-b",
+        "truncate-A.prefixLength", "3", "TRUNCATE-B.prefixLength", "2").create(stream);
+    assertTokenStreamContents(stream, new String[]{"Foo", "al", "Bar", "va"});
+  }
+
+  /** Test that bogus arguments result in exception */
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () ->
+        tokenFilterFactory("ProtectedTerm", "protected", "protected-1.txt", "bogusArg", "bogusValue"));
+    assertTrue(exception.getMessage().contains("Unknown parameters"));
+
+    // same-named wrapped filters
+    exception = expectThrows(IllegalArgumentException.class, () ->
+        tokenFilterFactory("ProtectedTerm",
+            "protected", "protected-1.txt", "wrappedFilters", "truncate, truncate"));
+    assertTrue(exception.getMessage().contains("wrappedFilters contains duplicate"));
+
+    // case-insensitive same-named wrapped filters
+    exception = expectThrows(IllegalArgumentException.class, () ->
+        tokenFilterFactory("ProtectedTerm",
+            "protected", "protected-1.txt", "wrappedFilters", "TRUNCATE, truncate"));
+    assertTrue(exception.getMessage().contains("wrappedFilters contains duplicate"));
+
+    // case-insensitive same-named wrapped filter IDs
+    exception = expectThrows(IllegalArgumentException.class, () ->
+        tokenFilterFactory("ProtectedTerm",
+            "protected", "protected-1.txt", "wrappedFilters", "truncate-ABC, truncate-abc"));
+    assertTrue(exception.getMessage().contains("wrappedFilters contains duplicate"));
+
+    // mismatched wrapped filter and associated args
+    exception = expectThrows(IllegalArgumentException.class, () ->
+        tokenFilterFactory("ProtectedTerm",
+            "protected", "protected-1.txt", "wrappedFilters", "truncate-A, reversestring, truncate-B",
+            "truncate.prefixLength", "3", "truncate-A.prefixLength", "2"));
+    assertTrue(exception.getMessage().contains("Unknown parameters: {truncate.prefixLength=3}"));
+
+    // missing required arg(s) for wrapped filter
+    String str = "Foo Clara Bar David";
+    TokenStream stream = whitespaceMockTokenizer(str);
+    exception = expectThrows(IllegalArgumentException.class, () ->
+        tokenFilterFactory("ProtectedTerm",
+            "protected", "protected-1.txt", "wrappedFilters", "length").create(stream));
+    assertTrue(exception.getMessage().contains("Configuration Error: missing parameter"));
+  }
+}
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+foobar
+jaxfopbuz
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+golden
+compote
@@ -0,0 +1,86 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- The Solr schema file, version 1.6 -->
+
+<schema name="schema-protected-term" version="1.6">
+  <fieldType name="string" class="solr.StrField" sortMissingLast="true"/>
+  <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
+  <field name="signatureField" type="string" indexed="true" stored="false"/>
+  <dynamicField name="*_sS" type="string" indexed="false" stored="true"/>
+
+  <fieldType name="long" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
+  <field name="_version_" type="long" indexed="false" stored="false" docValues="true"/>
+
+  <fieldType name="prefix4_lower_with_exceptions" class="solr.TextField">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.ProtectedTermFilterFactory" ignoreCase="true"
+              protected="protected-1.txt,protected-2.txt"
+              wrappedFilters="truncate,lowercase" truncate.prefixLength="4"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+    </analyzer>
+  </fieldType>
+  <field name="prefix4_lower" type="prefix4_lower_with_exceptions" indexed="true" stored="true" multiValued="true"/>
+
+  <fieldType name="prefix3_rev_prefix2_with_exceptions" class="solr.TextField">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.ProtectedTermFilterFactory" ignoreCase="true"
+              protected="protected-1.txt,protected-2.txt"
+              wrappedFilters="truncate-A, reversestring, truncate-B"
+              truncate-A.prefixLength="3" truncate-B.prefixLength="2"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+    </analyzer>
+  </fieldType>
+  <field name="prefix3_rev_prefix2" type="prefix3_rev_prefix2_with_exceptions" indexed="true" stored="true" multiValued="true"/>
+
+  <fieldType name="prefix3_rev_prefix2_mixed_IDs_with_exceptions" class="solr.TextField">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.ProtectedTermFilterFactory" ignoreCase="true"
+              protected="protected-1.txt,protected-2.txt"
+              wrappedFilters="truncate, reversestring, truncate-A"
+              truncate.prefixLength="3" truncate-A.prefixLength="2"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+    </analyzer>
+  </fieldType>
+  <field name="prefix3_rev_prefix2_mixed_IDs" type="prefix3_rev_prefix2_mixed_IDs_with_exceptions" indexed="true" stored="true" multiValued="true"/>
+
+  <fieldType name="prefix3_rev_prefix2_mixed_case_with_exceptions" class="solr.TextField">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.ProtectedTermFilterFactory" ignoreCase="true"
+              protected="protected-1.txt,protected-2.txt"
+              wrappedFilters="TRUNCATE-a, reversestring, truncate-b"
+              truncate-A.prefixLength="3" TRUNCATE-B.prefixLength="2"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+    </analyzer>
+  </fieldType>
+  <field name="prefix3_rev_prefix2_mixed_case" type="prefix3_rev_prefix2_mixed_case_with_exceptions" indexed="true" stored="true" multiValued="true"/>
+
+  <uniqueKey>id</uniqueKey>
+</schema>
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.ProtectedTermFilterFactory;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.core.SolrResourceLoader;
+import org.junit.BeforeClass;
+
+public class ProtectedTermFilterFactoryTest extends SolrTestCaseJ4 {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig.xml","schema-protected-term.xml");
+  }
+
+  public void testBasic() throws Exception {
+    String text = "Wuthering FooBar distant goldeN ABC compote";
+    Map<String,String> args = new HashMap<>();
+    args.put("ignoreCase", "true");
+    args.put("protected", "protected-1.txt,protected-2.txt"); // Protected: foobar, jaxfopbuz, golden, compote
+    args.put("wrappedFilters", "lowercase");
+
+    ResourceLoader loader = new SolrResourceLoader(TEST_PATH().resolve("collection1"));
+    ProtectedTermFilterFactory factory = new ProtectedTermFilterFactory(args);
+    factory.inform(loader);
+
+    TokenStream ts = factory.create(whitespaceMockTokenizer(text));
+    BaseTokenStreamTestCase.assertTokenStreamContents(ts,
+        new String[] { "wuthering", "FooBar", "distant", "goldeN", "abc", "compote" });
+  }
+
+  public void testTwoWrappedFilters() {
+    // Index-time: Filters: truncate:4 & lowercase.  Protected (ignoreCase:true): foobar, jaxfopbuz, golden, compote
+    // Query-time: No filters
+    assertU(adoc("id", "1", "prefix4_lower", "Wuthering FooBar distant goldeN ABC compote"));
+    assertU(commit());
+
+    assertQ(req("prefix4_lower:(+wuth +FooBar +dist +goldeN +abc +compote)")
+        , "//result[@numFound=1]"
+    );
+  }
+
+  public void testDuplicateFilters() {
+    // Index-time: Filters: truncate:3 & reversestring & truncate:2.  Protected (ignoreCase:true): foobar, jaxfopbuz, golden, compote
+    // Query-time: No filters
+    assertU(adoc("id", "1",
+        "prefix3_rev_prefix2", "Wuthering FooBar distant goldeN ABC compote",
+        "prefix3_rev_prefix2_mixed_IDs", "Wuthering FooBar distant goldeN ABC compote",
+        "prefix3_rev_prefix2_mixed_case", "Wuthering FooBar distant goldeN ABC compote"));
+    assertU(commit());
+
+    assertQ(req("prefix3_rev_prefix2:(+tu +FooBar +si +goldeN +CB +compote)")
+        , "//result[@numFound=1]"
+    );
+    assertQ(req("prefix3_rev_prefix2_mixed_IDs:(+tu +FooBar +si +goldeN +CB +compote)")
+        , "//result[@numFound=1]"
+    );
+    assertQ(req("prefix3_rev_prefix2_mixed_case:(+tu +FooBar +si +goldeN +CB +compote)")
+        , "//result[@numFound=1]"
+    );
+  }
+}
@@ -1112,6 +1112,53 @@ This filter applies the Porter Stemming Algorithm for English. The results are s
 *Out:* "jump", "jump", "jump"
 
+== Protected Term Filter
+
+This filter enables a form of conditional filtering: it only applies its wrapped filters to terms that are *not contained* in a protected set.
+
+*Factory class:* `solr.ProtectedTermFilterFactory`
+
+*Arguments:*
+
+`protected`:: (required) Comma-separated list of files containing protected terms, one per line.
+
+`wrappedFilters`:: (required) Case-insensitive comma-separated list of `TokenFilterFactory` SPI names (strip trailing `(Token)FilterFactory` from the factory name - see the https://docs.oracle.com/javase/8/docs/api/java/util/ServiceLoader.html[java.util.ServiceLoader interface]). Each filter name must be unique, so if you need to specify the same filter more than once, you must add case-insensitive unique `-id` suffixes to each same-SPI-named filter (note that the `-id` suffix is stripped prior to SPI lookup).
+
+`ignoreCase`:: (true/false, default false) Ignore case when testing for protected words. If true, the protected list should contain lowercase words.
+
+*Example:*
+
+All terms except those in `protectedTerms.txt` are truncated at 4 characters and lowercased:
+
+[source,xml]
+----
+<analyzer>
+  <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+  <filter class="solr.ProtectedTermFilterFactory"
+          ignoreCase="true" protected="protectedTerms.txt"
+          wrappedFilters="truncate,lowercase"
+          truncate.prefixLength="4"/>
+</analyzer>
+----
+
+*Example:*
+
+This example includes multiple same-named wrapped filters with unique `-id` suffixes. Note that both the filter SPI names and `-id` suffixes are treated case-insensitively.
+
+For all terms except those in `protectedTerms.txt`, synonyms are added, terms are reversed, and then synonyms are added for the reversed terms:
+
+[source,xml]
+----
+<analyzer type="query">
+  <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+  <filter class="solr.ProtectedTermFilterFactory"
+          ignoreCase="true" protected="protectedTerms.txt"
+          wrappedFilters="SynonymGraph-fwd,ReverseString,SynonymGraph-rev"
+          synonymgraph-FWD.synonyms="fwd-syns.txt"
+          synonymgraph-REV.synonyms="rev-syns.txt"/>
+</analyzer>
+----
+
 == Remove Duplicates Token Filter
 
 The filter removes duplicate tokens in the stream. Tokens are considered to be duplicates ONLY if they have the same text and position values.