Mirror of https://github.com/apache/lucene.git, synced 2025-02-28 21:39:25 +00:00
LUCENE-9575 Add PatternTypingFilter to annotate tokens with flags and types (#1995)
LUCENE-9575 Add PatternTypingFilter
parent 9d4811e02f
commit c087f6f8c0
PatternTypingFilter.java
@@ -0,0 +1,95 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.pattern;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Set a type attribute to a parameterized value when tokens are matched by any of several regex patterns. The
 * value set in the type attribute is parameterized with the match groups of the regex used for matching.
 * In combination with TypeAsSynonymFilter and DropIfFlagged filter this can supply complex synonym patterns
 * that are protected from subsequent analysis, and optionally drop the original term based on the flag
 * set in this filter. See {@link PatternTypingFilterFactory} for full documentation.
 *
 * @see PatternTypingFilterFactory
 * @since 8.8.0
 */
public class PatternTypingFilter extends TokenFilter {

  private final PatternTypingRule[] replacementAndFlagByPattern;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final FlagsAttribute flagAtt = addAttribute(FlagsAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  public PatternTypingFilter(TokenStream input, PatternTypingRule... replacementAndFlagByPattern) {
    super(input);
    this.replacementAndFlagByPattern = replacementAndFlagByPattern;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      for (PatternTypingRule rule : replacementAndFlagByPattern) {
        Matcher matcher = rule.getPattern().matcher(termAtt);
        if (matcher.find()) {
          // allow the 2nd reset() and find() that occur inside replaceFirst to avoid excess string creation
          typeAtt.setType(matcher.replaceFirst(rule.getTypeTemplate()));
          flagAtt.setFlags(rule.getFlags());
          return true;
        }
      }
      return true;
    }
    return false;
  }

  /**
   * Value holding class for pattern typing rules.
   */
  public static class PatternTypingRule {
    private final Pattern pattern;
    private final int flags;
    private final String typeTemplate;

    public PatternTypingRule(Pattern pattern, int flags, String typeTemplate) {
      this.pattern = pattern;
      this.flags = flags;
      this.typeTemplate = typeTemplate;
    }

    public Pattern getPattern() {
      return pattern;
    }

    public int getFlags() {
      return flags;
    }

    public String getTypeTemplate() {
      return typeTemplate;
    }
  }
}
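For context, a minimal standalone sketch (not part of this commit) of driving the new filter directly over a whitespace-tokenized string; WhitespaceTokenizer and the attribute classes are existing Lucene classes, and the sample text, pattern, and flags value simply mirror the 401(k) example in the factory's Javadoc:

import java.io.StringReader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.pattern.PatternTypingFilter;
import org.apache.lucene.analysis.pattern.PatternTypingFilter.PatternTypingRule;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class PatternTypingDemo {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("contribute to your 401(k) today"));

    // Flags value 3 sets the two lowest flag bits; the type template is filled in
    // from the regex match groups, so "401(k)" and "401k" both get type legal2_401_k.
    TokenStream ts = new PatternTypingFilter(tokenizer,
        new PatternTypingRule(Pattern.compile("^(\\d+)\\(?([a-z])\\)?$"), 3, "legal2_$1_$2"));

    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    FlagsAttribute flags = ts.addAttribute(FlagsAttribute.class);

    ts.reset();
    while (ts.incrementToken()) {
      // Tokens matching no rule keep the default type "word" and flags 0.
      System.out.println(term + " type=" + type.type() + " flags=" + flags.getFlags());
    }
    ts.end();
    ts.close();
  }
}

Tokens that match no rule pass through unchanged, which is what a downstream filter reacting to types or flags (e.g. TypeAsSynonymFilter) keys off of.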
PatternTypingFilterFactory.java
@@ -0,0 +1,118 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.pattern;

import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pattern.PatternTypingFilter.PatternTypingRule;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Provides a filter that marks tokens with a type (and flags) whenever they match one of the configured regex
 * patterns; the type is parameterized with the match groups of the matching pattern. By itself this
 * filter is not very useful. Normally it is combined with a filter that reacts to types or flags.
 *
 * <pre class="prettyprint">
 * <fieldType name="text_taf" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *     <filter class="com.example.PatternTypingFilter" patternFile="patterns.txt"/>
 *     <filter class="solr.TokenAnalyzerFilter" asType="text_en" preserveType="true"/>
 *     <filter class="solr.TypeAsSynonymFilterFactory" prefix="__TAS__"
 *             ignore="word,&lt;ALPHANUM&gt;,&lt;NUM&gt;,&lt;SOUTHEAST_ASIAN&gt;,&lt;IDEOGRAPHIC&gt;,&lt;HIRAGANA&gt;,&lt;KATAKANA&gt;,&lt;HANGUL&gt;,&lt;EMOJI&gt;"/>
 *   </analyzer>
 * </fieldType></pre>
 * <p>
 * Note that a configuration such as the one above may interfere with multi-word synonyms. The patterns file has the format:
 * <pre>
 * (flags) (pattern) ::: (replacement)
 * </pre>
 * Therefore, to set the first two flag bits on the original token matching 401k or 401(k), and to add a type of
 * 'legal2_401_k' whenever either one is encountered, one would use:
 * <pre>
 * 3 (\d+)\(?([a-z])\)? ::: legal2_$1_$2
 * </pre>
 * Note that the number indicating the flag bits to set must not have leading spaces, must be followed by a single
 * space, and must be 0 if no flags should be set. The flags number must not contain commas or a decimal point.
 * Lines whose first character is <code>#</code> are ignored as comments. This filter does not support producing
 * a synonym textually identical to the original term.
 *
 * @lucene.spi {@value #NAME}
 * @since 8.8
 */
public class PatternTypingFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {

  /**
   * SPI name
   */
  public static final String NAME = "patternTyping";

  private final String patternFile;
  private PatternTypingRule[] rules;

  /**
   * Creates a new PatternTypingFilterFactory
   */
  public PatternTypingFilterFactory(Map<String, String> args) {
    super(args);
    patternFile = require(args, "patternFile");
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  /**
   * Default ctor for compatibility with SPI
   */
  public PatternTypingFilterFactory() {
    throw defaultCtorException();
  }

  @Override
  public void inform(ResourceLoader loader) throws IOException {
    List<PatternTypingRule> ruleList = new ArrayList<>();
    List<String> lines = getLines(loader, patternFile);
    // format: (flags) (regex) ::: typename[_$1[_$2 ...]] (technically _$1 does not need the '_' but it usually makes sense)
    // eg: 2 (\d+)\(?([a-z])\)?\(?(\d+)\)? ::: legal3_$1_$2_$3
    // which yields legal3_501_c_3 for 501(c)(3) or 501c3 and sets the second-lowest bit in flags
    for (String line : lines) {
      int firstSpace = line.indexOf(" "); // no leading spaces allowed
      int flagsVal = Integer.parseInt(line.substring(0, firstSpace));
      line = line.substring(firstSpace + 1);
      String[] split = line.split(" ::: "); // arbitrary separator, unlikely to occur in a useful regex, and easy to read
      if (split.length != 2) {
        throw new RuntimeException("The PatternTypingFilter: Always two there are, no more, no less, a pattern and a replacement (separated by ' ::: ')");
      }
      Pattern compiled = Pattern.compile(split[0]);
      ruleList.add(new PatternTypingRule(compiled, flagsVal, split[1]));
    }
    this.rules = ruleList.toArray(new PatternTypingRule[0]);
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new PatternTypingFilter(input, rules);
  }
}
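To make the patterns-file format concrete, here is a small self-contained sketch (not part of this commit; the class name and sample tokens are illustrative) that parses one rule line the same way inform() does and applies it to a few tokens:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class PatternsLineDemo {
  public static void main(String[] args) {
    // One line from the Javadoc example: flags, a space, the regex, ' ::: ', the type template.
    String line = "3 (\\d+)\\(?([a-z])\\)? ::: legal2_$1_$2";

    // The flags number comes first, terminated by a single space (no leading spaces allowed).
    int firstSpace = line.indexOf(' ');
    int flags = Integer.parseInt(line.substring(0, firstSpace)); // 3 -> two lowest bits set

    // The remainder is "<regex> ::: <type template>".
    String[] split = line.substring(firstSpace + 1).split(" ::: ");
    Pattern pattern = Pattern.compile(split[0]);
    String typeTemplate = split[1];

    for (String token : new String[] {"401k", "401(k)", "sparkle"}) {
      Matcher m = pattern.matcher(token);
      if (m.find()) {
        // Matching tokens would get type legal2_401_k and flags 3.
        System.out.println(token + " -> " + m.replaceFirst(typeTemplate) + " flags=" + flags);
      } else {
        System.out.println(token + " -> no rule matched (type and flags left unchanged)");
      }
    }
  }
}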
META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@@ -97,6 +97,7 @@ org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory
org.apache.lucene.analysis.pattern.PatternTypingFilterFactory
org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory
org.apache.lucene.analysis.payloads.NumericPayloadTokenFilterFactory
org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilterFactory
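Once registered in this SPI file, the factory can be resolved by its NAME. A hedged sketch (not part of this commit; the class name and patternFile value are illustrative) of what that lookup might look like:

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenFilterFactory;

public class SpiLookupDemo {
  public static void main(String[] args) {
    Map<String, String> params = new HashMap<>();
    params.put("patternFile", "patterns.txt"); // hypothetical patterns resource

    // Resolves to PatternTypingFilterFactory via the service entry added above.
    TokenFilterFactory factory = TokenFilterFactory.forName("patternTyping", params);
    System.out.println(factory.getClass().getName());

    // Because the factory is ResourceLoaderAware, inform(ResourceLoader) must be called
    // with a loader that can read the patterns file before factory.create(tokenStream).
  }
}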
TestPatternTypingFilter.java
@@ -0,0 +1,77 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.pattern;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pattern.PatternTypingFilter.PatternTypingRule;

import java.io.IOException;
import java.util.regex.Pattern;

/**
 * Test that this filter sets a type for tokens matching patterns defined in a patterns.txt file
 */
public class TestPatternTypingFilter extends BaseTokenStreamTestCase {

  /**
   * Test the straightforward cases: tokens matching a rule's pattern get the parameterized type and the rule's flags.
   */
  public void testPatterns() throws Exception {

    Token tokenA1 = new Token("One", 0, 2);
    Token tokenA2 = new Token("401(k)", 4, 9);
    Token tokenA3 = new Token("two", 11, 13);
    Token tokenB1 = new Token("three", 15, 19);
    Token tokenB2 = new Token("401k", 21, 24);

    TokenStream ts = new CannedTokenStream(tokenA1, tokenA2, tokenA3, tokenB1, tokenB2);

    // 2 ^(\d+)\(?([a-z])\)?$ ::: legal2_$1_$2
    ts = new PatternTypingFilter(ts,
        new PatternTypingRule(Pattern.compile("^(\\d+)\\(?([a-z])\\)?$"), 2, "legal2_$1_$2"));

    assertTokenStreamContents(ts, new String[]{
        "One", "401(k)", "two", "three", "401k"}, null, null,
        new String[]{"word", "legal2_401_k", "word", "word", "legal2_401_k"},
        null, null, null, null, null, false, null,
        new int[]{0, 2, 0, 0, 2});
  }

  public void testFirstPatternWins() throws IOException {
    Token tokenA1 = new Token("One", 0, 2);
    Token tokenA3 = new Token("forty-two", 11, 13);
    Token tokenB1 = new Token("4-2", 15, 19);

    TokenStream ts = new CannedTokenStream(tokenA1, tokenA3, tokenB1);

    // 2 ^(\d+)\(?([a-z])\)?$ ::: legal2_$1_$2
    PatternTypingRule p1 = new PatternTypingRule(Pattern.compile("^(\\d+)-(\\d+)$"), 6, "$1_hnum_$2");
    PatternTypingRule p2 = new PatternTypingRule(Pattern.compile("^(\\w+)-(\\w+)$"), 2, "$1_hword_$2");

    ts = new PatternTypingFilter(ts, p1, p2); // 101

    assertTokenStreamContents(ts, new String[]{
        "One", "forty-two", "4-2"}, null, null,
        new String[]{"word", "forty_hword_two", "4_hnum_2"},
        null, null, null, null, null, false, null,
        new int[]{0, 2, 6});
  }

}
TestPatternTypingFilterFactory.java
@@ -0,0 +1,52 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.pattern;

import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.StringMockResourceLoader;
import org.apache.lucene.util.Version;

/**
 * This test just ensures the factory works
 */
public class TestPatternTypingFilterFactory extends BaseTokenStreamFactoryTestCase {

  public void testFactory() throws Exception {
    Token tokenA1 = new Token("One", 0, 2);
    Token tokenA3 = new Token("forty-two", 11, 13);
    Token tokenB1 = new Token("4-2", 15, 19);

    TokenStream ts = new CannedTokenStream(tokenA1, tokenA3, tokenB1);

    TokenFilterFactory tokenFilterFactory = tokenFilterFactory("patternTyping", Version.LATEST, new StringMockResourceLoader(
        "6 \\b(\\d+)-(\\d+) ::: $1_hnum_$2\n" +
        "2 \\b(\\w+)-(\\w+) ::: $1_hword_$2"
        ), "patternFile", "patterns.txt");

    ts = tokenFilterFactory.create(ts);
    assertTokenStreamContents(ts, new String[]{
        "One", "forty-two", "4-2"}, null, null,
        new String[]{"word", "forty_hword_two", "4_hnum_2"},
        null, null, null, null, null, false, null,
        new int[]{0, 2, 6});
  }
}