LUCENE-8265: WordDelimiter*Filter ignores keywords

Michael Sokolov 2018-04-22 20:41:08 +00:00 committed by Mike McCandless
parent 4fba55c864
commit fc0878cc2f
4 changed files with 90 additions and 16 deletions
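
In short: both WordDelimiterFilter and WordDelimiterGraphFilter gain an IGNORE_KEYWORDS flag (512). When it is set, tokens already marked with KeywordAttribute#isKeyword()=true are passed through unchanged instead of being split. A minimal sketch of the new behavior, assuming a SetKeywordMarkerFilter ahead of the delimiter filter; the class name, keyword set, and sample text are illustrative, not part of this commit:

import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IgnoreKeywordsDemo {
  public static void main(String[] args) throws Exception {
    // Terms in this set get KeywordAttribute#isKeyword()=true; with
    // IGNORE_KEYWORDS set, the delimiter filter passes them through untouched.
    CharArraySet keywords = new CharArraySet(Arrays.asList("wi-fi"), true);
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        TokenStream ts = new SetKeywordMarkerFilter(tokenizer, keywords);
        ts = new WordDelimiterGraphFilter(ts,
            WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.IGNORE_KEYWORDS,
            null);
        return new TokenStreamComponents(tokenizer, ts);
      }
    };
    try (TokenStream ts = analyzer.tokenStream("field", "wi-fi router")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // "wi-fi", then "router"
      }
      ts.end();
    }
    analyzer.close();
  }
}

Without IGNORE_KEYWORDS, the same chain would emit "wi", "fi", "router".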

lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java

@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
@@ -164,7 +165,12 @@ public final class WordDelimiterFilter extends TokenFilter {
    * "O'Neil's" => "O", "Neil"
    */
   public static final int STEM_ENGLISH_POSSESSIVE = 256;
+
+  /**
+   * Suppresses processing of terms with {@link KeywordAttribute#isKeyword()}=true.
+   */
+  public static final int IGNORE_KEYWORDS = 512;
 
   /**
    * If not null is the set of tokens to protect from being delimited
    *
@@ -174,6 +180,7 @@ public final class WordDelimiterFilter extends TokenFilter {
   private final int flags;
   private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
   private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
   private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
@@ -243,7 +250,9 @@ public final class WordDelimiterFilter extends TokenFilter {
         if (!input.incrementToken()) {
           return false;
         }
+        if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) {
+          return true;
+        }
         int termLength = termAttribute.length();
         char[] termBuffer = termAttribute.buffer();

lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
@@ -39,7 +40,7 @@ import org.apache.lucene.util.RamUsageEstimator;
  * work correctly when this filter is used in the search-time analyzer. Unlike
  * the deprecated {@link WordDelimiterFilter}, this token filter produces a
  * correct token graph as output. However, it cannot consume an input token
- * graph correctly.
+ * graph correctly. Processing of terms with {@link KeywordAttribute#isKeyword()}=true is suppressed when the {@link #IGNORE_KEYWORDS} flag is set.
  *
  * <p>
  * Words are split into subwords with the following rules:
@@ -156,7 +157,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
    * "O'Neil's" =&gt; "O", "Neil"
    */
   public static final int STEM_ENGLISH_POSSESSIVE = 256;
+
+  /**
+   * Suppresses processing of terms with {@link KeywordAttribute#isKeyword()}=true.
+   */
+  public static final int IGNORE_KEYWORDS = 512;
 
   /**
    * If not null is the set of tokens to protect from being delimited
    *
@@ -174,6 +180,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
   private char[][] bufferedTermParts = new char[4][];
   private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
   private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
   private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
@@ -225,7 +232,8 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
                    PRESERVE_ORIGINAL |
                    SPLIT_ON_CASE_CHANGE |
                    SPLIT_ON_NUMERICS |
-                   STEM_ENGLISH_POSSESSIVE)) != 0) {
+                   STEM_ENGLISH_POSSESSIVE |
+                   IGNORE_KEYWORDS)) != 0) {
       throw new IllegalArgumentException("flags contains unrecognized flag: " + configurationFlags);
     }
     this.flags = configurationFlags;
@@ -335,7 +343,9 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
       if (input.incrementToken() == false) {
         return false;
       }
+      if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) {
+        return true;
+      }
       int termLength = termAttribute.length();
       char[] termBuffer = termAttribute.buffer();

lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java

@@ -27,7 +27,6 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.util.IOUtils;
-import org.junit.Test;
 
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
@@ -57,7 +56,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
   }
   ***/
 
-  @Test
   public void testOffsets() throws IOException {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     // test that subwords and catenated subwords have
@@ -77,7 +75,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
         new int[] { 6, 6, 6 });
   }
 
-  @Test
   public void testOffsetChange() throws Exception {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -88,7 +85,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
         new int[] { 15 });
   }
 
-  @Test
   public void testOffsetChange2() throws Exception {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -99,7 +95,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
         new int[] { 17 });
   }
 
-  @Test
   public void testOffsetChange3() throws Exception {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -110,7 +105,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
         new int[] { 16 });
   }
 
-  @Test
   public void testOffsetChange4() throws Exception {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -129,7 +123,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
     assertTokenStreamContents(wdf, output);
   }
 
-  @Test
   public void testSplits() throws Exception {
     doSplit("basic-split","basic","split");
     doSplit("camelCase","camel","Case");
@@ -175,7 +168,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
   /*
    * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters.
    */
-  @Test
   public void testPossessives() throws Exception {
     doSplitPossessive(1, "ra's", "ra");
     doSplitPossessive(0, "ra's", "ra", "s");
@@ -204,7 +196,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
     }
   }
 
-  @Test
   public void testPositionIncrements() throws Exception {
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
@@ -323,6 +314,38 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
     IOUtils.close(a, a2, a3);
   }
 
+  public void testKeywordFilter() throws Exception {
+    assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS),
+        "abc-def klm-nop kpop",
+        new String[] {"abc", "def", "klm", "nop", "kpop"});
+    assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS),
+        "abc-def klm-nop kpop",
+        new String[] {"abc", "def", "klm-nop", "kpop"},
+        new int[]{0, 4, 8, 16},
+        new int[]{3, 7, 15, 20},
+        null,
+        new int[]{1, 1, 1, 1},
+        null,
+        false);
+  }
+
+  private Analyzer keywordTestAnalyzer(int flags) throws Exception {
+    return new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        KeywordMarkerFilter kFilter = new KeywordMarkerFilter(tokenizer) {
+          private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+          @Override public boolean isKeyword() {
+            // Marks terms starting with the letter 'k' as keywords
+            return term.toString().charAt(0) == 'k';
+          }
+        };
+        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(kFilter, flags, null));
+      }
+    };
+  }
+
   /** concat numbers + words + all */
   public void testLotsOfConcatenating() throws Exception {
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
@@ -346,7 +369,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
         false);
     a.close();
   }
 
   /** concat numbers + words + all + preserve original */
   public void testLotsOfConcatenating2() throws Exception {
     final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;

lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java

@@ -309,6 +309,38 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
     IOUtils.close(a, a2, a3);
   }
 
+  public void testKeywordFilter() throws Exception {
+    assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS),
+        "abc-def klm-nop kpop",
+        new String[] {"abc", "def", "klm", "nop", "kpop"});
+    assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS),
+        "abc-def klm-nop kpop",
+        new String[] {"abc", "def", "klm-nop", "kpop"},
+        new int[]{0, 4, 8, 16},
+        new int[]{3, 7, 15, 20},
+        null,
+        new int[]{1, 1, 1, 1},
+        null,
+        false);
+  }
+
+  private Analyzer keywordTestAnalyzer(int flags) throws Exception {
+    return new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        KeywordMarkerFilter kFilter = new KeywordMarkerFilter(tokenizer) {
+          private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+          @Override public boolean isKeyword() {
+            // Marks terms starting with the letter 'k' as keywords
+            return term.toString().charAt(0) == 'k';
+          }
+        };
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(kFilter, flags, null));
+      }
+    };
+  }
+
   /** concat numbers + words + all */
   public void testLotsOfConcatenating() throws Exception {
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
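
A usage note: the tests above mark keywords by subclassing KeywordMarkerFilter inline. In application code the stock markers, SetKeywordMarkerFilter for a fixed term set and PatternKeywordMarkerFilter for a regular expression, do the same job. Below is a sketch equivalent to the tests' "starts with 'k'" rule; the class and method names are illustrative only:

import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;

public class KeywordMarkingSketch {
  // Marks every term matching the pattern (here: starting with 'k') as a
  // keyword, mirroring the anonymous KeywordMarkerFilter in the tests; with
  // IGNORE_KEYWORDS among the flags, WordDelimiterGraphFilter leaves those
  // terms intact.
  static TokenStream buildChain(Tokenizer tokenizer, int flags) {
    TokenStream marked = new PatternKeywordMarkerFilter(tokenizer, Pattern.compile("k.*"));
    return new WordDelimiterGraphFilter(marked, flags, null);
  }
}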