mirror of https://github.com/apache/lucene.git
LUCENE-8265: WordDelimiter*Filter ignores keywords
commit fc0878cc2f (parent 4fba55c864)
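The change adds an IGNORE_KEYWORDS flag (512) to both WordDelimiterFilter and WordDelimiterGraphFilter: when set, any token whose KeywordAttribute is true passes through the filter untouched. A minimal usage sketch, not taken from the commit — the field name, keyword set, and analyzer wiring are illustrative; SetKeywordMarkerFilter is the stock Lucene filter for marking keywords from a CharArraySet:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;

public class IgnoreKeywordsExample {
  public static Analyzer build() {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        // Mark "wi-fi" as a keyword so the delimiter filter leaves it whole.
        CharArraySet keywords = new CharArraySet(1, true);
        keywords.add("wi-fi");
        TokenStream sink = new SetKeywordMarkerFilter(source, keywords);
        sink = new WordDelimiterGraphFilter(sink,
            WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                | WordDelimiterGraphFilter.IGNORE_KEYWORDS,
            null); // no protected-words set
        return new TokenStreamComponents(source, sink);
      }
    };
  }
}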
WordDelimiterFilter.java

@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
@@ -164,7 +165,12 @@ public final class WordDelimiterFilter extends TokenFilter {
    * "O'Neil's" => "O", "Neil"
    */
   public static final int STEM_ENGLISH_POSSESSIVE = 256;
-
+
+  /**
+   * Suppresses processing terms with {@link KeywordAttribute#isKeyword()}=true.
+   */
+  public static final int IGNORE_KEYWORDS = 512;
+
   /**
    * If not null, the set of tokens to protect from being delimited
    *
@@ -174,6 +180,7 @@ public final class WordDelimiterFilter extends TokenFilter {
   private final int flags;
 
   private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
   private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
   private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
@@ -243,7 +250,9 @@ public final class WordDelimiterFilter extends TokenFilter {
     if (!input.incrementToken()) {
       return false;
     }
-
+    if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) {
+      return true;
+    }
     int termLength = termAttribute.length();
     char[] termBuffer = termAttribute.buffer();
 
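The has(IGNORE_KEYWORDS) call in the last hunk is a bit test against the flags field declared above; the helper itself sits outside the visible context. Its assumed shape, consistent with the flag constants each occupying a distinct power-of-two bit (IGNORE_KEYWORDS = 512 is the next free bit after STEM_ENGLISH_POSSESSIVE = 256):

// Assumed shape of the private helper called in incrementToken() above:
// true if the given option bit was set in the configuration flags.
private boolean has(int flag) {
  return (flags & flag) != 0;
}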
WordDelimiterGraphFilter.java

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
@@ -39,7 +40,7 @@ import org.apache.lucene.util.RamUsageEstimator;
  * work correctly when this filter is used in the search-time analyzer. Unlike
  * the deprecated {@link WordDelimiterFilter}, this token filter produces a
  * correct token graph as output. However, it cannot consume an input token
- * graph correctly.
+ * graph correctly. Processing is suppressed by {@link KeywordAttribute#isKeyword()}=true.
  *
  * <p>
  * Words are split into subwords with the following rules:
@@ -156,7 +157,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
    * "O'Neil's" => "O", "Neil"
    */
   public static final int STEM_ENGLISH_POSSESSIVE = 256;
-
+
+  /**
+   * Suppresses processing terms with {@link KeywordAttribute#isKeyword()}=true.
+   */
+  public static final int IGNORE_KEYWORDS = 512;
+
   /**
    * If not null, the set of tokens to protect from being delimited
    *
@@ -174,6 +180,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
   private char[][] bufferedTermParts = new char[4][];
 
   private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
   private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
   private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
@@ -225,7 +232,8 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
                                PRESERVE_ORIGINAL |
                                SPLIT_ON_CASE_CHANGE |
                                SPLIT_ON_NUMERICS |
-                               STEM_ENGLISH_POSSESSIVE)) != 0) {
+                               STEM_ENGLISH_POSSESSIVE |
+                               IGNORE_KEYWORDS)) != 0) {
       throw new IllegalArgumentException("flags contains unrecognized flag: " + configurationFlags);
     }
     this.flags = configurationFlags;
@@ -335,7 +343,9 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
     if (input.incrementToken() == false) {
       return false;
     }
-
+    if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) {
+      return true;
+    }
     int termLength = termAttribute.length();
     char[] termBuffer = termAttribute.buffer();
 
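The constructor hunk above matters because WordDelimiterGraphFilter rejects unknown flag bits; without adding IGNORE_KEYWORDS to the mask, passing the new flag would trip the IllegalArgumentException shown. A sketch of the full check, assuming the lines above the hunk list the class's remaining flag constants (they are outside the visible context):

// Reconstruction sketch; the flag names before PRESERVE_ORIGINAL are
// assumed from the class's other constants, not shown in the hunk.
if ((configurationFlags & ~(GENERATE_WORD_PARTS |
                            GENERATE_NUMBER_PARTS |
                            CATENATE_WORDS |
                            CATENATE_NUMBERS |
                            CATENATE_ALL |
                            PRESERVE_ORIGINAL |
                            SPLIT_ON_CASE_CHANGE |
                            SPLIT_ON_NUMERICS |
                            STEM_ENGLISH_POSSESSIVE |
                            IGNORE_KEYWORDS)) != 0) {
  throw new IllegalArgumentException("flags contains unrecognized flag: " + configurationFlags);
}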
TestWordDelimiterFilter.java

@@ -27,7 +27,6 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.util.IOUtils;
-import org.junit.Test;
 
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
@@ -57,7 +56,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
   }
   ***/
 
-  @Test
   public void testOffsets() throws IOException {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     // test that subwords and catenated subwords have
@@ -77,7 +75,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
       new int[] { 6, 6, 6 });
   }
 
-  @Test
   public void testOffsetChange() throws Exception {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -88,7 +85,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
       new int[] { 15 });
   }
 
-  @Test
   public void testOffsetChange2() throws Exception {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -99,7 +95,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
       new int[] { 17 });
   }
 
-  @Test
   public void testOffsetChange3() throws Exception {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -110,7 +105,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
       new int[] { 16 });
   }
 
-  @Test
   public void testOffsetChange4() throws Exception {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -129,7 +123,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
     assertTokenStreamContents(wdf, output);
   }
 
-  @Test
   public void testSplits() throws Exception {
     doSplit("basic-split","basic","split");
     doSplit("camelCase","camel","Case");
@@ -175,7 +168,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
   /*
   * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters.
   */
-  @Test
   public void testPossessives() throws Exception {
     doSplitPossessive(1, "ra's", "ra");
     doSplitPossessive(0, "ra's", "ra", "s");
@@ -204,7 +196,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
     }
   }
 
-  @Test
   public void testPositionIncrements() throws Exception {
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
@@ -323,6 +314,38 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
     IOUtils.close(a, a2, a3);
   }
 
+  public void testKeywordFilter() throws Exception {
+    assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS),
+                     "abc-def klm-nop kpop",
+                     new String[] {"abc", "def", "klm", "nop", "kpop"});
+    assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS),
+                     "abc-def klm-nop kpop",
+                     new String[] {"abc", "def", "klm-nop", "kpop"},
+                     new int[]{0, 4, 8, 16},
+                     new int[]{3, 7, 15, 20},
+                     null,
+                     new int[]{1, 1, 1, 1},
+                     null,
+                     false);
+  }
+
+  private Analyzer keywordTestAnalyzer(int flags) throws Exception {
+    return new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        KeywordMarkerFilter kFilter = new KeywordMarkerFilter(tokenizer) {
+          private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+          @Override public boolean isKeyword() {
+            // Marks terms starting with the letter 'k' as keywords
+            return term.toString().charAt(0) == 'k';
+          }
+        };
+        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(kFilter, flags, null));
+      }
+    };
+  }
+
   /** concat numbers + words + all */
   public void testLotsOfConcatenating() throws Exception {
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
@@ -346,7 +369,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
       false);
     a.close();
   }
-
+
   /** concat numbers + words + all + preserve original */
   public void testLotsOfConcatenating2() throws Exception {
     final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
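A note on the long-form assertAnalyzesTo call in testKeywordFilter above: after the expected terms, the arguments are the expected start offsets, end offsets, token types (null to skip), position increments, position lengths (null to skip), and what appears to be a final offsetsAreCorrect boolean. The offset arrays show the point of the feature: with IGNORE_KEYWORDS, "klm-nop" keeps its original span 8..15 as a single token, while the non-keyword "abc-def" is still split.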
TestWordDelimiterGraphFilter.java

@@ -309,6 +309,38 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
     IOUtils.close(a, a2, a3);
   }
 
+  public void testKeywordFilter() throws Exception {
+    assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS),
+                     "abc-def klm-nop kpop",
+                     new String[] {"abc", "def", "klm", "nop", "kpop"});
+    assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS),
+                     "abc-def klm-nop kpop",
+                     new String[] {"abc", "def", "klm-nop", "kpop"},
+                     new int[]{0, 4, 8, 16},
+                     new int[]{3, 7, 15, 20},
+                     null,
+                     new int[]{1, 1, 1, 1},
+                     null,
+                     false);
+  }
+
+  private Analyzer keywordTestAnalyzer(int flags) throws Exception {
+    return new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        KeywordMarkerFilter kFilter = new KeywordMarkerFilter(tokenizer) {
+          private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+          @Override public boolean isKeyword() {
+            // Marks terms starting with the letter 'k' as keywords
+            return term.toString().charAt(0) == 'k';
+          }
+        };
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(kFilter, flags, null));
+      }
+    };
+  }
+
   /** concat numbers + words + all */
   public void testLotsOfConcatenating() throws Exception {
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
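As a quick manual check, a hypothetical driver (not part of the commit) can print each emitted token and its keyword flag; it assumes the keywordTestAnalyzer helper defined in the tests above and an arbitrary field name "f":

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

public class DumpTokens {
  // Prints every token the analyzer emits, with its keyword flag.
  static void dump(Analyzer analyzer) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("f", "abc-def klm-nop kpop")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      KeywordAttribute kw = ts.addAttribute(KeywordAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term + " keyword=" + kw.isKeyword());
      }
      ts.end();
    }
  }
}
// With GENERATE_WORD_PARTS | IGNORE_KEYWORDS the expected output is:
//   abc, def, klm-nop (keyword=true), kpop (keyword=true)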