mirror of https://github.com/apache/lucene.git
LUCENE-8509: WordDelimiterGraphFilter no longer adjusts offsets by default
This commit is contained in:
parent
f5867a1413
commit
75a053dd69
|
@ -142,6 +142,10 @@ Changes in Runtime Behavior
|
||||||
anymore. This doesn't affect ordering as this is a constant factor which is
|
anymore. This doesn't affect ordering as this is a constant factor which is
|
||||||
the same for every document. (Luca Cavanna via Adrien Grand)
|
the same for every document. (Luca Cavanna via Adrien Grand)
|
||||||
|
|
||||||
|
* LUCENE-8509: WordDelimiterGraphFilter will no longer set the offsets of internal
|
||||||
|
tokens by default, preventing a number of bugs when the filter is chained with
|
||||||
|
token filters that change the length of their tokens. (Alan Woodward)
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
|
|
||||||
* LUCENE-8340: LongPoint#newDistanceQuery may be used to boost scores based on
|
* LUCENE-8340: LongPoint#newDistanceQuery may be used to boost scores based on
|
||||||
|
|
|
@ -191,6 +191,8 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
||||||
// used for concatenating runs of similar typed subwords (word,number)
|
// used for concatenating runs of similar typed subwords (word,number)
|
||||||
private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
|
private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
|
||||||
|
|
||||||
|
private final boolean adjustInternalOffsets;
|
||||||
|
|
||||||
// number of subwords last output by concat.
|
// number of subwords last output by concat.
|
||||||
private int lastConcatCount;
|
private int lastConcatCount;
|
||||||
|
|
||||||
|
@ -206,10 +208,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
||||||
private int savedEndOffset;
|
private int savedEndOffset;
|
||||||
private AttributeSource.State savedState;
|
private AttributeSource.State savedState;
|
||||||
private int lastStartOffset;
|
private int lastStartOffset;
|
||||||
|
private boolean adjustingOffsets;
|
||||||
// if length by start + end offsets doesn't match the term text then assume
|
|
||||||
// this is a synonym and don't adjust the offsets.
|
|
||||||
private boolean hasIllegalOffsets;
|
|
||||||
|
|
||||||
private int wordPos;
|
private int wordPos;
|
||||||
|
|
||||||
|
@ -217,11 +216,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
||||||
* Creates a new WordDelimiterGraphFilter
|
* Creates a new WordDelimiterGraphFilter
|
||||||
*
|
*
|
||||||
* @param in TokenStream to be filtered
|
* @param in TokenStream to be filtered
|
||||||
|
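* @param adjustInternalOffsets if true, the offsets of partial terms (word parts and concatenations) are adjusted, provided the incoming token's offsets match its term length; if false, partial terms keep the incoming token's start and end offsets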
|
||||||
* @param charTypeTable table containing character types
|
* @param charTypeTable table containing character types
|
||||||
* @param configurationFlags Flags configuring the filter
|
* @param configurationFlags Flags configuring the filter
|
||||||
* @param protWords If not null is the set of tokens to protect from being delimited
|
* @param protWords If not null is the set of tokens to protect from being delimited
|
||||||
*/
|
*/
|
||||||
public WordDelimiterGraphFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
|
public WordDelimiterGraphFilter(TokenStream in, boolean adjustInternalOffsets, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
|
||||||
super(in);
|
super(in);
|
||||||
if ((configurationFlags &
|
if ((configurationFlags &
|
||||||
~(GENERATE_WORD_PARTS |
|
~(GENERATE_WORD_PARTS |
|
||||||
|
@ -240,6 +240,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
||||||
this.protWords = protWords;
|
this.protWords = protWords;
|
||||||
this.iterator = new WordDelimiterIterator(
|
this.iterator = new WordDelimiterIterator(
|
||||||
charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE));
|
charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE));
|
||||||
|
this.adjustInternalOffsets = adjustInternalOffsets;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -251,7 +252,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
||||||
* @param protWords If not null is the set of tokens to protect from being delimited
|
* @param protWords If not null is the set of tokens to protect from being delimited
|
||||||
*/
|
*/
|
||||||
public WordDelimiterGraphFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
|
public WordDelimiterGraphFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
|
||||||
this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
|
this(in, false, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Iterates all words parts and concatenations, buffering up the term parts we should return. */
|
/** Iterates all words parts and concatenations, buffering up the term parts we should return. */
|
||||||
|
@ -261,7 +262,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
||||||
|
|
||||||
// if length by start + end offsets doesn't match the term's text then set offsets for all our word parts/concats to the incoming
|
// if length by start + end offsets doesn't match the term's text then set offsets for all our word parts/concats to the incoming
|
||||||
// offsets. this can happen if WDGF is applied to an injected synonym, or to a stem'd form, etc:
|
// offsets. this can happen if WDGF is applied to an injected synonym, or to a stem'd form, etc. (the incoming offsets are also kept whenever adjustInternalOffsets is false):
|
||||||
hasIllegalOffsets = (savedEndOffset - savedStartOffset != savedTermLength);
|
adjustingOffsets = adjustInternalOffsets && savedEndOffset - savedStartOffset == savedTermLength;
|
||||||
|
|
||||||
bufferedLen = 0;
|
bufferedLen = 0;
|
||||||
lastConcatCount = 0;
|
lastConcatCount = 0;
|
||||||
|
@ -391,7 +392,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
||||||
int startOffset;
|
int startOffset;
|
||||||
int endOffset;
|
int endOffset;
|
||||||
|
|
||||||
if (hasIllegalOffsets) {
|
if (adjustingOffsets == false) {
|
||||||
startOffset = savedStartOffset;
|
startOffset = savedStartOffset;
|
||||||
endOffset = savedEndOffset;
|
endOffset = savedEndOffset;
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -53,12 +53,14 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.*;
|
||||||
public class WordDelimiterGraphFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
|
public class WordDelimiterGraphFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
|
||||||
public static final String PROTECTED_TOKENS = "protected";
|
public static final String PROTECTED_TOKENS = "protected";
|
||||||
public static final String TYPES = "types";
|
public static final String TYPES = "types";
|
||||||
|
public static final String OFFSETS = "adjustOffsets";
|
||||||
|
|
||||||
private final String wordFiles;
|
private final String wordFiles;
|
||||||
private final String types;
|
private final String types;
|
||||||
private final int flags;
|
private final int flags;
|
||||||
byte[] typeTable = null;
|
byte[] typeTable = null;
|
||||||
private CharArraySet protectedWords = null;
|
private CharArraySet protectedWords = null;
|
||||||
|
private boolean adjustOffsets = false;
|
||||||
|
|
||||||
/** Creates a new WordDelimiterGraphFilterFactory */
|
/** Creates a new WordDelimiterGraphFilterFactory */
|
||||||
public WordDelimiterGraphFilterFactory(Map<String, String> args) {
|
public WordDelimiterGraphFilterFactory(Map<String, String> args) {
|
||||||
|
@ -94,6 +96,7 @@ public class WordDelimiterGraphFilterFactory extends TokenFilterFactory implemen
|
||||||
wordFiles = get(args, PROTECTED_TOKENS);
|
wordFiles = get(args, PROTECTED_TOKENS);
|
||||||
types = get(args, TYPES);
|
types = get(args, TYPES);
|
||||||
this.flags = flags;
|
this.flags = flags;
|
||||||
|
this.adjustOffsets = getBoolean(args, OFFSETS, true);
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
@ -117,7 +120,7 @@ public class WordDelimiterGraphFilterFactory extends TokenFilterFactory implemen
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenFilter create(TokenStream input) {
|
public TokenFilter create(TokenStream input) {
|
||||||
return new WordDelimiterGraphFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
|
return new WordDelimiterGraphFilter(input, adjustOffsets, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
|
||||||
flags, protectedWords);
|
flags, protectedWords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -64,7 +64,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||||
// test that subwords and catenated subwords have
|
// test that subwords and catenated subwords have
|
||||||
// the correct offsets.
|
// the correct offsets.
|
||||||
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 12)),
|
||||||
|
true, DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||||
|
|
||||||
assertTokenStreamContents(wdf,
|
assertTokenStreamContents(wdf,
|
||||||
new String[] { "foobar", "foo", "bar" },
|
new String[] { "foobar", "foo", "bar" },
|
||||||
|
@ -72,7 +73,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
new int[] { 12, 8, 12 });
|
new int[] { 12, 8, 12 });
|
||||||
|
|
||||||
// with illegal offsets:
|
// with illegal offsets:
|
||||||
wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 6)), true, DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||||
assertTokenStreamContents(wdf,
|
assertTokenStreamContents(wdf,
|
||||||
new String[] { "foobar", "foo", "bar" },
|
new String[] { "foobar", "foo", "bar" },
|
||||||
new int[] { 5, 5, 5 },
|
new int[] { 5, 5, 5 },
|
||||||
|
@ -81,7 +82,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testOffsetChange() throws Exception {
|
public void testOffsetChange() throws Exception {
|
||||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||||
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)),
|
||||||
|
true, DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||||
|
|
||||||
assertTokenStreamContents(wdf,
|
assertTokenStreamContents(wdf,
|
||||||
new String[] { "übelkeit" },
|
new String[] { "übelkeit" },
|
||||||
|
@ -91,7 +93,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testOffsetChange2() throws Exception {
|
public void testOffsetChange2() throws Exception {
|
||||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||||
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)),
|
||||||
|
true, DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||||
// illegal offsets:
|
// illegal offsets:
|
||||||
assertTokenStreamContents(wdf,
|
assertTokenStreamContents(wdf,
|
||||||
new String[] { "übelkeit" },
|
new String[] { "übelkeit" },
|
||||||
|
@ -101,7 +104,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testOffsetChange3() throws Exception {
|
public void testOffsetChange3() throws Exception {
|
||||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||||
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)),
|
||||||
|
true, DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||||
assertTokenStreamContents(wdf,
|
assertTokenStreamContents(wdf,
|
||||||
new String[] { "übelkeit" },
|
new String[] { "übelkeit" },
|
||||||
new int[] { 8 },
|
new int[] { 8 },
|
||||||
|
@ -110,7 +114,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testOffsetChange4() throws Exception {
|
public void testOffsetChange4() throws Exception {
|
||||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||||
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)),
|
||||||
|
true, DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||||
|
|
||||||
assertTokenStreamContents(wdf,
|
assertTokenStreamContents(wdf,
|
||||||
new String[] { "foobar", "foo", "bar"},
|
new String[] { "foobar", "foo", "bar"},
|
||||||
|
@ -120,7 +125,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void doSplit(final String input, String... output) throws Exception {
|
public void doSplit(final String input, String... output) throws Exception {
|
||||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||||
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(keywordMockTokenizer(input),
|
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(keywordMockTokenizer(input), false,
|
||||||
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
|
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||||
|
|
||||||
assertTokenStreamContents(wdf, output);
|
assertTokenStreamContents(wdf, output);
|
||||||
|
@ -182,7 +187,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
// the correct offsets.
|
// the correct offsets.
|
||||||
Token token = new Token("foo-bar", 5, 12);
|
Token token = new Token("foo-bar", 5, 12);
|
||||||
token.setType("mytype");
|
token.setType("mytype");
|
||||||
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(token), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(token), flags, null);
|
||||||
|
|
||||||
assertTokenStreamContents(wdf,
|
assertTokenStreamContents(wdf,
|
||||||
new String[] {"foobar", "foo", "bar"},
|
new String[] {"foobar", "foo", "bar"},
|
||||||
|
@ -235,7 +240,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
public TokenStreamComponents createComponents(String field) {
|
public TokenStreamComponents createComponents(String field) {
|
||||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(
|
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(
|
||||||
tokenizer,
|
tokenizer, true, DEFAULT_WORD_DELIM_TABLE,
|
||||||
flags, protWords));
|
flags, protWords));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -272,7 +277,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
public TokenStreamComponents createComponents(String field) {
|
public TokenStreamComponents createComponents(String field) {
|
||||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(
|
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(
|
||||||
new LargePosIncTokenFilter(tokenizer),
|
new LargePosIncTokenFilter(tokenizer), true, DEFAULT_WORD_DELIM_TABLE,
|
||||||
flags, protWords));
|
flags, protWords));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -317,7 +322,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
public TokenStreamComponents createComponents(String field) {
|
public TokenStreamComponents createComponents(String field) {
|
||||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
StopFilter filter = new StopFilter(tokenizer, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
|
StopFilter filter = new StopFilter(tokenizer, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||||
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(filter, flags, protWords));
|
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(filter, true, DEFAULT_WORD_DELIM_TABLE, flags, protWords));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -350,8 +355,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS),
|
assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS),
|
||||||
"abc-def klm-nop kpop",
|
"abc-def klm-nop kpop",
|
||||||
new String[] {"abc", "def", "klm-nop", "kpop"},
|
new String[] {"abc", "def", "klm-nop", "kpop"},
|
||||||
new int[]{0, 4, 8, 16},
|
new int[]{0, 0, 8, 16},
|
||||||
new int[]{3, 7, 15, 20},
|
new int[]{7, 7, 15, 20},
|
||||||
null,
|
null,
|
||||||
new int[]{1, 1, 1, 1},
|
new int[]{1, 1, 1, 1},
|
||||||
null,
|
null,
|
||||||
|
@ -384,7 +389,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
@Override
|
@Override
|
||||||
public TokenStreamComponents createComponents(String field) {
|
public TokenStreamComponents createComponents(String field) {
|
||||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, null));
|
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -414,8 +419,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
assertAnalyzesTo(a, "abc-def-123-456",
|
assertAnalyzesTo(a, "abc-def-123-456",
|
||||||
new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", "123456", "123", "456" },
|
new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", "123456", "123", "456" },
|
||||||
new int[] { 0, 0, 0, 0, 4, 8, 8, 12 },
|
new int[] { 0, 0, 0, 0, 0, 0, 0, 0 },
|
||||||
new int[] { 15, 15, 7, 3, 7, 15, 11, 15 },
|
new int[] { 15, 15, 15, 15, 15, 15, 15, 15 },
|
||||||
null,
|
null,
|
||||||
new int[] { 1, 0, 0, 0, 1, 1, 0, 1 },
|
new int[] { 1, 0, 0, 0, 1, 1, 0, 1 },
|
||||||
null,
|
null,
|
||||||
|
@ -954,7 +959,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testEmptyString() throws Exception {
|
public void testEmptyString() throws Exception {
|
||||||
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("", 0, 0)), DEFAULT_WORD_DELIM_TABLE, GENERATE_WORD_PARTS | CATENATE_ALL | PRESERVE_ORIGINAL, null);
|
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("", 0, 0)),
|
||||||
|
GENERATE_WORD_PARTS | CATENATE_ALL | PRESERVE_ORIGINAL, null);
|
||||||
wdf.reset();
|
wdf.reset();
|
||||||
assertTrue(wdf.incrementToken());
|
assertTrue(wdf.incrementToken());
|
||||||
assertFalse(wdf.incrementToken());
|
assertFalse(wdf.incrementToken());
|
||||||
|
@ -967,7 +973,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
new Token("foo-bar", 0, 7));
|
new Token("foo-bar", 0, 7));
|
||||||
|
|
||||||
CharArraySet protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("foo17-BAR")), true);
|
CharArraySet protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("foo17-BAR")), true);
|
||||||
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(tokens, DEFAULT_WORD_DELIM_TABLE, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | CATENATE_ALL, protectedWords);
|
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(tokens, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | CATENATE_ALL, protectedWords);
|
||||||
assertGraphStrings(wdf,
|
assertGraphStrings(wdf,
|
||||||
"foo17-bar foo bar",
|
"foo17-bar foo bar",
|
||||||
"foo17-bar foo-bar",
|
"foo17-bar foo-bar",
|
||||||
|
|
Loading…
Reference in New Issue