LUCENE-8730: WordDelimiterGraphFilter always emits its original token first

This commit is contained in:
Alan Woodward 2019-04-01 18:21:06 +01:00
parent 2480b74887
commit 3de0b36719
3 changed files with 33 additions and 4 deletions

View File

@@ -90,6 +90,11 @@ Changes in Runtime Behavior
* LUCENE-8671: Load FST off-heap also for ID-like fields if reader is not opened
from an IndexWriter. (Simon Willnauer)
* LUCENE-8730: WordDelimiterGraphFilter always emits its original token first. This
brings its behaviour into line with the deprecated WordDelimiterFilter, so that
the only difference in output between the two is in the position length
attribute. (Alan Woodward, Jim Ferenczi)
Other
* LUCENE-8680: Refactor EdgeTree#relateTriangle method. (Ignacio Vera)

View File

@@ -268,6 +268,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
lastConcatCount = 0;
wordPos = 0;
if (has(PRESERVE_ORIGINAL)) {
// add the original token now so that it is always emitted first
// we will edit the term length after all other parts have been buffered
buffer(0, 1, 0, savedTermLength);
}
if (iterator.isSingleWord()) {
buffer(wordPos, wordPos+1, iterator.current, iterator.end);
wordPos++;
@@ -320,15 +326,16 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
}
if (has(PRESERVE_ORIGINAL)) {
// we now know how many tokens need to be injected, so we can set the original
// token's position length
if (wordPos == 0) {
// can happen w/ strange flag combos and inputs :)
wordPos++;
}
// add the original token now so that we can set the correct end position
buffer(0, wordPos, 0, savedTermLength);
bufferedParts[1] = wordPos;
}
sorter.sort(0, bufferedLen);
sorter.sort(has(PRESERVE_ORIGINAL) ? 1 : 0, bufferedLen);
wordPos = 0;
// set back to 0 for iterating from the buffer

View File

@@ -380,6 +380,23 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
};
}
// LUCENE-8730: with PRESERVE_ORIGINAL set, the original (unsplit) token must be
// emitted before any parts generated from it. The expected-token array below
// shows each whitespace-delimited input token's original form first, followed
// by the catenated form and/or generated sub-parts.
public void testOriginalTokenEmittedFirst() throws Exception {
final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
/* analyzer that uses whitespace + wdf */
Analyzer a = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null));
}
};
// e.g. "abc-def" (original) precedes "abcdef" (catenated) and "abc"/"def" (parts);
// for "abcDEF" the original and the catenation of the case-split parts are identical
assertAnalyzesTo(a, "abc-def abcDEF abc123",
new String[] { "abc-def", "abcdef", "abc", "def", "abcDEF", "abcDEF", "abc", "DEF", "abc123", "abc123", "abc", "123" });
a.close();
}
/** concat numbers + words + all */
public void testLotsOfConcatenating() throws Exception {
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
@@ -418,7 +435,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
};
assertAnalyzesTo(a, "abc-def-123-456",
new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", "123456", "123", "456" },
new String[] { "abc-def-123-456", "abcdef123456", "abcdef", "abc", "def", "123456", "123", "456" },
new int[] { 0, 0, 0, 0, 0, 0, 0, 0 },
new int[] { 15, 15, 15, 15, 15, 15, 15, 15 },
null,