mirror of https://github.com/apache/lucene.git
LUCENE-8730: WordDelimiterGraphFilter always emits its original token first
This commit is contained in:
parent
7c0caeacfa
commit
9591052fed
|
@ -75,6 +75,11 @@ Changes in Runtime Behavior
|
|||
* LUCENE-8671: Load FST off-heap also for ID-like fields if reader is not opened
|
||||
from an IndexWriter. (Simon Willnauer)
|
||||
|
||||
* LUCENE-8730: WordDelimiterGraphFilter always emits its original token first. This
|
||||
brings its behaviour into line with the deprecated WordDelimiterFilter, so that
|
||||
the only difference in output between the two is in the position length
|
||||
attribute. (Alan Woodward, Jim Ferenczi)
|
||||
|
||||
Other
|
||||
|
||||
* LUCENE-8680: Refactor EdgeTree#relateTriangle method. (Ignacio Vera)
|
||||
|
|
|
@ -268,6 +268,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
|||
lastConcatCount = 0;
|
||||
wordPos = 0;
|
||||
|
||||
if (has(PRESERVE_ORIGINAL)) {
|
||||
// add the original token now so that it is always emitted first
|
||||
// we will edit its position length (end position) after all other parts have been buffered
|
||||
buffer(0, 1, 0, savedTermLength);
|
||||
}
|
||||
|
||||
if (iterator.isSingleWord()) {
|
||||
buffer(wordPos, wordPos+1, iterator.current, iterator.end);
|
||||
wordPos++;
|
||||
|
@ -320,15 +326,16 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
if (has(PRESERVE_ORIGINAL)) {
|
||||
// we now know how many tokens need to be injected, so we can set the original
|
||||
// token's position length
|
||||
if (wordPos == 0) {
|
||||
// can happen w/ strange flag combos and inputs :)
|
||||
wordPos++;
|
||||
}
|
||||
// add the original token now so that we can set the correct end position
|
||||
buffer(0, wordPos, 0, savedTermLength);
|
||||
bufferedParts[1] = wordPos;
|
||||
}
|
||||
|
||||
sorter.sort(0, bufferedLen);
|
||||
sorter.sort(has(PRESERVE_ORIGINAL) ? 1 : 0, bufferedLen);
|
||||
wordPos = 0;
|
||||
|
||||
// set back to 0 for iterating from the buffer
|
||||
|
|
|
@ -380,6 +380,23 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
|||
};
|
||||
}
|
||||
|
||||
public void testOriginalTokenEmittedFirst() throws Exception {
|
||||
final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||
|
||||
/* analyzer that uses whitespace + wdf */
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
public TokenStreamComponents createComponents(String field) {
|
||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null));
|
||||
}
|
||||
};
|
||||
|
||||
assertAnalyzesTo(a, "abc-def abcDEF abc123",
|
||||
new String[] { "abc-def", "abcdef", "abc", "def", "abcDEF", "abcDEF", "abc", "DEF", "abc123", "abc123", "abc", "123" });
|
||||
a.close();
|
||||
}
|
||||
|
||||
/** concat numbers + words + all */
|
||||
public void testLotsOfConcatenating() throws Exception {
|
||||
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||
|
@ -418,7 +435,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
|||
};
|
||||
|
||||
assertAnalyzesTo(a, "abc-def-123-456",
|
||||
new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", "123456", "123", "456" },
|
||||
new String[] { "abc-def-123-456", "abcdef123456", "abcdef", "abc", "def", "123456", "123", "456" },
|
||||
new int[] { 0, 0, 0, 0, 0, 0, 0, 0 },
|
||||
new int[] { 15, 15, 15, 15, 15, 15, 15, 15 },
|
||||
null,
|
||||
|
|
Loading…
Reference in New Issue