From 3de0b3671998cc9bc723d10f1b31ce48cbd4fa64 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 1 Apr 2019 18:21:06 +0100 Subject: [PATCH] LUCENE-8730: WordDelimiterGraphFilter always emits its original token first --- lucene/CHANGES.txt | 5 +++++ .../WordDelimiterGraphFilter.java | 13 ++++++++++--- .../TestWordDelimiterGraphFilter.java | 19 ++++++++++++++++++- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b54fa3ff262..c9be635e034 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -90,6 +90,11 @@ Changes in Runtime Behavior * LUCENE-8671: Load FST off-heap also for ID-like fields if reader is not opened from an IndexWriter. (Simon Willnauer) +* LUCENE-8730: WordDelimiterGraphFilter always emits its original token first. This + brings its behaviour into line with the deprecated WordDelimiterFilter, so that + the only difference in output between the two is in the position length + attribute. (Alan Woodward, Jim Ferenczi) + Other * LUCENE-8680: Refactor EdgeTree#relateTriangle method. 
(Ignacio Vera) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java index 00ace5b29b6..a04eaff8dd6 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java @@ -268,6 +268,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter { lastConcatCount = 0; wordPos = 0; + if (has(PRESERVE_ORIGINAL)) { + // add the original token now so that it is always emitted first + // we will edit the position length after all other parts have been buffered + buffer(0, 1, 0, savedTermLength); + } + if (iterator.isSingleWord()) { buffer(wordPos, wordPos+1, iterator.current, iterator.end); wordPos++; @@ -320,15 +326,16 @@ public final class WordDelimiterGraphFilter extends TokenFilter { } if (has(PRESERVE_ORIGINAL)) { + // we now know how many tokens need to be injected, so we can set the original + // token's position length if (wordPos == 0) { // can happen w/ strange flag combos and inputs :) wordPos++; } - // add the original token now so that we can set the correct end position - buffer(0, wordPos, 0, savedTermLength); + bufferedParts[1] = wordPos; } - sorter.sort(0, bufferedLen); + sorter.sort(has(PRESERVE_ORIGINAL) ? 
1 : 0, bufferedLen); wordPos = 0; // set back to 0 for iterating from the buffer diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java index e3f3f6587fb..41109b8ea9c 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java @@ -380,6 +380,23 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase { }; } + public void testOriginalTokenEmittedFirst() throws Exception { + final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; + + /* analyzer that uses whitespace + wdf */ + Analyzer a = new Analyzer() { + @Override + public TokenStreamComponents createComponents(String field) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null)); + } + }; + + assertAnalyzesTo(a, "abc-def abcDEF abc123", + new String[] { "abc-def", "abcdef", "abc", "def", "abcDEF", "abcDEF", "abc", "DEF", "abc123", "abc123", "abc", "123" }); + a.close(); + } + /** concat numbers + words + all */ public void testLotsOfConcatenating() throws Exception { final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; @@ -418,7 +435,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase { }; assertAnalyzesTo(a, "abc-def-123-456", - new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", 
"123456", "123", "456" }, + new String[] { "abc-def-123-456", "abcdef123456", "abcdef", "abc", "def", "123456", "123", "456" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 15, 15, 15, 15, 15, 15, 15, 15 }, null,