diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 1d45ab87886..c119eaae443 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -155,6 +155,10 @@ Bug Fixes
   token graph, messing up phrase queries when it was used during query
   parsing (Ere Maijala via Mike McCandless)
 
+* LUCENE-7708: ShingleFilter without unigrams was producing a disconnected
+  token graph, messing up queries when it was used during query
+  parsing (Jim Ferenczi)
+
 Improvements
 
 * LUCENE-7055: Added Weight#scorerSupplier, which allows to estimate the cost
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
index 5d992919999..e3fa8033cf2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
@@ -343,7 +343,12 @@ public final class ShingleFilter extends TokenFilter {
             noShingleOutput = false;
           }
           offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
-          posLenAtt.setPositionLength(builtGramSize);
+          if (outputUnigrams) {
+            posLenAtt.setPositionLength(builtGramSize);
+          } else {
+            // the position length for this token is the number of positions created by shingles of smaller size.
+            posLenAtt.setPositionLength(Math.max(1, (builtGramSize - minShingleSize) + 1));
+          }
           isOutputHere = true;
           gramSize.advance();
           tokenAvailable = true;
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
index 192de389d94..56459008be9 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 public class ShingleFilterTest extends BaseTokenStreamTestCase {
 
@@ -1239,7 +1239,6 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
     filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
     filter.setFillerToken(null);
     filter.setTokenSeparator(null);
-
     assertTokenStreamContents(filter,
         new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"},
         new int[] {0, 0, 0, 7, 7, 7},
@@ -1247,4 +1246,95 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
         new int[] {1, 0, 0, 1, 0, 0},
         20);
   }
+
+  public void testPositionLength() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4);
+        filter.setOutputUnigrams(false);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+    assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+        new String[] {"to be or not", "be or not to", "or not to be"},
+        new int[] {0, 3, 6},
+        new int[] {12, 15, 18},
+        null,
+        new int[] {1, 1, 1},
+        new int[] {1, 1, 1},
+        18,
+        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+        // finishing at the same position
+        false);
+
+    a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
+        filter.setOutputUnigrams(false);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+    assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+        new String[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to",
+            "or not to be", "not to", "not to be", "to be"},
+        new int[] {0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13},
+        new int[] {5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18},
+        null,
+        new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1},
+        new int[] {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1},
+        18,
+        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+        // finishing at the same position
+        false);
+
+    a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4);
+        filter.setOutputUnigrams(false);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+    assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+        new String[] {"to be or", "to be or not", "be or not", "be or not to", "or not to",
+            "or not to be", "not to be"},
+        new int[] {0, 0, 3, 3, 6, 6, 9},
+        new int[] {8, 12, 12, 15, 15, 18, 18},
+        null,
+        new int[] {1, 0, 1, 0, 1, 0, 1},
+        new int[] {1, 2, 1, 2, 1, 2, 1},
+        18,
+        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+        // finishing at the same position
+        false);
+
+    a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5);
+        filter.setOutputUnigrams(false);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+    assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+        new String[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to",
+            "be or not to be", "or not to", "or not to be", "not to be"},
+        new int[] {0, 0, 0, 3, 3, 3, 6, 6, 9},
+        new int[] {8, 12, 15, 12, 15, 18, 15, 18, 18},
+        null,
+        new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1},
+        new int[] {1, 2, 3, 1, 2, 3, 1, 2, 1},
+        18,
+        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+        // finishing at the same position
+        false);
+  }
 }
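
Note for reviewers (not part of the patch): with unigrams disabled, the new else-branch
gives a shingle built from n input tokens a position length of
max(1, n - minShingleSize + 1), i.e. the number of positions it spans once the smallest
shingle size acts as the unit step of the token graph. For minShingleSize=2 this yields
"to be" -> 1, "to be or" -> 2, "to be or not" -> 3, matching the posLen column of the
second testPositionLength case above. The standalone sketch below prints those values
using the public ShingleFilter API touched by this change; the class name
ShinglePosLenDemo and the sample sentence are illustrative only, not part of Lucene.

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.shingle.ShingleFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

    // Illustrative demo class (hypothetical name), not part of the patch.
    public class ShinglePosLenDemo {
      public static void main(String[] args) throws Exception {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("to be or not to be"));
        ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
        filter.setOutputUnigrams(false); // the configuration this patch fixes

        CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = filter.addAttribute(PositionIncrementAttribute.class);
        PositionLengthAttribute posLen = filter.addAttribute(PositionLengthAttribute.class);

        try (TokenStream ts = filter) {
          ts.reset();
          int position = -1;
          while (ts.incrementToken()) {
            position += posIncr.getPositionIncrement();
            // after this patch, posLen is max(1, shingleSize - minShingleSize + 1),
            // so the graph stays connected when unigrams are not emitted
            System.out.printf("%-14s position=%d positionLength=%d%n",
                term.toString(), position, posLen.getPositionLength());
          }
          ts.end();
        }
      }
    }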