diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3b212d3859d..517da0998e8 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -780,6 +780,12 @@ Changes in Runtime Behavior
   boost on a field that omits norms. Because the index-time boost is multiplied
   into the norm, previously your boost would be silently discarded.
   (Tomás Fernández Löbbe, Hoss Man, Robert Muir)
+
+* LUCENE-3848: Fix tokenstreams to not produce a stream with an initial
+  position increment of 0, which is out of bounds (overlapping with a
+  non-existent previous term). Consumers such as IndexWriter and QueryParser
+  still check for and silently correct this situation today, but at some point
+  in the future they may throw an exception. (Mike McCandless, Robert Muir)
 
 Security fixes
 
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index 692abd71ab2..5e140f17566 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -157,7 +157,11 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         }
       }
       if (posIncrAtt != null) {
-        assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+        if (i == 0) {
+          assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
+        } else {
+          assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+        }
       }
       if (posLengthAtt != null) {
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
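Note on the invariant the test framework now enforces: consumers derive absolute
positions by accumulating position increments, so a first token with an increment
of 0 would land on a position before the start of the field. A minimal sketch of
how a consumer in the spirit of IndexWriter walks a stream (illustrative only,
not the actual indexing code):

    // positions start "one before" the first slot; each token advances
    // the cursor by its increment, so the first increment must be >= 1
    int position = -1;
    while (stream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      // if the first token carried posinc=0, position would still be -1
      // here, i.e. the token would overlap a non-existent previous term
    }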
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java
index f810c28c90c..11fb85688a8 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java
@@ -33,6 +33,7 @@ public abstract class FilteringTokenFilter extends TokenFilter {
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
+  private boolean first = true; // only used when not preserving gaps
 
   public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
     super(input);
@@ -58,6 +59,13 @@ public abstract class FilteringTokenFilter extends TokenFilter {
     } else {
       while (input.incrementToken()) {
         if (accept()) {
+          if (first) {
+            // first token having posinc=0 is illegal.
+            if (posIncrAtt.getPositionIncrement() == 0) {
+              posIncrAtt.setPositionIncrement(1);
+            }
+            first = false;
+          }
           return true;
         }
       }
@@ -66,6 +74,12 @@ public abstract class FilteringTokenFilter extends TokenFilter {
     return false;
   }
 
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    first = true;
+  }
+
   /**
    * @see #setEnablePositionIncrements(boolean)
    */
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
index fd6720c7d5e..c495bdd11a3 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
@@ -121,6 +121,8 @@ public final class WikipediaTokenizer extends Tokenizer {
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+
+  private boolean first;
 
   /**
    * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
@@ -209,8 +211,13 @@ public final class WikipediaTokenizer extends Tokenizer {
       //output the untokenized Token first
       collapseAndSaveTokens(tokenType, type);
     }
-    posIncrAtt.setPositionIncrement(scanner.getPositionIncrement());
+    int posinc = scanner.getPositionIncrement();
+    if (first && posinc == 0) {
+      posinc = 1; // don't emit posinc=0 for the first token!
+    }
+    posIncrAtt.setPositionIncrement(posinc);
     typeAtt.setType(type);
+    first = false;
     return true;
   }
 
@@ -308,6 +315,7 @@ public final class WikipediaTokenizer extends Tokenizer {
     super.reset();
     tokens = null;
     scanner.reset();
+    first = true;
   }
 
   @Override
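Both fixes above follow the same pattern: remember whether the next emitted token
is the first one, clamp its increment to at least 1, and clear the flag in
reset() so reused streams behave identically. A self-contained sketch of that
pattern as a standalone filter (the class name FixFirstPosIncFilter is
hypothetical, not part of this patch):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

    public final class FixFirstPosIncFilter extends TokenFilter {
      private final PositionIncrementAttribute posIncAtt =
          addAttribute(PositionIncrementAttribute.class);
      private boolean first = true;

      public FixFirstPosIncFilter(TokenStream input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false;
        }
        if (first && posIncAtt.getPositionIncrement() == 0) {
          // first token may not overlap a non-existent previous position
          posIncAtt.setPositionIncrement(1);
        }
        first = false;
        return true;
      }

      @Override
      public void reset() throws IOException {
        super.reset(); // forgetting to reset per-stream state breaks reuse
        first = true;
      }
    }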
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java
index fe14521e918..a1cdd5f42ed 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java
@@ -17,13 +17,17 @@ package org.apache.lucene.analysis.core;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Set;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -120,4 +124,56 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
       System.out.println(s);
     }
   }
+
+  // stupid filter that inserts synonym of 'hte' for 'the'
+  private class MockSynonymFilter extends TokenFilter {
+    State bufferedState;
+    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+    MockSynonymFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (bufferedState != null) {
+        restoreState(bufferedState);
+        posIncAtt.setPositionIncrement(0);
+        termAtt.setEmpty().append("hte");
+        bufferedState = null;
+        return true;
+      } else if (input.incrementToken()) {
+        if (termAtt.toString().equals("the")) {
+          bufferedState = captureState();
+        }
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      bufferedState = null;
+    }
+  }
+
+  public void testFirstPosInc() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new MockSynonymFilter(tokenizer);
+        StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+        stopfilter.setEnablePositionIncrements(false);
+        return new TokenStreamComponents(tokenizer, stopfilter);
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "the quick brown fox",
+                     new String[] { "hte", "quick", "brown", "fox" },
+                     new int[] { 1, 1, 1, 1 });
+  }
 }
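How the new test provokes the bug: MockSynonymFilter emits "hte" with a position
increment of 0 immediately after "the", and StopFilter (with position increments
disabled) then removes "the" without leaving a gap, so the synonym would become
the first token of the stream. A token-by-token trace of the expected streams,
written as term(posinc) and inferred from the test's assertions:

    tokenizer:           the(1) quick(1) brown(1) fox(1)
    + MockSynonymFilter: the(1) hte(0) quick(1) brown(1) fox(1)
    + StopFilter:        hte(0) quick(1) brown(1) fox(1)  <- illegal first increment
    + FilteringTokenFilter clamp:
                         hte(1) quick(1) brown(1) fox(1)  <- what the test asserts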
tokens("c,5"), new String[] { "c", "cc", "c2" }, new int[] { 5, 0, 2 }); - assertTokenizesTo(map, tokens("c,0"), - new String[] { "c", "cc", "c2" }, - new int[] { 0, 0, 2 }); + assertTokenizesTo(map, tokens("d,1 c,0"), + new String[] { "d", "c", "cc", "c2" }, + new int[] { 1, 0, 0, 2 }); }