LUCENE-3848: don't produce tokenstreams that start with posinc=0

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1301478 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-03-16 13:06:30 +00:00
parent 4f8375ded9
commit 3d2d144f92
6 changed files with 108 additions and 20 deletions

lucene/CHANGES.txt

@@ -781,6 +781,12 @@ Changes in Runtime Behavior
   is multiplied into the norm, previously your boost would be
   silently discarded. (Tomás Fernández Löbbe, Hoss Man, Robert Muir)
 
+* LUCENE-3848: Fix tokenstreams to not produce a stream with an initial
+  position increment of 0, which is out of bounds (overlapping with a
+  non-existent previous term). Consumers such as IndexWriter and QueryParser
+  still check for and silently correct this situation today, but at some point
+  in the future they may throw an exception. (Mike McCandless, Robert Muir)
 
 Security fixes
 
 * LUCENE-3588: Try harder to prevent SIGSEGV on cloned MMapIndexInputs:
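
To make the invariant concrete, here is a back-of-the-envelope sketch (illustrative only, not code from this commit and not actual IndexWriter internals): a consumer accumulates positions by adding each token's increment to the previous token's position, which starts at -1 because there is no previous term. A first increment of 0 therefore parks the first token at position -1, stacked on a term that does not exist.

public class PositionAccounting {
  public static void main(String[] args) {
    // hypothetical stream: the first token claims posinc=0
    int[] increments = { 0, 1, 1 };
    int position = -1; // no previous term yet
    for (int posInc : increments) {
      position += posInc;
      System.out.println("token at position " + position);
    }
    // prints -1, 0, 1: the first token overlaps a non-existent term at -1
  }
}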

BaseTokenStreamTestCase.java

@@ -157,7 +157,11 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         }
       }
       if (posIncrAtt != null) {
-        assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+        if (i == 0) {
+          assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
+        } else {
+          assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+        }
       }
       if (posLengthAtt != null) {
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
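
As an illustration (not part of this commit), a canned stream whose first token carries an increment of 0 now trips the stricter check above. This sketch assumes the test-framework's CannedTokenStream and the (stream, output, posIncrements) overload of assertTokenStreamContents from this era:

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public void testFirstTokenMayNotOverlap() throws Exception {
  Token broken = new Token("broken", 0, 6);
  broken.setPositionIncrement(0); // claims to overlap the (non-existent) previous term
  TokenStream ts = new CannedTokenStream(broken);
  // fails with "first posIncrement must be >= 1" under the new assertion
  assertTokenStreamContents(ts, new String[] { "broken" }, new int[] { 0 });
}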

FilteringTokenFilter.java

@@ -33,6 +33,7 @@ public abstract class FilteringTokenFilter extends TokenFilter {
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
+  private boolean first = true; // only used when not preserving gaps
 
   public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
     super(input);
@@ -58,6 +59,13 @@ public abstract class FilteringTokenFilter extends TokenFilter {
     } else {
       while (input.incrementToken()) {
         if (accept()) {
+          if (first) {
+            // first token having posinc=0 is illegal.
+            if (posIncrAtt.getPositionIncrement() == 0) {
+              posIncrAtt.setPositionIncrement(1);
+            }
+            first = false;
+          }
           return true;
         }
       }
@@ -66,6 +74,12 @@ public abstract class FilteringTokenFilter extends TokenFilter {
     return false;
   }
 
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    first = true;
+  }
+
   /**
    * @see #setEnablePositionIncrements(boolean)
    */
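
For orientation, a minimal hypothetical subclass (not part of this commit; the import path is an assumption for this era of trunk): subclasses only implement accept(), so they all inherit the first-token correction above. With enablePositionIncrements=false, a surviving token that originally stacked on a dropped token (posinc=0) can become the stream's first token; the new clamp rewrites its increment to 1.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter; // assumed package

// hypothetical filter: drops terms shorter than two characters
public final class MinLengthFilter extends FilteringTokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public MinLengthFilter(boolean enablePositionIncrements, TokenStream in) {
    super(enablePositionIncrements, in);
  }

  @Override
  protected boolean accept() {
    return termAtt.length() >= 2; // keep only terms of length >= 2
  }
}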

WikipediaTokenizer.java

@@ -122,6 +122,8 @@ public final class WikipediaTokenizer extends Tokenizer {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private boolean first;
 
   /**
    * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
    * <code>input</code> to a newly created JFlex scanner.
@@ -209,8 +211,13 @@ public final class WikipediaTokenizer extends Tokenizer {
       //output the untokenized Token first
       collapseAndSaveTokens(tokenType, type);
     }
-    posIncrAtt.setPositionIncrement(scanner.getPositionIncrement());
+    int posinc = scanner.getPositionIncrement();
+    if (first && posinc == 0) {
+      posinc = 1; // don't emit posinc=0 for the first token!
+    }
+    posIncrAtt.setPositionIncrement(posinc);
     typeAtt.setType(type);
+    first = false;
     return true;
   }
@@ -308,6 +315,7 @@ public final class WikipediaTokenizer extends Tokenizer {
     super.reset();
     tokens = null;
     scanner.reset();
+    first = true;
   }
 
   @Override
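
The WikipediaTokenizer change is the same first-token clamp applied inside the tokenizer itself, and the reset() change matters: "first" must be re-armed so a reused tokenizer stays correct. As a sketch of the pattern in isolation, here is a hypothetical pass-through filter (not part of Lucene) that enforces the invariant for any wrapped stream:

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public final class FirstPosIncGuardFilter extends TokenFilter {
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private boolean first = true;

  public FirstPosIncGuardFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (first && posIncAtt.getPositionIncrement() == 0) {
      posIncAtt.setPositionIncrement(1); // the first token may not overlap position -1
    }
    first = false;
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    first = true; // re-arm for stream reuse, mirroring the tokenizer fix
  }
}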

TestStopFilter.java

@@ -17,13 +17,17 @@ package org.apache.lucene.analysis.core;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
@@ -120,4 +124,56 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
       System.out.println(s);
     }
   }
+
+  // stupid filter that inserts synonym of 'hte' for 'the'
+  private class MockSynonymFilter extends TokenFilter {
+    State bufferedState;
+    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+    MockSynonymFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (bufferedState != null) {
+        restoreState(bufferedState);
+        posIncAtt.setPositionIncrement(0);
+        termAtt.setEmpty().append("hte");
+        bufferedState = null;
+        return true;
+      } else if (input.incrementToken()) {
+        if (termAtt.toString().equals("the")) {
+          bufferedState = captureState();
+        }
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      bufferedState = null;
+    }
+  }
+
+  public void testFirstPosInc() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new MockSynonymFilter(tokenizer);
+        StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+        stopfilter.setEnablePositionIncrements(false);
+        return new TokenStreamComponents(tokenizer, stopfilter);
+      }
+    };
+    assertAnalyzesTo(analyzer, "the quick brown fox",
+        new String[] { "hte", "quick", "brown", "fox" },
+        new int[] { 1, 1, 1, 1 });
+  }
 }
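
What this test exercises: MockSynonymFilter stacks "hte" on top of "the" with a position increment of 0, and setEnablePositionIncrements(false) makes the StopFilter drop "the" without leaving a gap. The stacked "hte" thus becomes the first token of the stream; the expected increments of { 1, 1, 1, 1 } confirm that the FilteringTokenFilter change above now clamps its increment from 0 to 1.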

TestSlowSynonymFilter.java

@@ -240,27 +240,27 @@ public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
     assertTokenizesTo(map, tokens("a,5"),
         new String[] { "aa" },
         new int[] { 5 });
-    assertTokenizesTo(map, tokens("a,0"),
-        new String[] { "aa" },
-        new int[] { 0 });
+    assertTokenizesTo(map, tokens("b,1 a,0"),
+        new String[] { "b", "aa" },
+        new int[] { 1, 0 });
 
     // test that offset of first replacement is ignored (always takes the orig offset)
     map.add(strings("b"), tokens("bb,100"), orig, merge);
     assertTokenizesTo(map, tokens("b,5"),
         new String[] { "bb" },
         new int[] { 5 });
-    assertTokenizesTo(map, tokens("b,0"),
-        new String[] { "bb" },
-        new int[] { 0 });
+    assertTokenizesTo(map, tokens("c,1 b,0"),
+        new String[] { "c", "bb" },
+        new int[] { 1, 0 });
 
     // test that subsequent tokens are adjusted accordingly
     map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
     assertTokenizesTo(map, tokens("c,5"),
         new String[] { "cc", "c2" },
         new int[] { 5, 2 });
-    assertTokenizesTo(map, tokens("c,0"),
-        new String[] { "cc", "c2" },
-        new int[] { 0, 2 });
+    assertTokenizesTo(map, tokens("d,1 c,0"),
+        new String[] { "d", "cc", "c2" },
+        new int[] { 1, 0, 2 });
   }
@@ -275,27 +275,27 @@ public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
     assertTokenizesTo(map, tokens("a,5"),
         new String[] { "a", "aa" },
         new int[] { 5, 0 });
-    assertTokenizesTo(map, tokens("a,0"),
-        new String[] { "a", "aa" },
-        new int[] { 0, 0 });
+    assertTokenizesTo(map, tokens("b,1 a,0"),
+        new String[] { "b", "a", "aa" },
+        new int[] { 1, 0, 0 });
 
     // test that offset of first replacement is ignored (always takes the orig offset)
     map.add(strings("b"), tokens("bb,100"), orig, merge);
     assertTokenizesTo(map, tokens("b,5"),
         new String[] { "b", "bb" },
         new int[] { 5, 0 });
-    assertTokenizesTo(map, tokens("b,0"),
-        new String[] { "b", "bb" },
-        new int[] { 0, 0 });
+    assertTokenizesTo(map, tokens("c,1 b,0"),
+        new String[] { "c", "b", "bb" },
+        new int[] { 1, 0, 0 });
 
     // test that subsequent tokens are adjusted accordingly
     map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
     assertTokenizesTo(map, tokens("c,5"),
         new String[] { "c", "cc", "c2" },
         new int[] { 5, 0, 2 });
-    assertTokenizesTo(map, tokens("c,0"),
-        new String[] { "c", "cc", "c2" },
-        new int[] { 0, 0, 2 });
+    assertTokenizesTo(map, tokens("d,1 c,0"),
+        new String[] { "d", "c", "cc", "c2" },
+        new int[] { 1, 0, 0, 2 });
   }
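
These tests rely on a tokens() helper defined elsewhere in this test class, which parses whitespace-separated "term,positionIncrement" pairs. Prepending "b,1", "c,1", or "d,1" keeps the 0-increment token off the front of the stream, so the stricter BaseTokenStreamTestCase check passes while the 0-increment substitution path is still covered. A presumed sketch of the helper's shape (the real implementation may differ):

import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Token;

// presumed shape: "d,1 c,0" -> [Token("d") with posInc=1, Token("c") with posInc=0]
static List<Token> tokens(String str) {
  List<Token> result = new ArrayList<Token>();
  for (String one : str.split(" ")) {
    String[] parts = one.split(",");
    Token t = new Token(parts[0], 0, parts[0].length());
    t.setPositionIncrement(Integer.parseInt(parts[1]));
    result.add(t);
  }
  return result;
}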