mirror of https://github.com/apache/lucene.git
LUCENE-3848: don't produce tokenstreams that start with posinc=0
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1301478 13f79535-47bb-0310-9956-ffa450edef68
parent 4f8375ded9
commit 3d2d144f92
@@ -781,6 +781,12 @@ Changes in Runtime Behavior
   is multiplied into the norm, previously your boost would be
   silently discarded.  (Tomás Fernández Löbbe, Hoss Man, Robert Muir)
 
+* LUCENE-3848: Fix tokenstreams to not produce a stream with an initial
+  position increment of 0, which is out of bounds (overlapping with a
+  non-existent previous term). Consumers such as IndexWriter and QueryParser
+  still check for and silently correct this situation today, but at some point
+  in the future they may throw an exception.  (Mike McCandless, Robert Muir)
+
 Security fixes
 
 * LUCENE-3588: Try harder to prevent SIGSEGV on cloned MMapIndexInputs:
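The entry above is the heart of the change. For context: a consumer reconstructs absolute positions by summing increments onto a counter that starts at -1, so an initial increment of 0 leaves the first token at position -1, overlapping a term that does not exist. The sketch below is hypothetical consumer code (4.x-era analysis API; the class name and input are mine), not part of this commit:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class PositionReconstructionDemo {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("hello world"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute inc = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    int position = -1; // consumers start one below the first position...
    while (ts.incrementToken()) {
      // ...so a first increment of 0 would leave position at -1: out of bounds.
      position += inc.getPositionIncrement();
      System.out.println(term + " -> position " + position);
    }
    ts.end();
    ts.close();
  }
}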
@@ -157,8 +157,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         }
       }
       if (posIncrAtt != null) {
-        assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+        if (i == 0) {
+          assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
+        } else {
+          assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+        }
       }
       if (posLengthAtt != null) {
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
       }
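A sketch of the kind of stream the new first-position branch rejects; it assumes the Token and CannedTokenStream helpers of the era's test framework, so treat the names as assumptions:

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class TestFirstPosIncCheck extends BaseTokenStreamTestCase {
  public void testFirstTokenMayNotOverlap() throws Exception {
    Token broken = new Token("broken", 0, 6);
    broken.setPositionIncrement(0); // stacks the first token on a non-existent predecessor
    TokenStream ts = new CannedTokenStream(broken);
    // Checking this stream now fails with "first posIncrement must be >= 1":
    // assertTokenStreamContents(ts, new String[] { "broken" }, new int[] { 0 });
  }
}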
@@ -33,6 +33,7 @@ public abstract class FilteringTokenFilter extends TokenFilter {
 
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
+  private boolean first = true; // only used when not preserving gaps
 
   public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
     super(input);
@@ -58,6 +59,13 @@ public abstract class FilteringTokenFilter extends TokenFilter {
     } else {
       while (input.incrementToken()) {
         if (accept()) {
+          if (first) {
+            // first token having posinc=0 is illegal.
+            if (posIncrAtt.getPositionIncrement() == 0) {
+              posIncrAtt.setPositionIncrement(1);
+            }
+            first = false;
+          }
           return true;
         }
       }
@@ -66,6 +74,12 @@ public abstract class FilteringTokenFilter extends TokenFilter {
     return false;
   }
 
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    first = true;
+  }
+
   /**
    * @see #setEnablePositionIncrements(boolean)
    */
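The pattern here (fix up the first accepted token, clear the flag, restore it in reset() so reused streams behave) recurs in WikipediaTokenizer below. As a standalone filter, the same idea might look like this hypothetical sketch (the class name is mine, not from this commit):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/** Hypothetical helper: bumps an illegal first-token increment of 0 up to 1. */
public final class FirstPosIncGuardFilter extends TokenFilter {
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private boolean first = true;

  public FirstPosIncGuardFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (first) {
      if (posIncAtt.getPositionIncrement() == 0) {
        posIncAtt.setPositionIncrement(1); // the first token may not overlap position -1
      }
      first = false;
    }
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    first = true; // clear per-stream state so the filter is safe to reuse
  }
}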
@@ -122,6 +122,8 @@ public final class WikipediaTokenizer extends Tokenizer {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
 
+  private boolean first;
+
   /**
    * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
    * <code>input</code> to a newly created JFlex scanner.
@@ -209,8 +211,13 @@ public final class WikipediaTokenizer extends Tokenizer {
       //output the untokenized Token first
       collapseAndSaveTokens(tokenType, type);
     }
-    posIncrAtt.setPositionIncrement(scanner.getPositionIncrement());
+    int posinc = scanner.getPositionIncrement();
+    if (first && posinc == 0) {
+      posinc = 1; // don't emit posinc=0 for the first token!
+    }
+    posIncrAtt.setPositionIncrement(posinc);
     typeAtt.setType(type);
+    first = false;
     return true;
   }
 
@@ -308,6 +315,7 @@ public final class WikipediaTokenizer extends Tokenizer {
     super.reset();
     tokens = null;
     scanner.reset();
+    first = true;
   }
 
   @Override
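Restoring first = true here matters because tokenizers are reused across documents; without it the guard would only protect the very first stream. A hypothetical spot check (4.x-era API; the input string is made up):

import java.io.StringReader;

import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;

public class WikipediaFirstPosIncCheck {
  public static void main(String[] args) throws Exception {
    WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader("[[Link]] some text"));
    PositionIncrementAttribute inc = tok.addAttribute(PositionIncrementAttribute.class);
    tok.reset();
    if (tok.incrementToken()) {
      // >= 1 is guaranteed by the first-token guard above, on every stream.
      System.out.println("first increment: " + inc.getPositionIncrement());
    }
    tok.end();
    tok.close();
  }
}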
@@ -17,13 +17,17 @@ package org.apache.lucene.analysis.core;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Set;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -120,4 +124,56 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
       System.out.println(s);
     }
   }
+
+  // stupid filter that inserts synonym of 'hte' for 'the'
+  private class MockSynonymFilter extends TokenFilter {
+    State bufferedState;
+    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+    MockSynonymFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (bufferedState != null) {
+        restoreState(bufferedState);
+        posIncAtt.setPositionIncrement(0);
+        termAtt.setEmpty().append("hte");
+        bufferedState = null;
+        return true;
+      } else if (input.incrementToken()) {
+        if (termAtt.toString().equals("the")) {
+          bufferedState = captureState();
+        }
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      bufferedState = null;
+    }
+  }
+
+  public void testFirstPosInc() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new MockSynonymFilter(tokenizer);
+        StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+        stopfilter.setEnablePositionIncrements(false);
+        return new TokenStreamComponents(tokenizer, stopfilter);
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "the quick brown fox",
+                     new String[] { "hte", "quick", "brown", "fox" },
+                     new int[] { 1, 1, 1, 1} );
+  }
 }
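My reading of how the expected values in testFirstPosInc come about, as an informal trace (the "term(posinc)" notation is mine, not from the commit):

// MockTokenizer:        the(1)  quick(1)  brown(1)  fox(1)
// MockSynonymFilter:    the(1)  hte(0)  quick(1)  brown(1)  fox(1)
// StopFilter, no gaps:  hte(0)  quick(1)  brown(1)  fox(1)   -- "the" is dropped,
//                                                               leaving "hte" first
// first-token guard:    hte(1)  quick(1)  brown(1)  fox(1)   -- matches the
//                                                               expected arrays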
@@ -240,27 +240,27 @@ public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
    assertTokenizesTo(map, tokens("a,5"),
        new String[] { "aa" },
        new int[] { 5 });
    assertTokenizesTo(map, tokens("a,0"),
        new String[] { "aa" },
        new int[] { 0 });
    assertTokenizesTo(map, tokens("b,1 a,0"),
        new String[] { "b", "aa" },
        new int[] { 1, 0 });

    // test that offset of first replacement is ignored (always takes the orig offset)
    map.add(strings("b"), tokens("bb,100"), orig, merge);
    assertTokenizesTo(map, tokens("b,5"),
        new String[] { "bb" },
        new int[] { 5 });
    assertTokenizesTo(map, tokens("b,0"),
        new String[] { "bb" },
        new int[] { 0 });
    assertTokenizesTo(map, tokens("c,1 b,0"),
        new String[] { "c", "bb" },
        new int[] { 1, 0 });

    // test that subsequent tokens are adjusted accordingly
    map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
    assertTokenizesTo(map, tokens("c,5"),
        new String[] { "cc", "c2" },
        new int[] { 5, 2 });
    assertTokenizesTo(map, tokens("c,0"),
        new String[] { "cc", "c2" },
        new int[] { 0, 2 });
    assertTokenizesTo(map, tokens("d,1 c,0"),
        new String[] { "d", "cc", "c2" },
        new int[] { 1, 0, 2 });
  }
@@ -275,27 +275,27 @@ public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
    assertTokenizesTo(map, tokens("a,5"),
        new String[] { "a", "aa" },
        new int[] { 5, 0 });
    assertTokenizesTo(map, tokens("a,0"),
        new String[] { "a", "aa" },
        new int[] { 0, 0 });
    assertTokenizesTo(map, tokens("b,1 a,0"),
        new String[] { "b", "a", "aa" },
        new int[] { 1, 0, 0 });

    // test that offset of first replacement is ignored (always takes the orig offset)
    map.add(strings("b"), tokens("bb,100"), orig, merge);
    assertTokenizesTo(map, tokens("b,5"),
        new String[] { "b", "bb" },
        new int[] { 5, 0 });
    assertTokenizesTo(map, tokens("b,0"),
        new String[] { "b", "bb" },
        new int[] { 0, 0 });
    assertTokenizesTo(map, tokens("c,1 b,0"),
        new String[] { "c", "b", "bb" },
        new int[] { 1, 0, 0 });

    // test that subsequent tokens are adjusted accordingly
    map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
    assertTokenizesTo(map, tokens("c,5"),
        new String[] { "c", "cc", "c2" },
        new int[] { 5, 0, 2 });
    assertTokenizesTo(map, tokens("c,0"),
        new String[] { "c", "cc", "c2" },
        new int[] { 0, 0, 2 });
    assertTokenizesTo(map, tokens("d,1 c,0"),
        new String[] { "d", "c", "cc", "c2" },
        new int[] { 1, 0, 0, 2 });
  }
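For readability: tokens() is this test's own fixture helper; judging from the expectations above, it parses "term,positionIncrement" pairs. A rough equivalent using the test framework's CannedTokenStream (assumed names; offsets made up):

Token b = new Token("b", 0, 1);
b.setPositionIncrement(1);
Token a = new Token("a", 0, 1);
a.setPositionIncrement(0); // "a" stacked on "b", synonym-style
TokenStream equivalent = new CannedTokenStream(b, a); // ~ tokens("b,1 a,0")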