mirror of https://github.com/apache/lucene.git
LUCENE-3848: don't produce tokenstreams that start with posinc=0
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1301478 13f79535-47bb-0310-9956-ffa450edef68
commit 3d2d144f92
parent 4f8375ded9
@@ -781,6 +781,12 @@ Changes in Runtime Behavior
   is multiplied into the norm, previously your boost would be
   silently discarded. (Tomás Fernández Löbbe, Hoss Man, Robert Muir)

+* LUCENE-3848: Fix tokenstreams to not produce a stream with an initial
+  position increment of 0: which is out of bounds (overlapping with a
+  non-existent previous term). Consumers such as IndexWriter and QueryParser
+  still check for and silently correct this situation today, but at some point
+  in the future they may throw an exception. (Mike McCandless, Robert Muir)
+
 Security fixes

 * LUCENE-3588: Try harder to prevent SIGSEGV on cloned MMapIndexInputs:
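Note: the entry above says consumers still silently correct an illegal first increment. As a rough illustration of what that correction looks like on the consumer side, here is a minimal sketch in the spirit of the changelog; it is not IndexWriter's actual code, and the ClampingConsumer class and lastPosition method are invented for this example.

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

final class ClampingConsumer {
  // Walks a TokenStream and tracks absolute positions, clamping an
  // out-of-bounds first increment of 0 to 1, the same silent correction
  // the changelog attributes to IndexWriter and QueryParser.
  static int lastPosition(TokenStream stream) throws IOException {
    PositionIncrementAttribute posIncAtt =
        stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset();
    int position = -1; // a first token with posinc=1 lands on position 0
    boolean first = true;
    while (stream.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      if (first && posInc == 0) {
        posInc = 1; // silently correct; may become an exception in the future
      }
      position += posInc;
      first = false;
    }
    stream.end();
    return position;
  }
}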
@@ -157,8 +157,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         }
       }
       if (posIncrAtt != null) {
+        if (i == 0) {
+          assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
+        } else {
           assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+        }
       }
       if (posLengthAtt != null) {
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
       }
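With this change, any component exercised through BaseTokenStreamTestCase now fails fast when its very first token claims to overlap a non-existent predecessor (posinc=0 at i == 0), instead of only enforcing the weaker posIncrement >= 0 bound at every position.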
@@ -33,6 +33,7 @@ public abstract class FilteringTokenFilter extends TokenFilter {

   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
+  private boolean first = true; // only used when not preserving gaps

   public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
     super(input);
@@ -58,6 +59,13 @@ public abstract class FilteringTokenFilter extends TokenFilter {
     } else {
       while (input.incrementToken()) {
         if (accept()) {
+          if (first) {
+            // first token having posinc=0 is illegal.
+            if (posIncrAtt.getPositionIncrement() == 0) {
+              posIncrAtt.setPositionIncrement(1);
+            }
+            first = false;
+          }
           return true;
         }
       }
@@ -66,6 +74,12 @@ public abstract class FilteringTokenFilter extends TokenFilter {
     return false;
   }

+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    first = true;
+  }
+
   /**
    * @see #setEnablePositionIncrements(boolean)
    */
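To see where the new first flag matters, here is a minimal sketch of a FilteringTokenFilter subclass; DropShortTokensFilter is a hypothetical name, and only accept() needs to be supplied. If the first token such a filter emits arrived with a position increment of 0 (for example a synonym stacked on a token that was just dropped), the patched base class rewrites that increment to 1.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class DropShortTokensFilter extends FilteringTokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  DropShortTokensFilter(boolean enablePositionIncrements, TokenStream in) {
    super(enablePositionIncrements, in);
  }

  @Override
  protected boolean accept() {
    return termAtt.length() > 2; // drop one- and two-character tokens
  }
}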
@@ -122,6 +122,8 @@ public final class WikipediaTokenizer extends Tokenizer {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);

+  private boolean first;
+
   /**
    * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
    * <code>input</code> to a newly created JFlex scanner.
@@ -209,8 +211,13 @@ public final class WikipediaTokenizer extends Tokenizer {
       //output the untokenized Token first
       collapseAndSaveTokens(tokenType, type);
     }
-    posIncrAtt.setPositionIncrement(scanner.getPositionIncrement());
+    int posinc = scanner.getPositionIncrement();
+    if (first && posinc == 0) {
+      posinc = 1; // don't emit posinc=0 for the first token!
+    }
+    posIncrAtt.setPositionIncrement(posinc);
     typeAtt.setType(type);
+    first = false;
     return true;
   }

@@ -308,6 +315,7 @@ public final class WikipediaTokenizer extends Tokenizer {
     super.reset();
     tokens = null;
     scanner.reset();
+    first = true;
   }

   @Override
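A quick way to observe the guarantee the three hunks above establish; this is a hedged sketch, assuming the single-Reader constructor implied by the javadoc quoted above, with an arbitrary input string.

import java.io.StringReader;

import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;

public class FirstPosIncCheck {
  public static void main(String[] args) throws Exception {
    WikipediaTokenizer tokenizer =
        new WikipediaTokenizer(new StringReader("[[Main Page]] some text"));
    PositionIncrementAttribute posIncAtt =
        tokenizer.addAttribute(PositionIncrementAttribute.class);
    tokenizer.reset();
    if (tokenizer.incrementToken()) {
      // After the fix this always prints true, and still does after reset().
      System.out.println(posIncAtt.getPositionIncrement() >= 1);
    }
    tokenizer.end();
    tokenizer.close();
  }
}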
@@ -17,13 +17,17 @@ package org.apache.lucene.analysis.core;
  */

 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Set;

+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -120,4 +124,56 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
       System.out.println(s);
     }
   }
+
+  // stupid filter that inserts synonym of 'hte' for 'the'
+  private class MockSynonymFilter extends TokenFilter {
+    State bufferedState;
+    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+    MockSynonymFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (bufferedState != null) {
+        restoreState(bufferedState);
+        posIncAtt.setPositionIncrement(0);
+        termAtt.setEmpty().append("hte");
+        bufferedState = null;
+        return true;
+      } else if (input.incrementToken()) {
+        if (termAtt.toString().equals("the")) {
+          bufferedState = captureState();
+        }
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      bufferedState = null;
+    }
+  }
+
+  public void testFirstPosInc() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new MockSynonymFilter(tokenizer);
+        StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+        stopfilter.setEnablePositionIncrements(false);
+        return new TokenStreamComponents(tokenizer, stopfilter);
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "the quick brown fox",
+                     new String[] { "hte", "quick", "brown", "fox" },
+                     new int[] { 1, 1, 1, 1} );
+  }
 }
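How the test works: MockSynonymFilter stacks the synonym "hte" on each "the" with a position increment of 0; StopFilter, with position increments disabled, then discards "the" itself, leaving "hte" as the stream's first token. Before the FilteringTokenFilter fix its increment would have stayed 0; the expected int[] { 1, 1, 1, 1 } asserts that it is now corrected to 1.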
@@ -240,27 +240,27 @@ public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
     assertTokenizesTo(map, tokens("a,5"),
         new String[] { "aa" },
         new int[] { 5 });
-    assertTokenizesTo(map, tokens("a,0"),
-        new String[] { "aa" },
-        new int[] { 0 });
+    assertTokenizesTo(map, tokens("b,1 a,0"),
+        new String[] { "b", "aa" },
+        new int[] { 1, 0 });

     // test that offset of first replacement is ignored (always takes the orig offset)
     map.add(strings("b"), tokens("bb,100"), orig, merge);
     assertTokenizesTo(map, tokens("b,5"),
         new String[] { "bb" },
         new int[] { 5 });
-    assertTokenizesTo(map, tokens("b,0"),
-        new String[] { "bb" },
-        new int[] { 0 });
+    assertTokenizesTo(map, tokens("c,1 b,0"),
+        new String[] { "c", "bb" },
+        new int[] { 1, 0 });

     // test that subsequent tokens are adjusted accordingly
     map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
     assertTokenizesTo(map, tokens("c,5"),
         new String[] { "cc", "c2" },
         new int[] { 5, 2 });
-    assertTokenizesTo(map, tokens("c,0"),
-        new String[] { "cc", "c2" },
-        new int[] { 0, 2 });
+    assertTokenizesTo(map, tokens("d,1 c,0"),
+        new String[] { "d", "cc", "c2" },
+        new int[] { 1, 0, 2 });
   }

@@ -275,27 +275,27 @@ public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
     assertTokenizesTo(map, tokens("a,5"),
         new String[] { "a", "aa" },
         new int[] { 5, 0 });
-    assertTokenizesTo(map, tokens("a,0"),
-        new String[] { "a", "aa" },
-        new int[] { 0, 0 });
+    assertTokenizesTo(map, tokens("b,1 a,0"),
+        new String[] { "b", "a", "aa" },
+        new int[] { 1, 0, 0 });

     // test that offset of first replacement is ignored (always takes the orig offset)
     map.add(strings("b"), tokens("bb,100"), orig, merge);
     assertTokenizesTo(map, tokens("b,5"),
         new String[] { "b", "bb" },
         new int[] { 5, 0 });
-    assertTokenizesTo(map, tokens("b,0"),
-        new String[] { "b", "bb" },
-        new int[] { 0, 0 });
+    assertTokenizesTo(map, tokens("c,1 b,0"),
+        new String[] { "c", "b", "bb" },
+        new int[] { 1, 0, 0 });

     // test that subsequent tokens are adjusted accordingly
     map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
     assertTokenizesTo(map, tokens("c,5"),
         new String[] { "c", "cc", "c2" },
         new int[] { 5, 0, 2 });
-    assertTokenizesTo(map, tokens("c,0"),
-        new String[] { "c", "cc", "c2" },
-        new int[] { 0, 0, 2 });
+    assertTokenizesTo(map, tokens("d,1 c,0"),
+        new String[] { "d", "c", "cc", "c2" },
+        new int[] { 1, 0, 0, 2 });
   }

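For reference, the tokens("...") helper in these tests appears to encode each token as term,positionIncrement, so tokens("b,1 a,0") is "b" at a fresh position with "a" stacked on top of it. The updated expectations still contain posinc=0 tokens; they just no longer put one in the stream's first slot, matching the new invariant.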