LUCENE-8191: if a tokenstream has broken offsets, it's broken. IndexWriter always checks, so a separate whitelist can't work

Robert Muir 2018-03-04 11:23:45 -05:00
parent b83d203372
commit 97299ed006
2 changed files with 22 additions and 70 deletions
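
For context: "broken offsets" means a token stream that emits a token whose startOffset goes backwards relative to an earlier token. Because IndexWriter validates offsets on every indexed token stream, a test-side whitelist of known offset offenders cannot make such streams acceptable, so this commit deletes the separate brokenOffsetsConstructors whitelist and folds its entries into brokenConstructors. A minimal sketch of the failure mode, not part of this commit: it assumes the Lucene test-framework classes CannedTokenStream and Token, the demo class name is made up, and the exact exception message may differ by version.

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class BrokenOffsetsDemo {
  public static void main(String[] args) throws Exception {
    // The second token starts at offset 0 after a token that started at 4:
    // offsets go backwards, which IndexWriter rejects at addDocument time.
    Token t1 = new Token("foo", 4, 7);
    Token t2 = new Token("bar", 0, 3);
    try (Directory dir = new RAMDirectory();
         IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) {
      Document doc = new Document();
      // TextField can carry a pre-built TokenStream, so no analyzer runs here.
      doc.add(new TextField("body", new CannedTokenStream(t1, t2)));
      w.addDocument(doc); // expected: IllegalArgumentException mentioning backwards offsets
    }
  }
}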

TestRandomChains.java

@@ -56,8 +56,6 @@ import org.apache.lucene.analysis.CharArrayMap;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.CrankyTokenFilter;
-import org.apache.lucene.analysis.MockGraphTokenFilter;
-import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
@@ -147,15 +145,27 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
     });
     for (Class<?> c : Arrays.<Class<?>>asList(
-        // TODO: can we promote some of these to be only
-        // offsets offenders?
         // doesn't actual reset itself!
         CachingTokenFilter.class,
+        // LUCENE-8092: doesn't handle graph inputs
+        CJKBigramFilter.class,
+        // TODO: LUCENE-4983
+        CommonGramsFilter.class,
+        // TODO: doesn't handle graph inputs
+        CommonGramsQueryFilter.class,
         // Not broken, simulates brokenness:
         CrankyTokenFilter.class,
+        // TODO: doesn't handle graph inputs (or even look at positionIncrement)
+        HyphenatedWordsFilter.class,
+        // broken offsets
+        PathHierarchyTokenizer.class,
+        // broken offsets
+        ReversePathHierarchyTokenizer.class,
         // Not broken: we forcefully add this, so we shouldn't
         // also randomly pick it:
         ValidatingTokenFilter.class,
+        // TODO: it seems to mess up offsets!?
+        WikipediaTokenizer.class,
         // TODO: needs to be a tokenizer, doesnt handle graph inputs properly (a shingle or similar following will then cause pain)
         WordDelimiterFilter.class,
         // Cannot correct offsets when a char filter had changed them:
@@ -174,33 +184,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       }
     }
-  // TODO: also fix these and remove (maybe):
-  // Classes/options that don't produce consistent graph offsets:
-  private static final Map<Constructor<?>,Predicate<Object[]>> brokenOffsetsConstructors = new HashMap<>();
-  static {
-    try {
-      for (Class<?> c : Arrays.<Class<?>>asList(
-          ReversePathHierarchyTokenizer.class,
-          PathHierarchyTokenizer.class,
-          // TODO: it seems to mess up offsets!?
-          WikipediaTokenizer.class,
-          // TODO: doesn't handle graph inputs
-          CJKBigramFilter.class,
-          // TODO: doesn't handle graph inputs (or even look at positionIncrement)
-          HyphenatedWordsFilter.class,
-          // TODO: LUCENE-4983
-          CommonGramsFilter.class,
-          // TODO: doesn't handle graph inputs
-          CommonGramsQueryFilter.class)) {
-        for (Constructor<?> ctor : c.getConstructors()) {
-          brokenOffsetsConstructors.put(ctor, ALWAYS);
-        }
-      }
-    } catch (Exception e) {
-      throw new Error(e);
-    }
-  }
   @BeforeClass
   public static void beforeClass() throws Exception {
     List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
@@ -584,20 +567,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       this.seed = seed;
     }
-    public boolean offsetsAreCorrect() {
-      // TODO: can we not do the full chain here!?
-      Random random = new Random(seed);
-      TokenizerSpec tokenizerSpec = newTokenizer(random);
-      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
-      return filterSpec.offsetsAreCorrect;
-    }
     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
       Random random = new Random(seed);
       TokenizerSpec tokenizerSpec = newTokenizer(random);
       //System.out.println("seed=" + seed + ",create tokenizer=" + tokenizerSpec.toString);
-      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
       //System.out.println("seed=" + seed + ",create filter=" + filterSpec.toString);
       return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
     }
@@ -622,13 +597,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       sb.append("\n");
       sb.append("tokenizer=");
       sb.append(tokenizerSpec.toString);
-      TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+      TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
       sb.append("\n");
       sb.append("filters=");
       sb.append(tokenFilterSpec.toString);
-      sb.append("\n");
-      sb.append("offsetsAreCorrect=");
-      sb.append(tokenFilterSpec.offsetsAreCorrect);
       return sb.toString();
     }
@@ -669,11 +641,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       return pred != null && pred.test(args);
     }
-    private boolean brokenOffsets(Constructor<?> ctor, Object[] args) {
-      final Predicate<Object[]> pred = brokenOffsetsConstructors.get(ctor);
-      return pred != null && pred.test(args);
-    }
     // create a new random tokenizer from classpath
     private TokenizerSpec newTokenizer(Random random) {
       TokenizerSpec spec = new TokenizerSpec();
@@ -686,7 +653,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         }
         spec.tokenizer = createComponent(ctor, args, descr);
         if (spec.tokenizer != null) {
-          spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
           spec.toString = descr.toString();
         }
       }
@@ -716,9 +682,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       return spec;
     }
-    private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer, boolean offsetsAreCorrect) {
+    private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
       TokenFilterSpec spec = new TokenFilterSpec();
-      spec.offsetsAreCorrect = offsetsAreCorrect;
       spec.stream = tokenizer;
       StringBuilder descr = new StringBuilder();
       int numFilters = random.nextInt(5);
@@ -727,26 +692,17 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         // Insert ValidatingTF after each stage so we can
         // catch problems right after the TF that "caused"
         // them:
-        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i, spec.offsetsAreCorrect);
+        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
         while (true) {
           final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
-          // hack: MockGraph/MockLookahead has assertions that will trip if they follow
-          // an offsets violator. so we cant use them after e.g. wikipediatokenizer
-          if (!spec.offsetsAreCorrect &&
-              (ctor.getDeclaringClass().equals(MockGraphTokenFilter.class)
-               || ctor.getDeclaringClass().equals(MockRandomLookaheadTokenFilter.class))) {
-            continue;
-          }
           final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
           if (broken(ctor, args)) {
             continue;
           }
           final TokenFilter flt = createComponent(ctor, args, descr);
           if (flt != null) {
-            spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
             spec.stream = flt;
             break;
           }
@@ -756,7 +712,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       // Insert ValidatingTF after each stage so we can
       // catch problems right after the TF that "caused"
       // them:
-      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage", spec.offsetsAreCorrect);
+      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
      spec.toString = descr.toString();
      return spec;
@@ -829,13 +785,11 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
   static class TokenizerSpec {
     Tokenizer tokenizer;
     String toString;
-    boolean offsetsAreCorrect = true;
   }
   static class TokenFilterSpec {
     TokenStream stream;
     String toString;
-    boolean offsetsAreCorrect = true;
   }
   static class CharFilterSpec {

ValidatingTokenFilter.java

@@ -49,17 +49,15 @@ public final class ValidatingTokenFilter extends TokenFilter {
   private final PositionLengthAttribute posLenAtt = getAttribute(PositionLengthAttribute.class);
   private final OffsetAttribute offsetAtt = getAttribute(OffsetAttribute.class);
   private final CharTermAttribute termAtt = getAttribute(CharTermAttribute.class);
-  private final boolean offsetsAreCorrect;
   private final String name;
   /** The name arg is used to identify this stage when
    *  throwing exceptions (useful if you have more than one
    *  instance in your chain). */
-  public ValidatingTokenFilter(TokenStream in, String name, boolean offsetsAreCorrect) {
+  public ValidatingTokenFilter(TokenStream in, String name) {
     super(in);
     this.name = name;
-    this.offsetsAreCorrect = offsetsAreCorrect;
   }
   @Override
@@ -85,7 +83,7 @@ public final class ValidatingTokenFilter extends TokenFilter {
       startOffset = offsetAtt.startOffset();
       endOffset = offsetAtt.endOffset();
-      if (offsetsAreCorrect && offsetAtt.startOffset() < lastStartOffset) {
+      if (offsetAtt.startOffset() < lastStartOffset) {
         throw new IllegalStateException(name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
       }
       lastStartOffset = offsetAtt.startOffset();
@@ -93,7 +91,7 @@ public final class ValidatingTokenFilter extends TokenFilter {
     posLen = posLenAtt == null ? 1 : posLenAtt.getPositionLength();
-    if (offsetAtt != null && posIncAtt != null && offsetsAreCorrect) {
+    if (offsetAtt != null && posIncAtt != null) {
       if (!posToStartOffset.containsKey(pos)) {
         // First time we've seen a token leaving from this position:
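
With the offsetsAreCorrect flag gone, ValidatingTokenFilter enforces the offset contract unconditionally, so a random chain now fails at the exact stage that corrupted the offsets instead of that stage being excused by a whitelist. A hypothetical standalone use, not from this commit: the input mirrors the sketch above, the demo class name is made up, and the expected message follows the IllegalStateException thrown in this file.

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ValidatingTokenFilter;

public class ValidatingDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new CannedTokenStream(
        new Token("foo", 4, 7),
        new Token("bar", 0, 3)); // startOffset goes backwards: 0 < 4
    // Wrap the stream exactly as TestRandomChains does between stages:
    ts = new ValidatingTokenFilter(ts, "demo stage");
    ts.reset();
    // The second incrementToken() should throw IllegalStateException:
    // "demo stage: offsets must not go backwards startOffset=0 is < lastStartOffset=4"
    while (ts.incrementToken()) {
      // consume tokens; validation happens as a side effect
    }
    ts.end();
    ts.close();
  }
}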