LUCENE-8191: if a tokenstream has broken offsets, it's broken. IndexWriter always checks, so a separate whitelist can't work

Robert Muir 2018-03-04 11:23:45 -05:00
parent b83d203372
commit 97299ed006
2 changed files with 22 additions and 70 deletions
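
The reasoning behind the change: IndexWriter validates offsets on every token it indexes, so a test-side whitelist of known offset offenders cannot make those chains legal; they fail at index time regardless. Below is a minimal sketch of that behavior, not part of this commit: the demo class and its names are invented for illustration, and it assumes Lucene 7.x-era APIs such as RAMDirectory.

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.FieldType;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexOptions;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.RAMDirectory;

    // Hypothetical demo class, not from the commit.
    public class BackwardsOffsetsDemo {

      // A deliberately broken stream: the second token starts before the first.
      static final class BackwardsOffsetsStream extends TokenStream {
        private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
        private final OffsetAttribute offset = addAttribute(OffsetAttribute.class);
        private int token = 0;

        @Override
        public boolean incrementToken() {
          if (token == 2) {
            return false;
          }
          clearAttributes();
          term.setEmpty().append("tok" + token);
          if (token == 0) {
            offset.setOffset(5, 8);  // first token covers [5, 8)
          } else {
            offset.setOffset(0, 3);  // second token goes backwards: [0, 3)
          }
          token++;
          return true;
        }
      }

      public static void main(String[] args) throws IOException {
        // Index the field with offsets so the writer-side check definitely fires.
        FieldType type = new FieldType(TextField.TYPE_NOT_STORED);
        type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        try (RAMDirectory dir = new RAMDirectory();
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()))) {
          Document doc = new Document();
          doc.add(new Field("body", new BackwardsOffsetsStream(), type));
          try {
            writer.addDocument(doc);
          } catch (IllegalArgumentException expected) {
            // IndexWriter rejects the document: offsets must not go backwards.
            System.out.println("rejected: " + expected.getMessage());
          }
        }
      }
    }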

TestRandomChains.java

@@ -56,8 +56,6 @@ import org.apache.lucene.analysis.CharArrayMap;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.CrankyTokenFilter;
-import org.apache.lucene.analysis.MockGraphTokenFilter;
-import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
@@ -147,15 +145,27 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
       });
       for (Class<?> c : Arrays.<Class<?>>asList(
-          // TODO: can we promote some of these to be only
-          // offsets offenders?
           // doesn't actual reset itself!
           CachingTokenFilter.class,
+          // LUCENE-8092: doesn't handle graph inputs
+          CJKBigramFilter.class,
+          // TODO: LUCENE-4983
+          CommonGramsFilter.class,
+          // TODO: doesn't handle graph inputs
+          CommonGramsQueryFilter.class,
           // Not broken, simulates brokenness:
           CrankyTokenFilter.class,
+          // TODO: doesn't handle graph inputs (or even look at positionIncrement)
+          HyphenatedWordsFilter.class,
+          // broken offsets
+          PathHierarchyTokenizer.class,
+          // broken offsets
+          ReversePathHierarchyTokenizer.class,
           // Not broken: we forcefully add this, so we shouldn't
           // also randomly pick it:
           ValidatingTokenFilter.class,
+          // TODO: it seems to mess up offsets!?
+          WikipediaTokenizer.class,
           // TODO: needs to be a tokenizer, doesnt handle graph inputs properly (a shingle or similar following will then cause pain)
           WordDelimiterFilter.class,
           // Cannot correct offsets when a char filter had changed them:
@@ -174,33 +184,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     }
   }
 
-  // TODO: also fix these and remove (maybe):
-  // Classes/options that don't produce consistent graph offsets:
-  private static final Map<Constructor<?>,Predicate<Object[]>> brokenOffsetsConstructors = new HashMap<>();
-  static {
-    try {
-      for (Class<?> c : Arrays.<Class<?>>asList(
-          ReversePathHierarchyTokenizer.class,
-          PathHierarchyTokenizer.class,
-          // TODO: it seems to mess up offsets!?
-          WikipediaTokenizer.class,
-          // TODO: doesn't handle graph inputs
-          CJKBigramFilter.class,
-          // TODO: doesn't handle graph inputs (or even look at positionIncrement)
-          HyphenatedWordsFilter.class,
-          // TODO: LUCENE-4983
-          CommonGramsFilter.class,
-          // TODO: doesn't handle graph inputs
-          CommonGramsQueryFilter.class)) {
-        for (Constructor<?> ctor : c.getConstructors()) {
-          brokenOffsetsConstructors.put(ctor, ALWAYS);
-        }
-      }
-    } catch (Exception e) {
-      throw new Error(e);
-    }
-  }
-
   @BeforeClass
   public static void beforeClass() throws Exception {
     List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
@@ -584,20 +567,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       this.seed = seed;
     }
 
-    public boolean offsetsAreCorrect() {
-      // TODO: can we not do the full chain here!?
-      Random random = new Random(seed);
-      TokenizerSpec tokenizerSpec = newTokenizer(random);
-      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
-      return filterSpec.offsetsAreCorrect;
-    }
-
     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
       Random random = new Random(seed);
       TokenizerSpec tokenizerSpec = newTokenizer(random);
       //System.out.println("seed=" + seed + ",create tokenizer=" + tokenizerSpec.toString);
-      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
       //System.out.println("seed=" + seed + ",create filter=" + filterSpec.toString);
       return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
     }
@@ -622,13 +597,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       sb.append("\n");
       sb.append("tokenizer=");
       sb.append(tokenizerSpec.toString);
-      TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+      TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
       sb.append("\n");
       sb.append("filters=");
       sb.append(tokenFilterSpec.toString);
-      sb.append("\n");
-      sb.append("offsetsAreCorrect=");
-      sb.append(tokenFilterSpec.offsetsAreCorrect);
       return sb.toString();
     }
@@ -669,11 +641,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     return pred != null && pred.test(args);
   }
 
-  private boolean brokenOffsets(Constructor<?> ctor, Object[] args) {
-    final Predicate<Object[]> pred = brokenOffsetsConstructors.get(ctor);
-    return pred != null && pred.test(args);
-  }
-
   // create a new random tokenizer from classpath
   private TokenizerSpec newTokenizer(Random random) {
     TokenizerSpec spec = new TokenizerSpec();
@@ -686,7 +653,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       }
       spec.tokenizer = createComponent(ctor, args, descr);
       if (spec.tokenizer != null) {
-        spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
         spec.toString = descr.toString();
       }
     }
@@ -716,9 +682,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     return spec;
   }
 
-  private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer, boolean offsetsAreCorrect) {
+  private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
     TokenFilterSpec spec = new TokenFilterSpec();
-    spec.offsetsAreCorrect = offsetsAreCorrect;
     spec.stream = tokenizer;
     StringBuilder descr = new StringBuilder();
     int numFilters = random.nextInt(5);
@@ -727,26 +692,17 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       // Insert ValidatingTF after each stage so we can
       // catch problems right after the TF that "caused"
       // them:
-      spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i, spec.offsetsAreCorrect);
+      spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
 
       while (true) {
         final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
-        // hack: MockGraph/MockLookahead has assertions that will trip if they follow
-        // an offsets violator. so we cant use them after e.g. wikipediatokenizer
-        if (!spec.offsetsAreCorrect &&
-            (ctor.getDeclaringClass().equals(MockGraphTokenFilter.class)
-             || ctor.getDeclaringClass().equals(MockRandomLookaheadTokenFilter.class))) {
-          continue;
-        }
         final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
         if (broken(ctor, args)) {
           continue;
         }
         final TokenFilter flt = createComponent(ctor, args, descr);
         if (flt != null) {
-          spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
           spec.stream = flt;
           break;
         }
@@ -756,7 +712,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     // Insert ValidatingTF after each stage so we can
     // catch problems right after the TF that "caused"
     // them:
-    spec.stream = new ValidatingTokenFilter(spec.stream, "last stage", spec.offsetsAreCorrect);
+    spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
 
     spec.toString = descr.toString();
     return spec;
@@ -829,13 +785,11 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
   static class TokenizerSpec {
     Tokenizer tokenizer;
     String toString;
-    boolean offsetsAreCorrect = true;
   }
 
   static class TokenFilterSpec {
     TokenStream stream;
     String toString;
-    boolean offsetsAreCorrect = true;
   }
 
   static class CharFilterSpec {
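
With the separate offsets whitelist gone, this test keeps exactly one blacklist: the brokenConstructors map consulted through broken(ctor, args) above. An entry can ban a constructor outright or only for particular argument values. A hedged sketch of both registration styles, as they would sit inside the class's static initializer (which already wraps lookups in try/catch); the LimitTokenCountFilter signature is an assumption inferred from the consumeAllTokens comment in the first hunk:

    // Ban only certain argument combinations: LimitTokenCountFilter is assumed to
    // have the constructor (TokenStream in, int maxTokenCount, boolean consumeAllTokens).
    brokenConstructors.put(
        LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
        args -> !((Boolean) args[2])); // broken only when consumeAllTokens == false

    // Ban a class unconditionally, as the list above does for CJKBigramFilter and friends:
    for (Constructor<?> ctor : CJKBigramFilter.class.getConstructors()) {
      brokenConstructors.put(ctor, ALWAYS);
    }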

ValidatingTokenFilter.java

@@ -49,17 +49,15 @@ public final class ValidatingTokenFilter extends TokenFilter {
   private final PositionLengthAttribute posLenAtt = getAttribute(PositionLengthAttribute.class);
   private final OffsetAttribute offsetAtt = getAttribute(OffsetAttribute.class);
   private final CharTermAttribute termAtt = getAttribute(CharTermAttribute.class);
-  private final boolean offsetsAreCorrect;
   private final String name;
 
   /** The name arg is used to identify this stage when
    *  throwing exceptions (useful if you have more than one
    *  instance in your chain). */
-  public ValidatingTokenFilter(TokenStream in, String name, boolean offsetsAreCorrect) {
+  public ValidatingTokenFilter(TokenStream in, String name) {
     super(in);
     this.name = name;
-    this.offsetsAreCorrect = offsetsAreCorrect;
   }
 
   @Override
@@ -85,7 +83,7 @@ public final class ValidatingTokenFilter extends TokenFilter {
       startOffset = offsetAtt.startOffset();
       endOffset = offsetAtt.endOffset();
-      if (offsetsAreCorrect && offsetAtt.startOffset() < lastStartOffset) {
+      if (offsetAtt.startOffset() < lastStartOffset) {
         throw new IllegalStateException(name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
       }
       lastStartOffset = offsetAtt.startOffset();
@@ -93,7 +91,7 @@ public final class ValidatingTokenFilter extends TokenFilter {
     posLen = posLenAtt == null ? 1 : posLenAtt.getPositionLength();
 
-    if (offsetAtt != null && posIncAtt != null && offsetsAreCorrect) {
+    if (offsetAtt != null && posIncAtt != null) {
       if (!posToStartOffset.containsKey(pos)) {
         // First time we've seen a token leaving from this position:
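
After this change ValidatingTokenFilter enforces the offset and position invariants unconditionally, mirroring what IndexWriter would do at index time. A minimal usage fragment, with the two-arg constructor taken from the diff above; the WhitespaceTokenizer, stage name, and input string are arbitrary choices for illustration, and the usual java.io and org.apache.lucene.analysis imports are assumed:

    static void consumeValidated() throws IOException {
      // Wrap any stage of a chain in the validator; the name tags exceptions with the stage.
      Tokenizer tok = new WhitespaceTokenizer();
      tok.setReader(new StringReader("some test input"));
      try (TokenStream ts = new ValidatingTokenFilter(tok, "stage 0")) {
        ts.reset();
        while (ts.incrementToken()) {
          // each token is validated as it arrives; backwards offsets now always
          // throw IllegalStateException, since there is no lenient mode anymore
        }
        ts.end();
      }
    }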