LUCENE-8191: if a tokenstream has broken offsets, it's broken. IndexWriter always checks, so a separate whitelist can't work
commit 97299ed006
parent b83d203372
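Background for the change below, sketched rather than quoted: by this point in the 7.x line, IndexWriter rejects backwards token offsets for every indexed text field, whether or not the field itself indexes offsets, so a TokenStream that produces them is unusable no matter what a test-side whitelist says. A minimal sketch of that failure mode; the class names (BackwardsOffsetsTokenizer, BrokenOffsetsDemo) are invented for illustration, and 7.x-era APIs such as RAMDirectory are assumed:

```java
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.RAMDirectory;

/** Hypothetical tokenizer that emits two tokens whose offsets go backwards. */
final class BackwardsOffsetsTokenizer extends Tokenizer {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private int count;

  @Override
  public boolean incrementToken() {
    if (count == 2) {
      return false;
    }
    clearAttributes();
    if (count == 0) {
      termAtt.append("aaa");
      offsetAtt.setOffset(5, 8);   // first token: offsets 5..8
    } else {
      termAtt.append("bbb");
      offsetAtt.setOffset(0, 3);   // second token starts BEFORE the first: broken
    }
    count++;
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    count = 0;
  }
}

public class BrokenOffsetsDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new BackwardsOffsetsTokenizer());
      }
    };
    try (IndexWriter w = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(analyzer))) {
      Document doc = new Document();
      doc.add(new TextField("body", "input text is ignored by the demo tokenizer", Field.Store.NO));
      // Throws IllegalArgumentException ("... offsets must not go backwards ..."),
      // even though this field does not index offsets.
      w.addDocument(doc);
    }
  }
}
```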
TestRandomChains.java:

@@ -56,8 +56,6 @@ import org.apache.lucene.analysis.CharArrayMap;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.CrankyTokenFilter;
-import org.apache.lucene.analysis.MockGraphTokenFilter;
-import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
@@ -147,15 +145,27 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
          return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
        });
    for (Class<?> c : Arrays.<Class<?>>asList(
-        // TODO: can we promote some of these to be only
-        // offsets offenders?
        // doesn't actual reset itself!
        CachingTokenFilter.class,
+        // LUCENE-8092: doesn't handle graph inputs
+        CJKBigramFilter.class,
+        // TODO: LUCENE-4983
+        CommonGramsFilter.class,
+        // TODO: doesn't handle graph inputs
+        CommonGramsQueryFilter.class,
        // Not broken, simulates brokenness:
        CrankyTokenFilter.class,
+        // TODO: doesn't handle graph inputs (or even look at positionIncrement)
+        HyphenatedWordsFilter.class,
+        // broken offsets
+        PathHierarchyTokenizer.class,
+        // broken offsets
+        ReversePathHierarchyTokenizer.class,
        // Not broken: we forcefully add this, so we shouldn't
        // also randomly pick it:
        ValidatingTokenFilter.class,
+        // TODO: it seems to mess up offsets!?
+        WikipediaTokenizer.class,
        // TODO: needs to be a tokenizer, doesnt handle graph inputs properly (a shingle or similar following will then cause pain)
        WordDelimiterFilter.class,
        // Cannot correct offsets when a char filter had changed them:
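Review note: the classes added to this exclusion list were previously tracked in the separate brokenOffsetsConstructors whitelist, which the next hunk deletes. They are merged alphabetically into the one list, so offsets offenders are now skipped outright rather than merely flagged as having incorrect offsets.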
@@ -174,33 +184,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
      }
    }
 
-  // TODO: also fix these and remove (maybe):
-  // Classes/options that don't produce consistent graph offsets:
-  private static final Map<Constructor<?>,Predicate<Object[]>> brokenOffsetsConstructors = new HashMap<>();
-  static {
-    try {
-      for (Class<?> c : Arrays.<Class<?>>asList(
-          ReversePathHierarchyTokenizer.class,
-          PathHierarchyTokenizer.class,
-          // TODO: it seems to mess up offsets!?
-          WikipediaTokenizer.class,
-          // TODO: doesn't handle graph inputs
-          CJKBigramFilter.class,
-          // TODO: doesn't handle graph inputs (or even look at positionIncrement)
-          HyphenatedWordsFilter.class,
-          // TODO: LUCENE-4983
-          CommonGramsFilter.class,
-          // TODO: doesn't handle graph inputs
-          CommonGramsQueryFilter.class)) {
-        for (Constructor<?> ctor : c.getConstructors()) {
-          brokenOffsetsConstructors.put(ctor, ALWAYS);
-        }
-      }
-    } catch (Exception e) {
-      throw new Error(e);
-    }
-  }
-
   @BeforeClass
   public static void beforeClass() throws Exception {
     List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
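After this removal a single map drives all exclusions. A condensed sketch of the surviving pattern, reassembled from the unchanged parts of the test visible in later hunks; the map name brokenConstructors is assumed here (the test's pre-existing general whitelist), and ALWAYS is its catch-all predicate:

```java
// One exclusion map instead of two: a constructor either produces a usable
// component or it is never added to a random chain at all.
private static final Map<Constructor<?>, Predicate<Object[]>> brokenConstructors = new HashMap<>();

private boolean broken(Constructor<?> ctor, Object[] args) {
  final Predicate<Object[]> pred = brokenConstructors.get(ctor);
  return pred != null && pred.test(args);
}
```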
@@ -584,20 +567,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
      this.seed = seed;
    }
 
-    public boolean offsetsAreCorrect() {
-      // TODO: can we not do the full chain here!?
-      Random random = new Random(seed);
-      TokenizerSpec tokenizerSpec = newTokenizer(random);
-      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
-      return filterSpec.offsetsAreCorrect;
-    }
-
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Random random = new Random(seed);
      TokenizerSpec tokenizerSpec = newTokenizer(random);
      //System.out.println("seed=" + seed + ",create tokenizer=" + tokenizerSpec.toString);
-      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
      //System.out.println("seed=" + seed + ",create filter=" + filterSpec.toString);
      return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
    }
@@ -622,13 +597,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
      sb.append("\n");
      sb.append("tokenizer=");
      sb.append(tokenizerSpec.toString);
-      TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+      TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
      sb.append("\n");
      sb.append("filters=");
      sb.append(tokenFilterSpec.toString);
-      sb.append("\n");
-      sb.append("offsetsAreCorrect=");
-      sb.append(tokenFilterSpec.offsetsAreCorrect);
      return sb.toString();
    }
 
@@ -669,11 +641,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
      return pred != null && pred.test(args);
    }
 
-    private boolean brokenOffsets(Constructor<?> ctor, Object[] args) {
-      final Predicate<Object[]> pred = brokenOffsetsConstructors.get(ctor);
-      return pred != null && pred.test(args);
-    }
-
    // create a new random tokenizer from classpath
    private TokenizerSpec newTokenizer(Random random) {
      TokenizerSpec spec = new TokenizerSpec();
@@ -686,7 +653,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
        }
        spec.tokenizer = createComponent(ctor, args, descr);
        if (spec.tokenizer != null) {
-          spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
          spec.toString = descr.toString();
        }
      }
@@ -716,9 +682,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
      return spec;
    }
 
-    private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer, boolean offsetsAreCorrect) {
+    private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
      TokenFilterSpec spec = new TokenFilterSpec();
-      spec.offsetsAreCorrect = offsetsAreCorrect;
      spec.stream = tokenizer;
      StringBuilder descr = new StringBuilder();
      int numFilters = random.nextInt(5);
@@ -727,26 +692,17 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
        // Insert ValidatingTF after each stage so we can
        // catch problems right after the TF that "caused"
        // them:
-        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i, spec.offsetsAreCorrect);
+        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
 
        while (true) {
          final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
 
-          // hack: MockGraph/MockLookahead has assertions that will trip if they follow
-          // an offsets violator. so we cant use them after e.g. wikipediatokenizer
-          if (!spec.offsetsAreCorrect &&
-              (ctor.getDeclaringClass().equals(MockGraphTokenFilter.class)
-                || ctor.getDeclaringClass().equals(MockRandomLookaheadTokenFilter.class))) {
-            continue;
-          }
-
          final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
          if (broken(ctor, args)) {
            continue;
          }
          final TokenFilter flt = createComponent(ctor, args, descr);
          if (flt != null) {
-            spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
            spec.stream = flt;
            break;
          }
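With offsets offenders excluded up front, the MockGraph/MockLookahead special case above becomes dead code. A condensed restatement of the selection loop as it reads after this change (every name appears in the hunk above):

```java
while (true) {
  // pick a random candidate filter constructor
  final Constructor<? extends TokenFilter> ctor =
      tokenfilters.get(random.nextInt(tokenfilters.size()));
  final Object[] args = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
  if (broken(ctor, args)) {
    continue; // excluded outright; offsets offenders never enter a chain
  }
  final TokenFilter flt = createComponent(ctor, args, descr);
  if (flt != null) {
    spec.stream = flt;
    break;
  }
}
```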
@@ -756,7 +712,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
      // Insert ValidatingTF after each stage so we can
      // catch problems right after the TF that "caused"
      // them:
-      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage", spec.offsetsAreCorrect);
+      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
 
      spec.toString = descr.toString();
      return spec;
@@ -829,13 +785,11 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
  static class TokenizerSpec {
    Tokenizer tokenizer;
    String toString;
-    boolean offsetsAreCorrect = true;
  }
 
  static class TokenFilterSpec {
    TokenStream stream;
    String toString;
-    boolean offsetsAreCorrect = true;
  }
 
  static class CharFilterSpec {
ValidatingTokenFilter.java:
@@ -49,17 +49,15 @@ public final class ValidatingTokenFilter extends TokenFilter {
  private final PositionLengthAttribute posLenAtt = getAttribute(PositionLengthAttribute.class);
  private final OffsetAttribute offsetAtt = getAttribute(OffsetAttribute.class);
  private final CharTermAttribute termAtt = getAttribute(CharTermAttribute.class);
-  private final boolean offsetsAreCorrect;
 
  private final String name;
 
  /** The name arg is used to identify this stage when
   * throwing exceptions (useful if you have more than one
   * instance in your chain). */
-  public ValidatingTokenFilter(TokenStream in, String name, boolean offsetsAreCorrect) {
+  public ValidatingTokenFilter(TokenStream in, String name) {
    super(in);
    this.name = name;
-    this.offsetsAreCorrect = offsetsAreCorrect;
  }
 
  @Override
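With the boolean gone, the validator's call sites shrink to stream plus stage name. A minimal usage sketch, assuming the test-framework classes MockTokenizer and ValidatingTokenFilter plus core's LowerCaseFilter; the helper class and method names are invented:

```java
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ValidatingTokenFilter;

final class ValidatedChains {
  // Builds a small chain with a validator after each stage, mirroring how
  // TestRandomChains wires its random chains after this commit.
  static TokenStream newValidatedChain(String text) throws IOException {
    Tokenizer tok = new MockTokenizer();           // whitespace-style test tokenizer
    tok.setReader(new StringReader(text));
    TokenStream ts = new ValidatingTokenFilter(tok, "stage 0");  // no offsetsAreCorrect flag anymore
    ts = new LowerCaseFilter(ts);
    ts = new ValidatingTokenFilter(ts, "last stage");
    return ts;
  }
}
```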
@@ -85,7 +83,7 @@ public final class ValidatingTokenFilter extends TokenFilter {
      startOffset = offsetAtt.startOffset();
      endOffset = offsetAtt.endOffset();
 
-      if (offsetsAreCorrect && offsetAtt.startOffset() < lastStartOffset) {
+      if (offsetAtt.startOffset() < lastStartOffset) {
        throw new IllegalStateException(name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
      }
      lastStartOffset = offsetAtt.startOffset();
@@ -93,7 +91,7 @@ public final class ValidatingTokenFilter extends TokenFilter {
 
      posLen = posLenAtt == null ? 1 : posLenAtt.getPositionLength();
 
-      if (offsetAtt != null && posIncAtt != null && offsetsAreCorrect) {
+      if (offsetAtt != null && posIncAtt != null) {
 
        if (!posToStartOffset.containsKey(pos)) {
          // First time we've seen a token leaving from this position:
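End to end, the now-unconditional check turns any backwards offset into a hard failure at the offending stage. A hedged fragment illustrating the trip (BackwardsOffsetsTokenizer is the invented tokenizer from the first sketch; the message is the one built in the hunk above):

```java
// Assumes BackwardsOffsetsTokenizer from the first sketch.
BackwardsOffsetsTokenizer tok = new BackwardsOffsetsTokenizer();
tok.setReader(new StringReader("ignored by the demo tokenizer"));
TokenStream ts = new ValidatingTokenFilter(tok, "demo");
ts.reset();
try {
  while (ts.incrementToken()) {
    // token 1 leaves offsets 5..8; token 2 then reports startOffset=0
  }
} catch (IllegalStateException expected) {
  // "demo: offsets must not go backwards startOffset=0 is < lastStartOffset=5"
}
ts.close();
```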