From 97299ed00699c248fc38465ee1b0eb0bb1561d3d Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sun, 4 Mar 2018 11:23:45 -0500 Subject: [PATCH] LUCENE-8191: if a tokenstream has broken offsets, its broken. IndexWriter always checks, so a separate whitelist can't work --- .../analysis/core/TestRandomChains.java | 84 +++++-------------- .../analysis/ValidatingTokenFilter.java | 8 +- 2 files changed, 22 insertions(+), 70 deletions(-) diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 406addf4bbc..78392033d25 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -56,8 +56,6 @@ import org.apache.lucene.analysis.CharArrayMap; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.CharFilter; import org.apache.lucene.analysis.CrankyTokenFilter; -import org.apache.lucene.analysis.MockGraphTokenFilter; -import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter; import org.apache.lucene.analysis.MockTokenFilter; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; @@ -147,15 +145,27 @@ public class TestRandomChains extends BaseTokenStreamTestCase { return !((Boolean) args[2]); // args are broken if consumeAllTokens is false }); for (Class c : Arrays.>asList( - // TODO: can we promote some of these to be only - // offsets offenders? // doesn't actual reset itself! CachingTokenFilter.class, + // LUCENE-8092: doesn't handle graph inputs + CJKBigramFilter.class, + // TODO: LUCENE-4983 + CommonGramsFilter.class, + // TODO: doesn't handle graph inputs + CommonGramsQueryFilter.class, // Not broken, simulates brokenness: CrankyTokenFilter.class, + // TODO: doesn't handle graph inputs (or even look at positionIncrement) + HyphenatedWordsFilter.class, + // broken offsets + PathHierarchyTokenizer.class, + // broken offsets + ReversePathHierarchyTokenizer.class, // Not broken: we forcefully add this, so we shouldn't // also randomly pick it: ValidatingTokenFilter.class, + // TODO: it seems to mess up offsets!? + WikipediaTokenizer.class, // TODO: needs to be a tokenizer, doesnt handle graph inputs properly (a shingle or similar following will then cause pain) WordDelimiterFilter.class, // Cannot correct offsets when a char filter had changed them: @@ -174,33 +184,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } } - // TODO: also fix these and remove (maybe): - // Classes/options that don't produce consistent graph offsets: - private static final Map,Predicate> brokenOffsetsConstructors = new HashMap<>(); - static { - try { - for (Class c : Arrays.>asList( - ReversePathHierarchyTokenizer.class, - PathHierarchyTokenizer.class, - // TODO: it seems to mess up offsets!? - WikipediaTokenizer.class, - // TODO: doesn't handle graph inputs - CJKBigramFilter.class, - // TODO: doesn't handle graph inputs (or even look at positionIncrement) - HyphenatedWordsFilter.class, - // TODO: LUCENE-4983 - CommonGramsFilter.class, - // TODO: doesn't handle graph inputs - CommonGramsQueryFilter.class)) { - for (Constructor ctor : c.getConstructors()) { - brokenOffsetsConstructors.put(ctor, ALWAYS); - } - } - } catch (Exception e) { - throw new Error(e); - } - } - @BeforeClass public static void beforeClass() throws Exception { List> analysisClasses = getClassesForPackage("org.apache.lucene.analysis"); @@ -584,20 +567,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase { this.seed = seed; } - public boolean offsetsAreCorrect() { - // TODO: can we not do the full chain here!? - Random random = new Random(seed); - TokenizerSpec tokenizerSpec = newTokenizer(random); - TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect); - return filterSpec.offsetsAreCorrect; - } - @Override protected TokenStreamComponents createComponents(String fieldName) { Random random = new Random(seed); TokenizerSpec tokenizerSpec = newTokenizer(random); //System.out.println("seed=" + seed + ",create tokenizer=" + tokenizerSpec.toString); - TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect); + TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer); //System.out.println("seed=" + seed + ",create filter=" + filterSpec.toString); return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream); } @@ -622,13 +597,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase { sb.append("\n"); sb.append("tokenizer="); sb.append(tokenizerSpec.toString); - TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect); + TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer); sb.append("\n"); sb.append("filters="); sb.append(tokenFilterSpec.toString); - sb.append("\n"); - sb.append("offsetsAreCorrect="); - sb.append(tokenFilterSpec.offsetsAreCorrect); return sb.toString(); } @@ -669,11 +641,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase { return pred != null && pred.test(args); } - private boolean brokenOffsets(Constructor ctor, Object[] args) { - final Predicate pred = brokenOffsetsConstructors.get(ctor); - return pred != null && pred.test(args); - } - // create a new random tokenizer from classpath private TokenizerSpec newTokenizer(Random random) { TokenizerSpec spec = new TokenizerSpec(); @@ -686,7 +653,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } spec.tokenizer = createComponent(ctor, args, descr); if (spec.tokenizer != null) { - spec.offsetsAreCorrect &= !brokenOffsets(ctor, args); spec.toString = descr.toString(); } } @@ -716,9 +682,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase { return spec; } - private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer, boolean offsetsAreCorrect) { + private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) { TokenFilterSpec spec = new TokenFilterSpec(); - spec.offsetsAreCorrect = offsetsAreCorrect; spec.stream = tokenizer; StringBuilder descr = new StringBuilder(); int numFilters = random.nextInt(5); @@ -727,26 +692,17 @@ public class TestRandomChains extends BaseTokenStreamTestCase { // Insert ValidatingTF after each stage so we can // catch problems right after the TF that "caused" // them: - spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i, spec.offsetsAreCorrect); + spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i); while (true) { final Constructor ctor = tokenfilters.get(random.nextInt(tokenfilters.size())); - // hack: MockGraph/MockLookahead has assertions that will trip if they follow - // an offsets violator. so we cant use them after e.g. wikipediatokenizer - if (!spec.offsetsAreCorrect && - (ctor.getDeclaringClass().equals(MockGraphTokenFilter.class) - || ctor.getDeclaringClass().equals(MockRandomLookaheadTokenFilter.class))) { - continue; - } - final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes()); if (broken(ctor, args)) { continue; } final TokenFilter flt = createComponent(ctor, args, descr); if (flt != null) { - spec.offsetsAreCorrect &= !brokenOffsets(ctor, args); spec.stream = flt; break; } @@ -756,7 +712,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { // Insert ValidatingTF after each stage so we can // catch problems right after the TF that "caused" // them: - spec.stream = new ValidatingTokenFilter(spec.stream, "last stage", spec.offsetsAreCorrect); + spec.stream = new ValidatingTokenFilter(spec.stream, "last stage"); spec.toString = descr.toString(); return spec; @@ -829,13 +785,11 @@ public class TestRandomChains extends BaseTokenStreamTestCase { static class TokenizerSpec { Tokenizer tokenizer; String toString; - boolean offsetsAreCorrect = true; } static class TokenFilterSpec { TokenStream stream; String toString; - boolean offsetsAreCorrect = true; } static class CharFilterSpec { diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java index b31b604fb75..7901eea5bf2 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java @@ -49,17 +49,15 @@ public final class ValidatingTokenFilter extends TokenFilter { private final PositionLengthAttribute posLenAtt = getAttribute(PositionLengthAttribute.class); private final OffsetAttribute offsetAtt = getAttribute(OffsetAttribute.class); private final CharTermAttribute termAtt = getAttribute(CharTermAttribute.class); - private final boolean offsetsAreCorrect; private final String name; /** The name arg is used to identify this stage when * throwing exceptions (useful if you have more than one * instance in your chain). */ - public ValidatingTokenFilter(TokenStream in, String name, boolean offsetsAreCorrect) { + public ValidatingTokenFilter(TokenStream in, String name) { super(in); this.name = name; - this.offsetsAreCorrect = offsetsAreCorrect; } @Override @@ -85,7 +83,7 @@ public final class ValidatingTokenFilter extends TokenFilter { startOffset = offsetAtt.startOffset(); endOffset = offsetAtt.endOffset(); - if (offsetsAreCorrect && offsetAtt.startOffset() < lastStartOffset) { + if (offsetAtt.startOffset() < lastStartOffset) { throw new IllegalStateException(name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset); } lastStartOffset = offsetAtt.startOffset(); @@ -93,7 +91,7 @@ public final class ValidatingTokenFilter extends TokenFilter { posLen = posLenAtt == null ? 1 : posLenAtt.getPositionLength(); - if (offsetAtt != null && posIncAtt != null && offsetsAreCorrect) { + if (offsetAtt != null && posIncAtt != null) { if (!posToStartOffset.containsKey(pos)) { // First time we've seen a token leaving from this position: