diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index d8fbd15d328..d0f4b2b81d9 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -222,7 +222,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
       }
     }
-    assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
+    assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
     ts.end();
     if (finalOffset != null) {
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java
index 298ab96fe8f..9515ae94004 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java
@@ -151,7 +151,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Position> extends TokenFilter {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
new file mode 100644
--- /dev/null
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
@@ -0,0 +1,92 @@
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+
+public final class ValidatingTokenFilter extends TokenFilter {
+
+  private int pos;
+
+  private final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
+  private final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
+
+  // nocommit must be more careful here?  check hasAttribute first...?
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  private final String name;
+
+  /** The name arg is used to identify this stage when
+   *  throwing exceptions (useful if you have more than one
+   *  instance in your chain). */
+  public ValidatingTokenFilter(TokenStream in, String name) {
+    super(in);
+    this.name = name;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+
+    pos += posIncAtt.getPositionIncrement();
+    if (pos == -1) {
+      throw new IllegalStateException("first posInc must be > 0");
+    }
+
+    final int startOffset = offsetAtt.startOffset();
+    final int endOffset = offsetAtt.endOffset();
+
+    final int posLen = posLenAtt.getPositionLength();
+    if (!posToStartOffset.containsKey(pos)) {
+      // First time we've seen a token leaving from this position:
+      posToStartOffset.put(pos, startOffset);
+      System.out.println("  + s " + pos + " -> " + startOffset);
+    } else {
+      // We've seen a token leaving from this position
+      // before; verify the startOffset is the same:
+      System.out.println("  + vs " + pos + " -> " + startOffset);
+      final int oldStartOffset = posToStartOffset.get(pos);
+      if (oldStartOffset != startOffset) {
+        throw new IllegalStateException(name + ": inconsistent startOffset as pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt);
+      }
+    }
+
+    final int endPos = pos + posLen;
+
+    if (!posToEndOffset.containsKey(endPos)) {
+      // First time we've seen a token arriving to this position:
+      posToEndOffset.put(endPos, endOffset);
+      System.out.println("  + e " + endPos + " -> " + endOffset);
+    } else {
+      // We've seen a token arriving to this position
+      // before; verify the endOffset is the same:
+      System.out.println("  + ve " + endPos + " -> " + endOffset);
+      final int oldEndOffset = posToEndOffset.get(endPos);
+      if (oldEndOffset != endOffset) {
+        throw new IllegalStateException(name + ": inconsistent endOffset as pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt);
+      }
+    }
+
+    return true;
+  }
+
+  // TODO: end?  (what to validate?)
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    pos = -1;
+    posToStartOffset.clear();
+    posToEndOffset.clear();
+  }
+}
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
index 464bde05bcc..8ff920a4600 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
@@ -23,9 +23,10 @@ import java.util.LinkedList;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
 
@@ -150,6 +151,7 @@ public final class ShingleFilter extends TokenFilter {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 
@@ -319,6 +321,8 @@ public final class ShingleFilter extends TokenFilter {
           noShingleOutput = false;
         }
         offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
+        // nocommit is this right!?  i'm just guessing...
+        posLenAtt.setPositionLength(builtGramSize);
         isOutputHere = true;
         gramSize.advance();
         tokenAvailable = true;
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index 482c1bc864e..477e0bc16cd 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -34,11 +34,11 @@ import java.util.Collections;
 import java.util.Comparator;
 import java.util.Enumeration;
 import java.util.HashSet;
+import java.util.IdentityHashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.Random;
 import java.util.Set;
-import java.util.Map;
-import java.util.IdentityHashMap;
 import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -52,6 +52,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ValidatingTokenFilter;
 import org.apache.lucene.analysis.charfilter.CharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
@@ -73,8 +74,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.AttributeSource.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.Rethrow;
 import org.apache.lucene.util.Version;
@@ -133,6 +134,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       ) {
         continue;
       }
+
+      if (c == ValidatingTokenFilter.class) {
+        // We insert this one ourselves after each stage...
+        continue;
+      }
+
       for (final Constructor<?> ctor : c.getConstructors()) {
         // don't test deprecated ctors, they likely have known bugs:
         if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) {
@@ -635,6 +642,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       StringBuilder descr = new StringBuilder();
       int numFilters = random.nextInt(5);
       for (int i = 0; i < numFilters; i++) {
+
+        // Insert ValidatingTF after each stage so we can
+        // catch problems right after the TF that "caused"
+        // them:
+        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
+
         while (true) {
           final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
           final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
@@ -645,6 +658,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
           }
         }
       }
+
+      // Insert ValidatingTF after each stage so we can
+      // catch problems right after the TF that "caused"
+      // them:
+      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
+
       spec.toString = descr.toString();
       return spec;
     }