LUCENE-3969: validate after each analysis stage; tenatively add posLen to ShingleFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311373 13f79535-47bb-0310-9956-ffa450edef68
2012-04-09 19:05:47 +00:00 · 2012-04-09 19:05:47 +00:00 · ad5c89b1b1
parent f6f8e38cfa
commit ad5c89b1b1
5 changed files with 147 additions and 7 deletions
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@ -222,7 +222,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
        assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
      }
    }
-    assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
+    assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
    ts.end();
    if (finalOffset != null) {
      assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java
@ -151,7 +151,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
        startPosData.startOffset = startOffset;
      } else {
        // Make sure our input isn't messing up offsets:
-        assert startPosData.startOffset == startOffset;
+        assert startPosData.startOffset == startOffset: "prev startOffset=" + startPosData.startOffset + " vs new startOffset=" + startOffset + " inputPos=" + inputPos;
      }

      final int endOffset = offsetAtt.endOffset();
@ -159,7 +159,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
        endPosData.endOffset = endOffset;
      } else {
        // Make sure our input isn't messing up offsets:
-        assert endPosData.endOffset == endOffset;
+        assert endPosData.endOffset == endOffset: "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset + " inputPos=" + inputPos;
      }

      tokenPending = true;
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
@ -0,0 +1,117 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+
+// nocommit better name...?
+
+// nocommit BTSTC should just append this to the chain
+// instead of checking itself:
+
+/** A TokenFilter that checks consistency of the tokens (eg
+ *  offsets are consistent with one another). */
+public final class ValidatingTokenFilter extends TokenFilter {
+
+  private int pos;
+
+  // Maps position to the start/end offset:
+  private final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
+  private final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
+
+  // nocommit must be more careful here?  check hasAttribute first...?
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  private final String name;
+
+  /** The name arg is used to identify this stage when
+   *  throwing exceptions (useful if you have more than one
+   *  instance in your chain). */
+  public ValidatingTokenFilter(TokenStream in, String name) {
+    super(in);
+    this.name = name;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+
+    pos += posIncAtt.getPositionIncrement();
+    if (pos == -1) {
+      throw new IllegalStateException("first posInc must be > 0");
+    }
+
+    final int startOffset = offsetAtt.startOffset();
+    final int endOffset = offsetAtt.endOffset();
+
+    final int posLen = posLenAtt.getPositionLength();
+    if (!posToStartOffset.containsKey(pos)) {
+      // First time we've seen a token leaving from this position:
+      posToStartOffset.put(pos, startOffset);
+      System.out.println("  + s " + pos + " -> " + startOffset);
+    } else {
+      // We've seen a token leaving from this position
+      // before; verify the startOffset is the same:
+      System.out.println("  + vs " + pos + " -> " + startOffset);
+      final int oldStartOffset = posToStartOffset.get(pos);
+      if (oldStartOffset != startOffset) {
+        throw new IllegalStateException(name + ": inconsistent startOffset as pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt);
+      }
+    }
+
+    final int endPos = pos + posLen;
+
+    if (!posToEndOffset.containsKey(endPos)) {
+      // First time we've seen a token arriving to this position:
+      posToEndOffset.put(endPos, endOffset);
+      System.out.println("  + e " + endPos + " -> " + endOffset);
+    } else {
+      // We've seen a token arriving to this position
+      // before; verify the endOffset is the same:
+      System.out.println("  + ve " + endPos + " -> " + endOffset);
+      final int oldEndOffset = posToEndOffset.get(endPos);
+      if (oldEndOffset != endOffset) {
+        throw new IllegalStateException(name + ": inconsistent endOffset as pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt);
+      }
+    }
+
+    return true;
+  }
+
+  // TODO: end?  (what to validate?)
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    pos = -1;
+    posToStartOffset.clear();
+    posToEndOffset.clear();
+  }
+}
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
@ -23,9 +23,10 @@ import java.util.LinkedList;

 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;

@ -150,6 +151,7 @@ public final class ShingleFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);


@ -319,6 +321,8 @@ public final class ShingleFilter extends TokenFilter {
          noShingleOutput = false;
        }
        offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
+        // nocommit is this right!?  i'm just guessing...
+        posLenAtt.setPositionLength(builtGramSize);
        isOutputHere = true;
        gramSize.advance();
        tokenAvailable = true;
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@ -34,11 +34,11 @@ import java.util.Collections;
 import java.util.Comparator;
 import java.util.Enumeration;
 import java.util.HashSet;
+import java.util.IdentityHashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.Random;
 import java.util.Set;
-import java.util.Map;
-import java.util.IdentityHashMap;
 import java.util.regex.Pattern;

 import org.apache.lucene.analysis.Analyzer;
@ -52,6 +52,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ValidatingTokenFilter;
 import org.apache.lucene.analysis.charfilter.CharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
@ -73,8 +74,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.AttributeSource.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.Rethrow;
 import org.apache.lucene.util.Version;
@ -133,6 +134,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
      ) {
        continue;
      }
+
+      if (c == ValidatingTokenFilter.class) {
+        // We insert this one ourselves after each stage...
+        continue;
+      }
+
      for (final Constructor<?> ctor : c.getConstructors()) {
        // don't test deprecated ctors, they likely have known bugs:
        if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) {
@ -635,6 +642,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
      StringBuilder descr = new StringBuilder();
      int numFilters = random.nextInt(5);
      for (int i = 0; i < numFilters; i++) {
+
+        // Insert ValidatingTF after each stage so we can
+        // catch problems right after the TF that "caused"
+        // them:
+        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
+
        while (true) {
          final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
          final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
@ -645,6 +658,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
          }
        }
      }
+
+      // Insert ValidatingTF after each stage so we can
+      // catch problems right after the TF that "caused"
+      // them:
+      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
+
      spec.toString = descr.toString();
      return spec;
    }