mirror of https://github.com/apache/lucene.git
LUCENE-3969: validate after each analysis stage; tentatively add posLen to ShingleFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311373 13f79535-47bb-0310-9956-ffa450edef68
parent f6f8e38cfa
commit ad5c89b1b1
@@ -222,7 +222,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
       }
     }
-    assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
+    assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
     ts.end();
     if (finalOffset != null) {
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
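For orientation, these assertions live in BaseTokenStreamTestCase.assertTokenStreamContents, which analysis tests call with the expected token data. A typical invocation looks roughly like the sketch below; `analyzer` is assumed to be some Analyzer under test, and the exact set of overloads varies by version:

    TokenStream ts = analyzer.tokenStream("field", new StringReader("please divide"));
    assertTokenStreamContents(ts,
        new String[] {"please", "divide"},  // expected terms
        new int[] {0, 7},                   // expected start offsets
        new int[] {6, 13},                  // expected end offsets
        new int[] {1, 1});                  // expected position increments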
@@ -151,7 +151,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
       startPosData.startOffset = startOffset;
     } else {
       // Make sure our input isn't messing up offsets:
-      assert startPosData.startOffset == startOffset;
+      assert startPosData.startOffset == startOffset: "prev startOffset=" + startPosData.startOffset + " vs new startOffset=" + startOffset + " inputPos=" + inputPos;
     }

     final int endOffset = offsetAtt.endOffset();
@@ -159,7 +159,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
       endPosData.endOffset = endOffset;
     } else {
       // Make sure our input isn't messing up offsets:
-      assert endPosData.endOffset == endOffset;
+      assert endPosData.endOffset == endOffset: "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset + " inputPos=" + inputPos;
     }

     tokenPending = true;
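A note on the assert form used above: Java's two-operand assert evaluates the detail expression only when the condition fails, so these richer messages cost nothing in passing runs, and they fire at all only with assertions enabled (java -ea, which Lucene's test infrastructure typically turns on). For example:

    // the message string is built lazily, only on failure
    assert endPosData.endOffset == endOffset
        : "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset;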
@@ -0,0 +1,117 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+
+// nocommit better name...?
+
+// nocommit BTSTC should just append this to the chain
+// instead of checking itself:
+
+/** A TokenFilter that checks consistency of the tokens (eg
+ *  offsets are consistent with one another). */
+public final class ValidatingTokenFilter extends TokenFilter {
+
+  private int pos;
+
+  // Maps position to the start/end offset:
+  private final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
+  private final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
+
+  // nocommit must be more careful here?  check hasAttribute first...?
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  private final String name;
+
+  /** The name arg is used to identify this stage when
+   *  throwing exceptions (useful if you have more than one
+   *  instance in your chain). */
+  public ValidatingTokenFilter(TokenStream in, String name) {
+    super(in);
+    this.name = name;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+
+    pos += posIncAtt.getPositionIncrement();
+    if (pos == -1) {
+      throw new IllegalStateException("first posInc must be > 0");
+    }
+
+    final int startOffset = offsetAtt.startOffset();
+    final int endOffset = offsetAtt.endOffset();
+
+    final int posLen = posLenAtt.getPositionLength();
+    if (!posToStartOffset.containsKey(pos)) {
+      // First time we've seen a token leaving from this position:
+      posToStartOffset.put(pos, startOffset);
+      System.out.println("  + s " + pos + " -> " + startOffset);
+    } else {
+      // We've seen a token leaving from this position
+      // before; verify the startOffset is the same:
+      System.out.println("  + vs " + pos + " -> " + startOffset);
+      final int oldStartOffset = posToStartOffset.get(pos);
+      if (oldStartOffset != startOffset) {
+        throw new IllegalStateException(name + ": inconsistent startOffset at pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt);
+      }
+    }
+
+    final int endPos = pos + posLen;
+
+    if (!posToEndOffset.containsKey(endPos)) {
+      // First time we've seen a token arriving to this position:
+      posToEndOffset.put(endPos, endOffset);
+      System.out.println("  + e " + endPos + " -> " + endOffset);
+    } else {
+      // We've seen a token arriving to this position
+      // before; verify the endOffset is the same:
+      System.out.println("  + ve " + endPos + " -> " + endOffset);
+      final int oldEndOffset = posToEndOffset.get(endPos);
+      if (oldEndOffset != endOffset) {
+        throw new IllegalStateException(name + ": inconsistent endOffset at pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt);
+      }
+    }
+
+    return true;
+  }
+
+  // TODO: end?  (what to validate?)
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    pos = -1;
+    posToStartOffset.clear();
+    posToEndOffset.clear();
+  }
+}
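To make the new filter's intent concrete, here is a minimal sketch (not part of this commit) of a hand-built chain with a validator after each stage; MockTokenizer and ShingleFilter are just illustrative stand-ins for arbitrary stages:

    import java.io.StringReader;

    import org.apache.lucene.analysis.MockTokenizer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ValidatingTokenFilter;
    import org.apache.lucene.analysis.shingle.ShingleFilter;

    public class ValidateChainDemo {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new MockTokenizer(new StringReader("please divide this sentence"),
                                           MockTokenizer.WHITESPACE, false);
        ts = new ValidatingTokenFilter(ts, "stage 0");  // checks the tokenizer's output
        ts = new ShingleFilter(ts);
        ts = new ValidatingTokenFilter(ts, "stage 1");  // checks the shingled output

        ts.reset();
        while (ts.incrementToken()) {
          // an inconsistent offset or position upstream surfaces here as an
          // IllegalStateException naming the offending stage
        }
        ts.end();
        ts.close();
      }
    }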
@@ -23,9 +23,10 @@ import java.util.LinkedList;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
 
@@ -150,6 +151,7 @@ public final class ShingleFilter extends TokenFilter {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 
 
@@ -319,6 +321,8 @@ public final class ShingleFilter extends TokenFilter {
           noShingleOutput = false;
         }
         offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
+        // nocommit is this right!? i'm just guessing...
+        posLenAtt.setPositionLength(builtGramSize);
         isOutputHere = true;
         gramSize.advance();
         tokenAvailable = true;
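The guess is at least self-consistent: a shingle built from builtGramSize source tokens leaves from the first token's position and arrives builtGramSize positions later, which is exactly what PositionLengthAttribute is meant to record (and what the new ValidatingTokenFilter checks offsets against). A small sketch (illustrative, not from the commit) of inspecting the result:

    import java.io.StringReader;

    import org.apache.lucene.analysis.MockTokenizer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.shingle.ShingleFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

    public class ShinglePosLenDemo {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new MockTokenizer(new StringReader("please divide this"),
                                           MockTokenizer.WHITESPACE, false);
        ts = new ShingleFilter(ts, 2, 2);  // unigrams plus bigrams
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
        ts.reset();
        int pos = -1;
        while (ts.incrementToken()) {
          pos += posIncAtt.getPositionIncrement();
          // Expected with this patch: "please divide" at pos=0 with posLen=2,
          // i.e. the bigram spans the two positions of its source tokens.
          System.out.println(term + " pos=" + pos + " posLen=" + posLenAtt.getPositionLength());
        }
        ts.end();
        ts.close();
      }
    }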
@@ -34,11 +34,11 @@ import java.util.Collections;
 import java.util.Comparator;
 import java.util.Enumeration;
 import java.util.HashSet;
+import java.util.IdentityHashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.Random;
 import java.util.Set;
-import java.util.Map;
-import java.util.IdentityHashMap;
 import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -52,6 +52,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ValidatingTokenFilter;
 import org.apache.lucene.analysis.charfilter.CharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
@@ -73,8 +74,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.AttributeSource.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.Rethrow;
 import org.apache.lucene.util.Version;
@@ -133,6 +134,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
           ) {
         continue;
       }
+
+      if (c == ValidatingTokenFilter.class) {
+        // We insert this one ourselves after each stage...
+        continue;
+      }
+
       for (final Constructor<?> ctor : c.getConstructors()) {
         // don't test deprecated ctors, they likely have known bugs:
         if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) {
@@ -635,6 +642,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       StringBuilder descr = new StringBuilder();
       int numFilters = random.nextInt(5);
       for (int i = 0; i < numFilters; i++) {
+
+        // Insert ValidatingTF after each stage so we can
+        // catch problems right after the TF that "caused"
+        // them:
+        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
+
         while (true) {
           final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
           final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
@@ -645,6 +658,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
           }
         }
       }
+
+      // Insert ValidatingTF after each stage so we can
+      // catch problems right after the TF that "caused"
+      // them:
+      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
+
       spec.toString = descr.toString();
       return spec;
     }
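Net effect on a generated chain: validators interleave with the randomly chosen filters, so a bad offset or position is flagged by the validator sitting immediately downstream of the filter that produced it. Roughly:

    Tokenizer
      -> ValidatingTokenFilter("stage 0")
      -> random TokenFilter #0
      -> ValidatingTokenFilter("stage 1")
      -> random TokenFilter #1
      -> ...
      -> ValidatingTokenFilter("last stage")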