LUCENE-3969: validate after each analysis stage; tentatively add posLen to ShingleFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311373 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-04-09 19:05:47 +00:00
parent f6f8e38cfa
commit ad5c89b1b1
5 changed files with 147 additions and 7 deletions

View File

@ -222,7 +222,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1); assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
} }
} }
assertFalse("TokenStream has more tokens than expected", ts.incrementToken()); assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
ts.end(); ts.end();
if (finalOffset != null) { if (finalOffset != null) {
assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset()); assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());

View File

@ -151,7 +151,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
startPosData.startOffset = startOffset; startPosData.startOffset = startOffset;
} else { } else {
// Make sure our input isn't messing up offsets: // Make sure our input isn't messing up offsets:
assert startPosData.startOffset == startOffset; assert startPosData.startOffset == startOffset: "prev startOffset=" + startPosData.startOffset + " vs new startOffset=" + startOffset + " inputPos=" + inputPos;
} }
final int endOffset = offsetAtt.endOffset(); final int endOffset = offsetAtt.endOffset();
@ -159,7 +159,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
endPosData.endOffset = endOffset; endPosData.endOffset = endOffset;
} else { } else {
// Make sure our input isn't messing up offsets: // Make sure our input isn't messing up offsets:
assert endPosData.endOffset == endOffset; assert endPosData.endOffset == endOffset: "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset + " inputPos=" + inputPos;
} }
tokenPending = true; tokenPending = true;

View File

@ -0,0 +1,117 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
// nocommit better name...?
// nocommit BTSTC should just append this to the chain
// instead of checking itself:
/** A TokenFilter that checks consistency of the tokens (eg
 *  offsets are consistent with one another).
 *
 *  <p>For every token it records the start offset of the position the
 *  token leaves from and the end offset of the position the token
 *  arrives at (start position + position length).  If a later token
 *  leaves from, or arrives at, an already-seen position with a
 *  different offset, an {@link IllegalStateException} is thrown. */
public final class ValidatingTokenFilter extends TokenFilter {

  // Position of the current token; reset() sets this to -1 so that the
  // first token's position increment moves it to >= 0.
  private int pos;

  // Maps position to the start/end offset:
  private final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
  private final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();

  // nocommit must be more careful here?  check hasAttribute first...?
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  // Identifies this stage in exception messages.
  private final String name;

  /** The name arg is used to identify this stage when
   *  throwing exceptions (useful if you have more than one
   *  instance in your chain).
   *
   *  @param in the token stream to validate
   *  @param name label prepended to every exception message thrown by this filter */
  public ValidatingTokenFilter(TokenStream in, String name) {
    super(in);
    this.name = name;
  }

  /** Advances the wrapped stream by one token and validates it.
   *
   *  @return {@code false} once the wrapped stream is exhausted, otherwise {@code true}
   *  @throws IllegalStateException if the position is still -1 after applying
   *          the token's position increment, or if the token's start/end offset
   *          disagrees with the offset previously recorded for the same position
   *  @throws IOException if the wrapped stream throws it */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }

    pos += posIncAtt.getPositionIncrement();
    if (pos == -1) {
      // pos is -1 after reset(), so it can only still be -1 here if the
      // position increment of the first token was 0.
      throw new IllegalStateException(name + ": first posInc must be > 0");
    }

    // All tokens leaving from the same position must share a startOffset:
    checkOffset(posToStartOffset, pos, offsetAtt.startOffset(), "startOffset");

    // All tokens arriving to the same position must share an endOffset:
    checkOffset(posToEndOffset, pos + posLenAtt.getPositionLength(), offsetAtt.endOffset(), "endOffset");

    return true;
  }

  /** Records {@code offset} for {@code position} the first time the position
   *  is seen; on later sightings verifies the offset is unchanged.
   *
   *  @param posToOffset map holding the offsets recorded so far
   *  @param position the position being checked
   *  @param offset the offset of the current token at that position
   *  @param what attribute label used in the exception message
   *  @throws IllegalStateException if a different offset was recorded before */
  private void checkOffset(Map<Integer,Integer> posToOffset, int position, int offset, String what) {
    final Integer oldOffset = posToOffset.get(position);
    if (oldOffset == null) {
      // First time we've seen a token at this position:
      posToOffset.put(position, offset);
    } else if (oldOffset.intValue() != offset) {
      throw new IllegalStateException(name + ": inconsistent " + what + " at pos=" + position + ": " + oldOffset + " vs " + offset + "; token=" + termAtt);
    }
  }

  // TODO: end?  (what to validate?)

  /** Resets the wrapped stream and discards all recorded positions and offsets. */
  @Override
  public void reset() throws IOException {
    super.reset();
    pos = -1;
    posToStartOffset.clear();
    posToEndOffset.clear();
  }
}

View File

@ -23,9 +23,10 @@ import java.util.LinkedList;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
@ -150,6 +151,7 @@ public final class ShingleFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
@ -319,6 +321,8 @@ public final class ShingleFilter extends TokenFilter {
noShingleOutput = false; noShingleOutput = false;
} }
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset()); offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
// nocommit is this right!? i'm just guessing...
posLenAtt.setPositionLength(builtGramSize);
isOutputHere = true; isOutputHere = true;
gramSize.advance(); gramSize.advance();
tokenAvailable = true; tokenAvailable = true;

View File

@ -34,11 +34,11 @@ import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.HashSet; import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Random; import java.util.Random;
import java.util.Set; import java.util.Set;
import java.util.Map;
import java.util.IdentityHashMap;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
@ -52,6 +52,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ValidatingTokenFilter;
import org.apache.lucene.analysis.charfilter.CharFilter; import org.apache.lucene.analysis.charfilter.CharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap; import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter; import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
@ -73,8 +74,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.CharArrayMap; import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory; import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Rethrow; import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
@ -133,6 +134,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
) { ) {
continue; continue;
} }
if (c == ValidatingTokenFilter.class) {
// We insert this one ourselves after each stage...
continue;
}
for (final Constructor<?> ctor : c.getConstructors()) { for (final Constructor<?> ctor : c.getConstructors()) {
// don't test deprecated ctors, they likely have known bugs: // don't test deprecated ctors, they likely have known bugs:
if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) { if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) {
@ -635,6 +642,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
StringBuilder descr = new StringBuilder(); StringBuilder descr = new StringBuilder();
int numFilters = random.nextInt(5); int numFilters = random.nextInt(5);
for (int i = 0; i < numFilters; i++) { for (int i = 0; i < numFilters; i++) {
// Insert ValidatingTF after each stage so we can
// catch problems right after the TF that "caused"
// them:
spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
while (true) { while (true) {
final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size())); final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes()); final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
@ -645,6 +658,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
} }
} }
} }
// Insert ValidatingTF after each stage so we can
// catch problems right after the TF that "caused"
// them:
spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
spec.toString = descr.toString(); spec.toString = descr.toString();
return spec; return spec;
} }