mirror of https://github.com/apache/lucene.git
LUCENE-3969: validate after each analysis stage; tenatively add posLen to ShingleFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311373 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f6f8e38cfa
commit
ad5c89b1b1
|
@ -222,7 +222,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
|
||||
}
|
||||
}
|
||||
assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
|
||||
assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
|
||||
ts.end();
|
||||
if (finalOffset != null) {
|
||||
assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
|
||||
|
|
|
@ -151,7 +151,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
|
|||
startPosData.startOffset = startOffset;
|
||||
} else {
|
||||
// Make sure our input isn't messing up offsets:
|
||||
assert startPosData.startOffset == startOffset;
|
||||
assert startPosData.startOffset == startOffset: "prev startOffset=" + startPosData.startOffset + " vs new startOffset=" + startOffset + " inputPos=" + inputPos;
|
||||
}
|
||||
|
||||
final int endOffset = offsetAtt.endOffset();
|
||||
|
@ -159,7 +159,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
|
|||
endPosData.endOffset = endOffset;
|
||||
} else {
|
||||
// Make sure our input isn't messing up offsets:
|
||||
assert endPosData.endOffset == endOffset;
|
||||
assert endPosData.endOffset == endOffset: "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset + " inputPos=" + inputPos;
|
||||
}
|
||||
|
||||
tokenPending = true;
|
||||
|
|
|
@ -0,0 +1,117 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
|
||||
// nocommit better name...?
|
||||
|
||||
// nocommit BTSTC should just append this to the chain
|
||||
// instead of checking itself:
|
||||
|
||||
/** A TokenFilter that checks consistency of the tokens (eg
|
||||
* offsets are consistent with one another). */
|
||||
public final class ValidatingTokenFilter extends TokenFilter {
|
||||
|
||||
private int pos;
|
||||
|
||||
// Maps position to the start/end offset:
|
||||
private final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
|
||||
private final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
|
||||
|
||||
// nocommit must be more careful here? check hasAttribute first...?
|
||||
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
|
||||
private final String name;
|
||||
|
||||
/** The name arg is used to identify this stage when
|
||||
* throwing exceptions (useful if you have more than one
|
||||
* instance in your chain). */
|
||||
public ValidatingTokenFilter(TokenStream in, String name) {
|
||||
super(in);
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (!input.incrementToken()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
pos += posIncAtt.getPositionIncrement();
|
||||
if (pos == -1) {
|
||||
throw new IllegalStateException("first posInc must be > 0");
|
||||
}
|
||||
|
||||
final int startOffset = offsetAtt.startOffset();
|
||||
final int endOffset = offsetAtt.endOffset();
|
||||
|
||||
final int posLen = posLenAtt.getPositionLength();
|
||||
if (!posToStartOffset.containsKey(pos)) {
|
||||
// First time we've seen a token leaving from this position:
|
||||
posToStartOffset.put(pos, startOffset);
|
||||
System.out.println(" + s " + pos + " -> " + startOffset);
|
||||
} else {
|
||||
// We've seen a token leaving from this position
|
||||
// before; verify the startOffset is the same:
|
||||
System.out.println(" + vs " + pos + " -> " + startOffset);
|
||||
final int oldStartOffset = posToStartOffset.get(pos);
|
||||
if (oldStartOffset != startOffset) {
|
||||
throw new IllegalStateException(name + ": inconsistent startOffset as pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt);
|
||||
}
|
||||
}
|
||||
|
||||
final int endPos = pos + posLen;
|
||||
|
||||
if (!posToEndOffset.containsKey(endPos)) {
|
||||
// First time we've seen a token arriving to this position:
|
||||
posToEndOffset.put(endPos, endOffset);
|
||||
System.out.println(" + e " + endPos + " -> " + endOffset);
|
||||
} else {
|
||||
// We've seen a token arriving to this position
|
||||
// before; verify the endOffset is the same:
|
||||
System.out.println(" + ve " + endPos + " -> " + endOffset);
|
||||
final int oldEndOffset = posToEndOffset.get(endPos);
|
||||
if (oldEndOffset != endOffset) {
|
||||
throw new IllegalStateException(name + ": inconsistent endOffset as pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// TODO: end? (what to validate?)
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
pos = -1;
|
||||
posToStartOffset.clear();
|
||||
posToEndOffset.clear();
|
||||
}
|
||||
}
|
|
@ -23,9 +23,10 @@ import java.util.LinkedList;
|
|||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
|
@ -150,6 +151,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
|
||||
|
||||
|
@ -319,6 +321,8 @@ public final class ShingleFilter extends TokenFilter {
|
|||
noShingleOutput = false;
|
||||
}
|
||||
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
|
||||
// nocommit is this right!? i'm just guessing...
|
||||
posLenAtt.setPositionLength(builtGramSize);
|
||||
isOutputHere = true;
|
||||
gramSize.advance();
|
||||
tokenAvailable = true;
|
||||
|
|
|
@ -34,11 +34,11 @@ import java.util.Collections;
|
|||
import java.util.Comparator;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashSet;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
@ -52,6 +52,7 @@ import org.apache.lucene.analysis.MockTokenizer;
|
|||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ValidatingTokenFilter;
|
||||
import org.apache.lucene.analysis.charfilter.CharFilter;
|
||||
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
|
||||
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||
|
@ -73,8 +74,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
|
|||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||
import org.apache.lucene.analysis.util.CharArrayMap;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.AttributeSource.AttributeFactory;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.Rethrow;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
@ -133,6 +134,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (c == ValidatingTokenFilter.class) {
|
||||
// We insert this one ourselves after each stage...
|
||||
continue;
|
||||
}
|
||||
|
||||
for (final Constructor<?> ctor : c.getConstructors()) {
|
||||
// don't test deprecated ctors, they likely have known bugs:
|
||||
if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) {
|
||||
|
@ -635,6 +642,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
StringBuilder descr = new StringBuilder();
|
||||
int numFilters = random.nextInt(5);
|
||||
for (int i = 0; i < numFilters; i++) {
|
||||
|
||||
// Insert ValidatingTF after each stage so we can
|
||||
// catch problems right after the TF that "caused"
|
||||
// them:
|
||||
spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
|
||||
|
||||
while (true) {
|
||||
final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
|
||||
final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
|
||||
|
@ -645,6 +658,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Insert ValidatingTF after each stage so we can
|
||||
// catch problems right after the TF that "caused"
|
||||
// them:
|
||||
spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
|
||||
|
||||
spec.toString = descr.toString();
|
||||
return spec;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue