mirror of https://github.com/apache/lucene.git
LUCENE-3969: validate after each analysis stage; tentatively add posLen to ShingleFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311373 13f79535-47bb-0310-9956-ffa450edef68
parent f6f8e38cfa
commit ad5c89b1b1
@@ -222,7 +222,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
       }
     }
-    assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
+    assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
     ts.end();
     if (finalOffset != null) {
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
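For orientation, these assertions live in BaseTokenStreamTestCase.assertTokenStreamContents, which analysis tests call with the expected token data. A typical invocation looks roughly like the sketch below; `analyzer` is assumed to be some Analyzer under test, and the exact set of overloads varies by version:

    TokenStream ts = analyzer.tokenStream("field", new StringReader("please divide"));
    assertTokenStreamContents(ts,
        new String[] {"please", "divide"},  // expected terms
        new int[] {0, 7},                   // expected start offsets
        new int[] {6, 13},                  // expected end offsets
        new int[] {1, 1});                  // expected position increments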
@@ -151,7 +151,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
       startPosData.startOffset = startOffset;
     } else {
       // Make sure our input isn't messing up offsets:
-      assert startPosData.startOffset == startOffset;
+      assert startPosData.startOffset == startOffset: "prev startOffset=" + startPosData.startOffset + " vs new startOffset=" + startOffset + " inputPos=" + inputPos;
     }

     final int endOffset = offsetAtt.endOffset();
@@ -159,7 +159,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
       endPosData.endOffset = endOffset;
     } else {
       // Make sure our input isn't messing up offsets:
-      assert endPosData.endOffset == endOffset;
+      assert endPosData.endOffset == endOffset: "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset + " inputPos=" + inputPos;
     }

     tokenPending = true;
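A note on the assert form used above: Java's two-operand assert evaluates the detail expression only when the condition fails, so these richer messages cost nothing in passing runs, and they fire at all only with assertions enabled (java -ea, which Lucene's test infrastructure typically turns on). For example:

    // the message string is built lazily, only on failure
    assert endPosData.endOffset == endOffset
        : "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset;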
@@ -0,0 +1,117 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+
+// nocommit better name...?
+
+// nocommit BTSTC should just append this to the chain
+// instead of checking itself:
+
+/** A TokenFilter that checks consistency of the tokens (eg
+ *  offsets are consistent with one another). */
+public final class ValidatingTokenFilter extends TokenFilter {
+
+  private int pos;
+
+  // Maps position to the start/end offset:
+  private final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
+  private final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
+
+  // nocommit must be more careful here?  check hasAttribute first...?
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  private final String name;
+
+  /** The name arg is used to identify this stage when
+   *  throwing exceptions (useful if you have more than one
+   *  instance in your chain). */
+  public ValidatingTokenFilter(TokenStream in, String name) {
+    super(in);
+    this.name = name;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+
+    pos += posIncAtt.getPositionIncrement();
+    if (pos == -1) {
+      throw new IllegalStateException("first posInc must be > 0");
+    }
+
+    final int startOffset = offsetAtt.startOffset();
+    final int endOffset = offsetAtt.endOffset();
+
+    final int posLen = posLenAtt.getPositionLength();
+    if (!posToStartOffset.containsKey(pos)) {
+      // First time we've seen a token leaving from this position:
+      posToStartOffset.put(pos, startOffset);
+      System.out.println("  + s " + pos + " -> " + startOffset);
+    } else {
+      // We've seen a token leaving from this position
+      // before; verify the startOffset is the same:
+      System.out.println("  + vs " + pos + " -> " + startOffset);
+      final int oldStartOffset = posToStartOffset.get(pos);
+      if (oldStartOffset != startOffset) {
+        throw new IllegalStateException(name + ": inconsistent startOffset at pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt);
+      }
+    }
+
+    final int endPos = pos + posLen;
+
+    if (!posToEndOffset.containsKey(endPos)) {
+      // First time we've seen a token arriving to this position:
+      posToEndOffset.put(endPos, endOffset);
+      System.out.println("  + e " + endPos + " -> " + endOffset);
+    } else {
+      // We've seen a token arriving to this position
+      // before; verify the endOffset is the same:
+      System.out.println("  + ve " + endPos + " -> " + endOffset);
+      final int oldEndOffset = posToEndOffset.get(endPos);
+      if (oldEndOffset != endOffset) {
+        throw new IllegalStateException(name + ": inconsistent endOffset at pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt);
+      }
+    }
+
+    return true;
+  }
+
+  // TODO: end?  (what to validate?)
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    pos = -1;
+    posToStartOffset.clear();
+    posToEndOffset.clear();
+  }
+}
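To make the new filter's intent concrete, here is a minimal sketch (not part of this commit) of a hand-built chain with a validator after each stage; MockTokenizer and ShingleFilter are just illustrative stand-ins for arbitrary stages:

    import java.io.StringReader;

    import org.apache.lucene.analysis.MockTokenizer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ValidatingTokenFilter;
    import org.apache.lucene.analysis.shingle.ShingleFilter;

    public class ValidateChainDemo {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new MockTokenizer(new StringReader("please divide this sentence"),
                                           MockTokenizer.WHITESPACE, false);
        ts = new ValidatingTokenFilter(ts, "stage 0");  // checks the tokenizer's output
        ts = new ShingleFilter(ts);
        ts = new ValidatingTokenFilter(ts, "stage 1");  // checks the shingled output

        ts.reset();
        while (ts.incrementToken()) {
          // an inconsistent offset or position upstream surfaces here as an
          // IllegalStateException naming the offending stage
        }
        ts.end();
        ts.close();
      }
    }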
@@ -23,9 +23,10 @@ import java.util.LinkedList;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
 
@@ -150,6 +151,7 @@ public final class ShingleFilter extends TokenFilter {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 
 
@@ -319,6 +321,8 @@ public final class ShingleFilter extends TokenFilter {
           noShingleOutput = false;
         }
         offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
+        // nocommit is this right!? i'm just guessing...
+        posLenAtt.setPositionLength(builtGramSize);
         isOutputHere = true;
         gramSize.advance();
         tokenAvailable = true;
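The guess is at least self-consistent: a shingle built from builtGramSize source tokens leaves from the first token's position and arrives builtGramSize positions later, which is exactly what PositionLengthAttribute is meant to record (and what the new ValidatingTokenFilter checks offsets against). A small sketch (illustrative, not from the commit) of inspecting the result:

    import java.io.StringReader;

    import org.apache.lucene.analysis.MockTokenizer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.shingle.ShingleFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

    public class ShinglePosLenDemo {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new MockTokenizer(new StringReader("please divide this"),
                                           MockTokenizer.WHITESPACE, false);
        ts = new ShingleFilter(ts, 2, 2);  // unigrams plus bigrams
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
        ts.reset();
        int pos = -1;
        while (ts.incrementToken()) {
          pos += posIncAtt.getPositionIncrement();
          // Expected with this patch: "please divide" at pos=0 with posLen=2,
          // i.e. the bigram spans the two positions of its source tokens.
          System.out.println(term + " pos=" + pos + " posLen=" + posLenAtt.getPositionLength());
        }
        ts.end();
        ts.close();
      }
    }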
@@ -34,11 +34,11 @@ import java.util.Collections;
 import java.util.Comparator;
 import java.util.Enumeration;
 import java.util.HashSet;
+import java.util.IdentityHashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.Random;
 import java.util.Set;
-import java.util.Map;
-import java.util.IdentityHashMap;
 import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -52,6 +52,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ValidatingTokenFilter;
 import org.apache.lucene.analysis.charfilter.CharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
@@ -73,8 +74,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.AttributeSource.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.Rethrow;
 import org.apache.lucene.util.Version;
@@ -133,6 +134,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
           ) {
         continue;
       }
+
+      if (c == ValidatingTokenFilter.class) {
+        // We insert this one ourselves after each stage...
+        continue;
+      }
+
       for (final Constructor<?> ctor : c.getConstructors()) {
         // don't test deprecated ctors, they likely have known bugs:
         if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) {
@@ -635,6 +642,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       StringBuilder descr = new StringBuilder();
       int numFilters = random.nextInt(5);
       for (int i = 0; i < numFilters; i++) {
+
+        // Insert ValidatingTF after each stage so we can
+        // catch problems right after the TF that "caused"
+        // them:
+        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
+
         while (true) {
           final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
           final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
@@ -645,6 +658,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
           }
         }
       }
+
+      // Insert ValidatingTF after each stage so we can
+      // catch problems right after the TF that "caused"
+      // them:
+      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
+
       spec.toString = descr.toString();
       return spec;
     }
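Net effect on a generated chain: validators interleave with the randomly chosen filters, so a bad offset or position is flagged by the validator sitting immediately downstream of the filter that produced it. Roughly:

    Tokenizer
      -> ValidatingTokenFilter("stage 0")
      -> random TokenFilter #0
      -> ValidatingTokenFilter("stage 1")
      -> random TokenFilter #1
      -> ...
      -> ValidatingTokenFilter("last stage")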