mirror of https://github.com/apache/lucene.git
LUCENE-4127: don't let bogus offsets corrupt the index
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1348685 13f79535-47bb-0310-9956-ffa450edef68
parent 386cc7859d
commit 4110fae40b

@@ -396,6 +396,8 @@ Changes in Runtime Behavior
 * LUCENE-4127: IndexWriter will now throw IllegalArgumentException if
   the first token of an indexed field has 0 positionIncrement
   (previously it silently corrected it to 1, possibly masking bugs).
+  OffsetAttributeImpl will throw IllegalArgumentException if startOffset
+  is greater than endOffset, or if offsets are negative.
   (Robert Muir, Mike McCandless)
 
 API Changes
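
In practice the new checks surface wherever offsets are set: bad values now fail fast with IllegalArgumentException instead of silently corrupting the postings. An illustrative sketch against OffsetAttributeImpl as changed below (the demo class itself is hypothetical, not part of the commit):

    import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl;

    public class OffsetCheckDemo {
      public static void main(String[] args) {
        OffsetAttributeImpl att = new OffsetAttributeImpl();
        att.setOffset(0, 3);    // valid: 0 <= startOffset <= endOffset
        try {
          att.setOffset(4, 1);  // endOffset < startOffset: now rejected
        } catch (IllegalArgumentException expected) {
          System.out.println("rejected: " + expected.getMessage());
        }
      }
    }
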
@@ -49,14 +49,14 @@ public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
   }
 
   public Token updateInputToken(Token inputToken, Token lastPrefixToken) {
-    inputToken.setStartOffset(lastPrefixToken.endOffset() + inputToken.startOffset());
-    inputToken.setEndOffset(lastPrefixToken.endOffset() + inputToken.endOffset());
+    inputToken.setOffset(lastPrefixToken.endOffset() + inputToken.startOffset(),
+                         lastPrefixToken.endOffset() + inputToken.endOffset());
     return inputToken;
   }
 
   public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
-    suffixToken.setStartOffset(lastInputToken.endOffset() + suffixToken.startOffset());
-    suffixToken.setEndOffset(lastInputToken.endOffset() + suffixToken.endOffset());
+    suffixToken.setOffset(lastInputToken.endOffset() + suffixToken.startOffset(),
+                         lastInputToken.endOffset() + suffixToken.endOffset());
     return suffixToken;
   }
 
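Callers like these are migrated from the setStartOffset/setEndOffset pair to a single setOffset call: with validation in place, one atomic call checks the final (start, end) pair once instead of observing a half-updated state, and the Token hunks further down remove the individual setters entirely. A small sketch of the shifting pattern (the delta value is made up for illustration; not part of the commit):

    import org.apache.lucene.analysis.Token;

    public class ShiftOffsetsDemo {
      public static void main(String[] args) {
        Token t = new Token("foo", 10, 13); // term "foo" spanning [10, 13)
        int delta = 5;                      // e.g. length of a prepended prefix
        // Shift both offsets in one call; the pair is validated together.
        t.setOffset(t.startOffset() + delta, t.endOffset() + delta);
        System.out.println(t.startOffset() + "," + t.endOffset()); // prints 15,18
      }
    }
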
@@ -153,8 +153,8 @@ public class PrefixAwareTokenFilter extends TokenStream {
    * @return consumer token
    */
   public Token updateSuffixToken(Token suffixToken, Token lastPrefixToken) {
-    suffixToken.setStartOffset(lastPrefixToken.endOffset() + suffixToken.startOffset());
-    suffixToken.setEndOffset(lastPrefixToken.endOffset() + suffixToken.endOffset());
+    suffixToken.setOffset(lastPrefixToken.endOffset() + suffixToken.startOffset(),
+                         lastPrefixToken.endOffset() + suffixToken.endOffset());
     return suffixToken;
   }
 
@@ -27,8 +27,8 @@ import org.apache.lucene.util.BytesRef;
 
 
 /**
- * Adds the {@link org.apache.lucene.analysis.Token#setStartOffset(int)}
- * and {@link org.apache.lucene.analysis.Token#setEndOffset(int)}
+ * Adds the {@link OffsetAttribute#startOffset()}
+ * and {@link OffsetAttribute#endOffset()}
  * First 4 bytes are the start
  *
  **/
 
@@ -139,6 +139,7 @@ public class Token extends CharTermAttributeImpl
    * @param start start offset in the source text
    * @param end end offset in the source text */
   public Token(int start, int end) {
+    checkOffsets(start, end);
     startOffset = start;
     endOffset = end;
   }
@@ -149,6 +150,7 @@ public class Token extends CharTermAttributeImpl
    * @param end end offset in the source text
    * @param typ the lexical type of this Token */
   public Token(int start, int end, String typ) {
+    checkOffsets(start, end);
     startOffset = start;
     endOffset = end;
     type = typ;
@@ -162,6 +164,7 @@ public class Token extends CharTermAttributeImpl
    * @param flags The bits to set for this token
    */
   public Token(int start, int end, int flags) {
+    checkOffsets(start, end);
     startOffset = start;
     endOffset = end;
     this.flags = flags;
@@ -177,6 +180,7 @@ public class Token extends CharTermAttributeImpl
    * @param end end offset
    */
   public Token(String text, int start, int end) {
+    checkOffsets(start, end);
     append(text);
     startOffset = start;
     endOffset = end;
@@ -192,6 +196,7 @@ public class Token extends CharTermAttributeImpl
    * @param typ token type
    */
   public Token(String text, int start, int end, String typ) {
+    checkOffsets(start, end);
     append(text);
     startOffset = start;
     endOffset = end;
@@ -209,6 +214,7 @@ public class Token extends CharTermAttributeImpl
    * @param flags token type bits
    */
   public Token(String text, int start, int end, int flags) {
+    checkOffsets(start, end);
     append(text);
     startOffset = start;
     endOffset = end;
@@ -226,6 +232,7 @@ public class Token extends CharTermAttributeImpl
    * @param end
    */
   public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) {
+    checkOffsets(start, end);
     copyBuffer(startTermBuffer, termBufferOffset, termBufferLength);
     startOffset = start;
     endOffset = end;
 
@@ -295,12 +302,6 @@ public class Token extends CharTermAttributeImpl
     return startOffset;
   }
 
-  /** Set the starting offset.
-      @see #startOffset() */
-  public void setStartOffset(int offset) {
-    this.startOffset = offset;
-  }
-
   /** Returns this Token's ending offset, one greater than the position of the
       last character corresponding to this token in the source text. The length
       of the token in the source text is (endOffset - startOffset). */
@@ -308,15 +309,10 @@ public class Token extends CharTermAttributeImpl
     return endOffset;
   }
 
-  /** Set the ending offset.
-      @see #endOffset() */
-  public void setEndOffset(int offset) {
-    this.endOffset = offset;
-  }
-
   /** Set the starting and ending offset.
       @see #startOffset() and #endOffset()*/
   public void setOffset(int startOffset, int endOffset) {
+    checkOffsets(startOffset, endOffset);
     this.startOffset = startOffset;
     this.endOffset = endOffset;
   }
 
@@ -449,11 +445,11 @@ public class Token extends CharTermAttributeImpl
 
   /** Shorthand for calling {@link #clear},
    * {@link #copyBuffer(char[], int, int)},
-   * {@link #setStartOffset},
-   * {@link #setEndOffset},
+   * {@link #setOffset},
    * {@link #setType}
    * @return this Token instance */
   public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
+    checkOffsets(newStartOffset, newEndOffset);
     clearNoTermBuffer();
     copyBuffer(newTermBuffer, newTermOffset, newTermLength);
     payload = null;
@@ -466,11 +462,11 @@ public class Token extends CharTermAttributeImpl
 
   /** Shorthand for calling {@link #clear},
    * {@link #copyBuffer(char[], int, int)},
-   * {@link #setStartOffset},
-   * {@link #setEndOffset}
+   * {@link #setOffset},
    * {@link #setType} on Token.DEFAULT_TYPE
    * @return this Token instance */
   public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
+    checkOffsets(newStartOffset, newEndOffset);
     clearNoTermBuffer();
     copyBuffer(newTermBuffer, newTermOffset, newTermLength);
     startOffset = newStartOffset;
@@ -481,11 +477,11 @@ public class Token extends CharTermAttributeImpl
 
   /** Shorthand for calling {@link #clear},
    * {@link #append(CharSequence)},
-   * {@link #setStartOffset},
-   * {@link #setEndOffset}
+   * {@link #setOffset},
    * {@link #setType}
    * @return this Token instance */
   public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) {
+    checkOffsets(newStartOffset, newEndOffset);
     clear();
     append(newTerm);
     startOffset = newStartOffset;
@@ -496,11 +492,11 @@ public class Token extends CharTermAttributeImpl
 
   /** Shorthand for calling {@link #clear},
    * {@link #append(CharSequence, int, int)},
-   * {@link #setStartOffset},
-   * {@link #setEndOffset}
+   * {@link #setOffset},
    * {@link #setType}
    * @return this Token instance */
   public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
+    checkOffsets(newStartOffset, newEndOffset);
     clear();
     append(newTerm, newTermOffset, newTermOffset + newTermLength);
     startOffset = newStartOffset;
@@ -511,11 +507,11 @@ public class Token extends CharTermAttributeImpl
 
   /** Shorthand for calling {@link #clear},
    * {@link #append(CharSequence)},
-   * {@link #setStartOffset},
-   * {@link #setEndOffset}
+   * {@link #setOffset},
    * {@link #setType} on Token.DEFAULT_TYPE
    * @return this Token instance */
   public Token reinit(String newTerm, int newStartOffset, int newEndOffset) {
+    checkOffsets(newStartOffset, newEndOffset);
     clear();
     append(newTerm);
     startOffset = newStartOffset;
@@ -526,11 +522,11 @@ public class Token extends CharTermAttributeImpl
 
   /** Shorthand for calling {@link #clear},
    * {@link #append(CharSequence, int, int)},
-   * {@link #setStartOffset},
-   * {@link #setEndOffset}
+   * {@link #setOffset},
    * {@link #setType} on Token.DEFAULT_TYPE
    * @return this Token instance */
   public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
+    checkOffsets(newStartOffset, newEndOffset);
     clear();
     append(newTerm, newTermOffset, newTermOffset + newTermLength);
     startOffset = newStartOffset;
 
@@ -614,6 +610,13 @@ public class Token extends CharTermAttributeImpl
     reflector.reflect(FlagsAttribute.class, "flags", flags);
     reflector.reflect(TypeAttribute.class, "type", type);
   }
 
+  private void checkOffsets(int startOffset, int endOffset) {
+    if (startOffset < 0 || endOffset < startOffset) {
+      throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
+          + "startOffset=" + startOffset + ",endOffset=" + endOffset);
+    }
+  }
+
   /** Convenience factory that returns <code>Token</code> as implementation for the basic
    * attributes and return the default impl (with "Impl" appended) for all other
 
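With checkOffsets in place, every Token constructor and reinit variant above rejects a bad offset pair at creation time rather than at index time. A quick sketch (hypothetical demo class, not part of the commit; constructor signatures as in the hunks above):

    import org.apache.lucene.analysis.Token;

    public class TokenCtorDemo {
      public static void main(String[] args) {
        new Token("ok", 0, 2);     // valid
        try {
          new Token("bad", 5, 2);  // endOffset < startOffset
        } catch (IllegalArgumentException expected) {
          System.out.println(expected.getMessage());
        }
        try {
          new Token("bad", -1, 2); // negative startOffset
        } catch (IllegalArgumentException expected) {
          System.out.println(expected.getMessage());
        }
      }
    }
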
@@ -47,10 +47,10 @@ public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribute
     // tokenizer should call clearAtts before re-using
     // OffsetAtt
 
-    // TODO: check that these are valid! IE, each should be
-    // >= 0, and endOffset should be >= startOffset.
-    // Problem is this could "break" existing
-    // tokenizers/filters.
+    if (startOffset < 0 || endOffset < startOffset) {
+      throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
+          + "startOffset=" + startOffset + ",endOffset=" + endOffset);
+    }
 
     this.startOffset = startOffset;
     this.endOffset = endOffset;
 
@@ -22,6 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.util.IOUtils;
 
 /**
@@ -80,6 +81,11 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
       if (fieldType.omitNorms() && field.boost() != 1.0f) {
         throw new UnsupportedOperationException("You cannot set an index-time boost: norms are omitted for field '" + field.name() + "'");
       }
 
+      // only bother checking offsets if something will consume them.
+      // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
+      final boolean checkOffsets = fieldType.indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
+      int lastStartOffset = 0;
+
       if (i > 0) {
         fieldState.position += docState.analyzer == null ? 0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name);
 
@@ -134,6 +140,20 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
 
           if (posIncr == 0)
             fieldState.numOverlap++;
 
+          if (checkOffsets) {
+            int startOffset = fieldState.offset + offsetAttribute.startOffset();
+            int endOffset = fieldState.offset + offsetAttribute.endOffset();
+            if (startOffset < 0 || endOffset < startOffset) {
+              throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
+                  + "startOffset=" + startOffset + ",endOffset=" + endOffset);
+            }
+            if (startOffset < lastStartOffset) {
+              throw new IllegalArgumentException("offsets must not go backwards startOffset="
+                  + startOffset + " is < lastStartOffset=" + lastStartOffset);
+            }
+            lastStartOffset = startOffset;
+          }
+
           boolean success = false;
           try {
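
Beyond per-token validity, the indexer now also enforces that startOffset never decreases across the tokens of a field (fieldState.offset accumulates across multiple values of the same field, so the check spans them all). A standalone distillation of that invariant, not the Lucene class itself; names mirror the diff:

    // Hypothetical sketch of the ordering check DocInverterPerField performs.
    final class OffsetOrderCheck {
      private int lastStartOffset = 0;

      void accept(int startOffset, int endOffset) {
        if (startOffset < 0 || endOffset < startOffset) {
          throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset");
        }
        if (startOffset < lastStartOffset) {
          throw new IllegalArgumentException("offsets must not go backwards");
        }
        lastStartOffset = startOffset;
      }

      public static void main(String[] args) {
        OffsetOrderCheck check = new OffsetOrderCheck();
        check.accept(0, 3);
        check.accept(4, 7);
        check.accept(3, 6); // throws: start went backwards (cf. testBackwardsOffsets below)
      }
    }
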
@@ -17,6 +17,7 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 
@@ -39,6 +40,7 @@ import org.apache.lucene.search.FieldCache;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.English;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
 import org.apache.lucene.util._TestUtil;
 
@@ -382,6 +384,68 @@ public class TestPostingsOffsets extends LuceneTestCase {
     riw.close();
     dir.close();
   }
 
+  // NOTE: the next two tests aren't that good as we need an EvilToken...
+  public void testNegativeOffsets() throws Exception {
+    try {
+      checkTokens(new Token[] {
+        makeToken("foo", 1, -1, -1)
+      });
+      fail();
+    } catch (IllegalArgumentException expected) {
+      //expected
+    }
+  }
+
+  public void testIllegalOffsets() throws Exception {
+    try {
+      checkTokens(new Token[] {
+        makeToken("foo", 1, 1, 0)
+      });
+      fail();
+    } catch (IllegalArgumentException expected) {
+      //expected
+    }
+  }
+
+  public void testBackwardsOffsets() throws Exception {
+    try {
+      checkTokens(new Token[] {
+        makeToken("foo", 1, 0, 3),
+        makeToken("foo", 1, 4, 7),
+        makeToken("foo", 0, 3, 6)
+      });
+      fail();
+    } catch (IllegalArgumentException expected) {
+      // expected
+    }
+  }
+  // TODO: more tests with other possibilities
+
+  private void checkTokens(Token[] tokens) throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
+    boolean success = false;
+    try {
+      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+      // store some term vectors for the checkindex cross-check
+      ft.setStoreTermVectors(true);
+      ft.setStoreTermVectorPositions(true);
+      ft.setStoreTermVectorOffsets(true);
+
+      Document doc = new Document();
+      doc.add(new Field("body", new CannedTokenStream(tokens), ft));
+      riw.addDocument(doc);
+      success = true;
+    } finally {
+      if (success) {
+        IOUtils.close(riw, dir);
+      } else {
+        IOUtils.closeWhileHandlingException(riw, dir);
+      }
+    }
+  }
+
   private Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
     final Token t = new Token();
 
@@ -175,10 +175,6 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       if (offsetAtt != null) {
         final int startOffset = offsetAtt.startOffset();
         final int endOffset = offsetAtt.endOffset();
-        assertTrue("startOffset must be >= 0", startOffset >= 0);
-        assertTrue("endOffset must be >= 0", endOffset >= 0);
-        assertTrue("endOffset must be >= startOffset, got startOffset=" + startOffset + ",endOffset=" + endOffset,
-            endOffset >= startOffset);
         if (finalOffset != null) {
           assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
           assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + finalOffset.intValue(),
 
@@ -96,15 +96,6 @@ public final class ValidatingTokenFilter extends TokenFilter {
       startOffset = offsetAtt.startOffset();
       endOffset = offsetAtt.endOffset();
 
-      if (startOffset < 0) {
-        throw new IllegalStateException(name + ": startOffset=" + startOffset + " is < 0");
-      }
-      if (endOffset < 0) {
-        throw new IllegalStateException(name + ": endOffset=" + endOffset + " is < 0");
-      }
-      if (endOffset < startOffset) {
-        throw new IllegalStateException(name + ": startOffset=" + startOffset + " is > endOffset=" + endOffset + " pos=" + pos + "; token=" + termAtt);
-      }
       if (offsetsAreCorrect && offsetAtt.startOffset() < lastStartOffset) {
         throw new IllegalStateException(name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
       }
 
@@ -186,8 +186,8 @@ public class SpellingQueryConverter extends QueryConverter {
       while (stream.incrementToken()) {
         Token token = new Token();
         token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
-        token.setStartOffset(offset + offsetAtt.startOffset());
-        token.setEndOffset(offset + offsetAtt.endOffset());
+        token.setOffset(offset + offsetAtt.startOffset(),
+                        offset + offsetAtt.endOffset());
         token.setFlags(flagsAttValue); //overwriting any flags already set...
         token.setType(typeAtt.type());
         token.setPayload(payloadAtt.getPayload());