LUCENE-8192: always enforce index-time offsets are correct with BaseTokenStreamTestCase

This commit is contained in:
Robert Muir 2018-03-26 22:02:14 -04:00
parent 8e68bdffeb
commit e595541ef3
1 changed file with 42 additions and 51 deletions

View File

@@ -120,15 +120,13 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
}
// offsetsAreCorrect also validates:
// graphOffsetsAreCorrect validates:
// - graph offsets are correct (all tokens leaving from
// pos X have the same startOffset; all tokens
// arriving to pos Y have the same endOffset)
// - offsets only move forwards (startOffset >=
// lastStartOffset)
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
int posLengths[], Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts,
boolean offsetsAreCorrect, byte[][] payloads) throws IOException {
boolean graphOffsetsAreCorrect, byte[][] payloads) throws IOException {
assertNotNull(output);
CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
@@ -224,7 +222,16 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertNull("payloads " + i, payloads[i]);
}
}
if (posIncrAtt != null) {
if (i == 0) {
assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
} else {
assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
}
}
if (posLengthAtt != null) {
assertTrue("posLength must be >= 1; got: " + posLengthAtt.getPositionLength(), posLengthAtt.getPositionLength() >= 1);
}
// we can enforce some basic things about a few attributes even if the caller doesn't check:
if (offsetAtt != null) {
final int startOffset = offsetAtt.startOffset();
@@ -235,12 +242,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
endOffset <= finalOffset.intValue());
}
if (offsetsAreCorrect) {
assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " term=" + termAtt, offsetAtt.startOffset() >= lastStartOffset);
lastStartOffset = offsetAtt.startOffset();
}
assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " term=" + termAtt, offsetAtt.startOffset() >= lastStartOffset);
lastStartOffset = offsetAtt.startOffset();
if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
if (graphOffsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
// Validate offset consistency in the graph, ie
// all tokens leaving from a certain pos have the
// same startOffset, and all tokens arriving to a
@@ -275,16 +280,6 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
}
}
if (posIncrAtt != null) {
if (i == 0) {
assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
} else {
assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
}
}
if (posLengthAtt != null) {
assertTrue("posLength must be >= 1; got: " + posLengthAtt.getPositionLength(), posLengthAtt.getPositionLength() >= 1);
}
}
if (ts.incrementToken()) {
@@ -321,12 +316,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
int posLengths[], Integer finalOffset, boolean[] keywordAtts,
boolean offsetsAreCorrect) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, keywordAtts, offsetsAreCorrect, null);
boolean graphOffsetsAreCorrect) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, keywordAtts, graphOffsetsAreCorrect, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, offsetsAreCorrect);
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean graphOffsetsAreCorrect) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, graphOffsetsAreCorrect);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
@@ -389,15 +384,15 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException {
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect) throws IOException {
checkResetException(a, input);
checkAnalysisConsistency(random(), a, true, input, offsetsAreCorrect);
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
checkAnalysisConsistency(random(), a, true, input, graphOffsetsAreCorrect);
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), graphOffsetsAreCorrect);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect, byte[][] payloads) throws IOException {
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect, byte[][] payloads) throws IOException {
checkResetException(a, input);
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, offsetsAreCorrect, payloads);
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, graphOffsetsAreCorrect, payloads);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
@@ -505,7 +500,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
final Analyzer a;
final boolean useCharFilter;
final boolean simple;
final boolean offsetsAreCorrect;
final boolean graphOffsetsAreCorrect;
final RandomIndexWriter iw;
final CountDownLatch latch;
@@ -514,14 +509,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// interact)... so this is just "best effort":
public boolean failed;
AnalysisThread(long seed, CountDownLatch latch, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) {
AnalysisThread(long seed, CountDownLatch latch, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean graphOffsetsAreCorrect, RandomIndexWriter iw) {
this.seed = seed;
this.a = a;
this.iterations = iterations;
this.maxWordLength = maxWordLength;
this.useCharFilter = useCharFilter;
this.simple = simple;
this.offsetsAreCorrect = offsetsAreCorrect;
this.graphOffsetsAreCorrect = graphOffsetsAreCorrect;
this.iw = iw;
this.latch = latch;
}
@@ -533,7 +528,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
latch.await();
// see the part in checkRandomData where it replays the same text again
// to verify reproducability/reuse: hopefully this would catch thread hazards.
checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw);
success = true;
} catch (Exception e) {
Rethrow.rethrow(e);
@@ -547,7 +542,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
checkRandomData(random, a, iterations, maxWordLength, simple, true);
}
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException {
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean graphOffsetsAreCorrect) throws IOException {
checkResetException(a, "best effort");
long seed = random.nextLong();
boolean useCharFilter = random.nextBoolean();
@@ -563,14 +558,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
boolean success = false;
try {
checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw);
// now test with multiple threads: note we do the EXACT same thing we did before in each thread,
// so this should only really fail from another thread if it's an actual thread problem
int numThreads = TestUtil.nextInt(random, 2, 4);
final CountDownLatch startingGun = new CountDownLatch(1);
AnalysisThread threads[] = new AnalysisThread[numThreads];
for (int i = 0; i < threads.length; i++) {
threads[i] = new AnalysisThread(seed, startingGun, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
threads[i] = new AnalysisThread(seed, startingGun, a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw);
}
for (int i = 0; i < threads.length; i++) {
threads[i].start();
@@ -601,7 +596,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
}
private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) throws IOException {
private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean graphOffsetsAreCorrect, RandomIndexWriter iw) throws IOException {
final LineFileDocs docs = new LineFileDocs(random);
Document doc = null;
@@ -626,11 +621,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break;
case 2: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break;
default:
if (offsetsAreCorrect) {
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
} else {
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
}
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
}
currentField = field = new Field("dummy", bogus, ft);
doc.add(currentField);
@@ -665,7 +656,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
try {
checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, currentField);
checkAnalysisConsistency(random, a, useCharFilter, text, graphOffsetsAreCorrect, currentField);
if (iw != null) {
if (random.nextInt(7) == 0) {
// pile up a multivalued field
@@ -727,11 +718,11 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
checkAnalysisConsistency(random, a, useCharFilter, text, true);
}
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect) throws IOException {
checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, null);
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean graphOffsetsAreCorrect) throws IOException {
checkAnalysisConsistency(random, a, useCharFilter, text, graphOffsetsAreCorrect, null);
}
private static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect, Field field) throws IOException {
private static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean graphOffsetsAreCorrect, Field field) throws IOException {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
@@ -874,7 +865,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
toIntArray(positions),
toIntArray(positionLengths),
text.length(),
offsetsAreCorrect);
graphOffsetsAreCorrect);
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertTokenStreamContents(ts,
@@ -885,7 +876,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
toIntArray(positions),
null,
text.length(),
offsetsAreCorrect);
graphOffsetsAreCorrect);
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength
assertTokenStreamContents(ts,
@@ -896,7 +887,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
toIntArray(positions),
toIntArray(positionLengths),
text.length(),
offsetsAreCorrect);
graphOffsetsAreCorrect);
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertTokenStreamContents(ts,
@@ -907,7 +898,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
toIntArray(positions),
null,
text.length(),
offsetsAreCorrect);
graphOffsetsAreCorrect);
} else if (offsetAtt != null) {
// offset
assertTokenStreamContents(ts,
@@ -918,7 +909,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
null,
null,
text.length(),
offsetsAreCorrect);
graphOffsetsAreCorrect);
} else {
// terms only
assertTokenStreamContents(ts,