mirror of https://github.com/apache/lucene.git
LUCENE-8192: always enforce that index-time offsets are correct with BaseTokenStreamTestCase
This commit is contained in:
parent
8e68bdffeb
commit
e595541ef3
|
@ -120,15 +120,13 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
// offsetsAreCorrect also validates:
|
||||
// graphOffsetsAreCorrect validates:
|
||||
// - graph offsets are correct (all tokens leaving from
|
||||
// pos X have the same startOffset; all tokens
|
||||
// arriving to pos Y have the same endOffset)
|
||||
// - offsets only move forwards (startOffset >=
|
||||
// lastStartOffset)
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
|
||||
int posLengths[], Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts,
|
||||
boolean offsetsAreCorrect, byte[][] payloads) throws IOException {
|
||||
boolean graphOffsetsAreCorrect, byte[][] payloads) throws IOException {
|
||||
assertNotNull(output);
|
||||
CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
|
||||
|
||||
|
@ -224,7 +222,16 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
assertNull("payloads " + i, payloads[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (posIncrAtt != null) {
|
||||
if (i == 0) {
|
||||
assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
|
||||
} else {
|
||||
assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
|
||||
}
|
||||
}
|
||||
if (posLengthAtt != null) {
|
||||
assertTrue("posLength must be >= 1; got: " + posLengthAtt.getPositionLength(), posLengthAtt.getPositionLength() >= 1);
|
||||
}
|
||||
// we can enforce some basic things about a few attributes even if the caller doesn't check:
|
||||
if (offsetAtt != null) {
|
||||
final int startOffset = offsetAtt.startOffset();
|
||||
|
@ -235,12 +242,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
endOffset <= finalOffset.intValue());
|
||||
}
|
||||
|
||||
if (offsetsAreCorrect) {
|
||||
assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " term=" + termAtt, offsetAtt.startOffset() >= lastStartOffset);
|
||||
lastStartOffset = offsetAtt.startOffset();
|
||||
}
|
||||
assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " term=" + termAtt, offsetAtt.startOffset() >= lastStartOffset);
|
||||
lastStartOffset = offsetAtt.startOffset();
|
||||
|
||||
if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
|
||||
if (graphOffsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
|
||||
// Validate offset consistency in the graph, ie
|
||||
// all tokens leaving from a certain pos have the
|
||||
// same startOffset, and all tokens arriving to a
|
||||
|
@ -275,16 +280,6 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
}
|
||||
if (posIncrAtt != null) {
|
||||
if (i == 0) {
|
||||
assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
|
||||
} else {
|
||||
assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
|
||||
}
|
||||
}
|
||||
if (posLengthAtt != null) {
|
||||
assertTrue("posLength must be >= 1; got: " + posLengthAtt.getPositionLength(), posLengthAtt.getPositionLength() >= 1);
|
||||
}
|
||||
}
|
||||
|
||||
if (ts.incrementToken()) {
|
||||
|
@ -321,12 +316,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
|
||||
int posLengths[], Integer finalOffset, boolean[] keywordAtts,
|
||||
boolean offsetsAreCorrect) throws IOException {
|
||||
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, keywordAtts, offsetsAreCorrect, null);
|
||||
boolean graphOffsetsAreCorrect) throws IOException {
|
||||
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, keywordAtts, graphOffsetsAreCorrect, null);
|
||||
}
|
||||
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException {
|
||||
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, offsetsAreCorrect);
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean graphOffsetsAreCorrect) throws IOException {
|
||||
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, graphOffsetsAreCorrect);
|
||||
}
|
||||
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
|
||||
|
@ -389,15 +384,15 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
|
||||
}
|
||||
|
||||
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException {
|
||||
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect) throws IOException {
|
||||
checkResetException(a, input);
|
||||
checkAnalysisConsistency(random(), a, true, input, offsetsAreCorrect);
|
||||
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
|
||||
checkAnalysisConsistency(random(), a, true, input, graphOffsetsAreCorrect);
|
||||
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), graphOffsetsAreCorrect);
|
||||
}
|
||||
|
||||
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect, byte[][] payloads) throws IOException {
|
||||
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect, byte[][] payloads) throws IOException {
|
||||
checkResetException(a, input);
|
||||
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, offsetsAreCorrect, payloads);
|
||||
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, graphOffsetsAreCorrect, payloads);
|
||||
}
|
||||
|
||||
public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
|
||||
|
@ -505,7 +500,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
final Analyzer a;
|
||||
final boolean useCharFilter;
|
||||
final boolean simple;
|
||||
final boolean offsetsAreCorrect;
|
||||
final boolean graphOffsetsAreCorrect;
|
||||
final RandomIndexWriter iw;
|
||||
final CountDownLatch latch;
|
||||
|
||||
|
@ -514,14 +509,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
// interact)... so this is just "best effort":
|
||||
public boolean failed;
|
||||
|
||||
AnalysisThread(long seed, CountDownLatch latch, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) {
|
||||
AnalysisThread(long seed, CountDownLatch latch, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean graphOffsetsAreCorrect, RandomIndexWriter iw) {
|
||||
this.seed = seed;
|
||||
this.a = a;
|
||||
this.iterations = iterations;
|
||||
this.maxWordLength = maxWordLength;
|
||||
this.useCharFilter = useCharFilter;
|
||||
this.simple = simple;
|
||||
this.offsetsAreCorrect = offsetsAreCorrect;
|
||||
this.graphOffsetsAreCorrect = graphOffsetsAreCorrect;
|
||||
this.iw = iw;
|
||||
this.latch = latch;
|
||||
}
|
||||
|
@ -533,7 +528,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
latch.await();
|
||||
// see the part in checkRandomData where it replays the same text again
|
||||
// to verify reproducibility/reuse: hopefully this would catch thread hazards.
|
||||
checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
|
||||
checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw);
|
||||
success = true;
|
||||
} catch (Exception e) {
|
||||
Rethrow.rethrow(e);
|
||||
|
@ -547,7 +542,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
checkRandomData(random, a, iterations, maxWordLength, simple, true);
|
||||
}
|
||||
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException {
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean graphOffsetsAreCorrect) throws IOException {
|
||||
checkResetException(a, "best effort");
|
||||
long seed = random.nextLong();
|
||||
boolean useCharFilter = random.nextBoolean();
|
||||
|
@ -563,14 +558,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
}
|
||||
boolean success = false;
|
||||
try {
|
||||
checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
|
||||
checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw);
|
||||
// now test with multiple threads: note we do the EXACT same thing we did before in each thread,
|
||||
// so this should only really fail from another thread if it's an actual thread problem
|
||||
int numThreads = TestUtil.nextInt(random, 2, 4);
|
||||
final CountDownLatch startingGun = new CountDownLatch(1);
|
||||
AnalysisThread threads[] = new AnalysisThread[numThreads];
|
||||
for (int i = 0; i < threads.length; i++) {
|
||||
threads[i] = new AnalysisThread(seed, startingGun, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
|
||||
threads[i] = new AnalysisThread(seed, startingGun, a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw);
|
||||
}
|
||||
for (int i = 0; i < threads.length; i++) {
|
||||
threads[i].start();
|
||||
|
@ -601,7 +596,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) throws IOException {
|
||||
private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean graphOffsetsAreCorrect, RandomIndexWriter iw) throws IOException {
|
||||
|
||||
final LineFileDocs docs = new LineFileDocs(random);
|
||||
Document doc = null;
|
||||
|
@ -626,11 +621,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break;
|
||||
case 2: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break;
|
||||
default:
|
||||
if (offsetsAreCorrect) {
|
||||
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
} else {
|
||||
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
|
||||
}
|
||||
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
}
|
||||
currentField = field = new Field("dummy", bogus, ft);
|
||||
doc.add(currentField);
|
||||
|
@ -665,7 +656,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
}
|
||||
|
||||
try {
|
||||
checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, currentField);
|
||||
checkAnalysisConsistency(random, a, useCharFilter, text, graphOffsetsAreCorrect, currentField);
|
||||
if (iw != null) {
|
||||
if (random.nextInt(7) == 0) {
|
||||
// pile up a multivalued field
|
||||
|
@ -727,11 +718,11 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
checkAnalysisConsistency(random, a, useCharFilter, text, true);
|
||||
}
|
||||
|
||||
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect) throws IOException {
|
||||
checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, null);
|
||||
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean graphOffsetsAreCorrect) throws IOException {
|
||||
checkAnalysisConsistency(random, a, useCharFilter, text, graphOffsetsAreCorrect, null);
|
||||
}
|
||||
|
||||
private static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect, Field field) throws IOException {
|
||||
private static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean graphOffsetsAreCorrect, Field field) throws IOException {
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
|
||||
|
@ -874,7 +865,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
toIntArray(positions),
|
||||
toIntArray(positionLengths),
|
||||
text.length(),
|
||||
offsetsAreCorrect);
|
||||
graphOffsetsAreCorrect);
|
||||
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
|
||||
// offset + pos + type
|
||||
assertTokenStreamContents(ts,
|
||||
|
@ -885,7 +876,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
toIntArray(positions),
|
||||
null,
|
||||
text.length(),
|
||||
offsetsAreCorrect);
|
||||
graphOffsetsAreCorrect);
|
||||
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
|
||||
// offset + pos + posLength
|
||||
assertTokenStreamContents(ts,
|
||||
|
@ -896,7 +887,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
toIntArray(positions),
|
||||
toIntArray(positionLengths),
|
||||
text.length(),
|
||||
offsetsAreCorrect);
|
||||
graphOffsetsAreCorrect);
|
||||
} else if (posIncAtt != null && offsetAtt != null) {
|
||||
// offset + pos
|
||||
assertTokenStreamContents(ts,
|
||||
|
@ -907,7 +898,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
toIntArray(positions),
|
||||
null,
|
||||
text.length(),
|
||||
offsetsAreCorrect);
|
||||
graphOffsetsAreCorrect);
|
||||
} else if (offsetAtt != null) {
|
||||
// offset
|
||||
assertTokenStreamContents(ts,
|
||||
|
@ -918,7 +909,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
null,
|
||||
null,
|
||||
text.length(),
|
||||
offsetsAreCorrect);
|
||||
graphOffsetsAreCorrect);
|
||||
} else {
|
||||
// terms only
|
||||
assertTokenStreamContents(ts,
|
||||
|
|
Loading…
Reference in New Issue