LUCENE-9963 Add tests for alternate path failures in FlattenGraphFilter (#146)

Co-authored-by: Lawson <geoffrl@amazon.com>
This commit is contained in:
Geoffrey Lawson 2021-06-29 01:04:04 +09:00 committed by GitHub
parent 3d833fdb66
commit 834041f286
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 233 additions and 0 deletions

View File

@ -17,13 +17,22 @@
package org.apache.lucene.analysis.core;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
@ -314,5 +323,229 @@ public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
11);
}
// The end node that the long path ("abc") is supposed to flatten over doesn't exist:
// the input has no token for the first step of the alternate path.
// assert disabled = pos length of abc = 4
// assert enabled = AssertionError: outputEndNode=3 vs inputTo=2
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
public void testAltPathFirstStepHole() throws Exception {
  // Input graph: token(term, posInc, posLength, startOffset, endOffset)
  Token[] inputTokens = {
    token("abc", 1, 3, 0, 3), token("b", 1, 1, 1, 2), token("c", 1, 1, 2, 3)
  };
  TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, 3, inputTokens));

  String[] expectedTerms = {"abc", "b", "c"};
  int[] expectedStartOffsets = {0, 1, 2};
  int[] expectedEndOffsets = {3, 2, 3};
  int[] expectedPosIncrements = {1, 1, 1};
  int[] expectedPosLengths = {3, 1, 1};

  assertTokenStreamContents(
      flattened,
      expectedTerms,
      expectedStartOffsets,
      expectedEndOffsets,
      expectedPosIncrements,
      expectedPosLengths,
      3);
}
// The last node in an alt path releases the long path, but it doesn't exist in this graph.
// pos length of abc = 1
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
public void testAltPathLastStepHole() throws Exception {
  // Input graph: token(term, posInc, posLength, startOffset, endOffset)
  Token[] inputTokens = {
    token("abc", 1, 3, 0, 3),
    token("a", 0, 1, 0, 1),
    token("b", 1, 1, 1, 2),
    token("d", 2, 1, 3, 4)
  };
  TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, 4, inputTokens));

  String[] expectedTerms = {"abc", "a", "b", "d"};
  int[] expectedStartOffsets = {0, 0, 1, 3};
  int[] expectedEndOffsets = {1, 1, 2, 4};
  int[] expectedPosIncrements = {1, 0, 1, 2};
  int[] expectedPosLengths = {3, 1, 1, 1};

  assertTokenStreamContents(
      flattened,
      expectedTerms,
      expectedStartOffsets,
      expectedEndOffsets,
      expectedPosIncrements,
      expectedPosLengths,
      4);
}
// A position increment greater than 2 gets squashed to 2
// ("hole" enters with posInc 5 and is expected to come out with posInc 2).
public void testLongHole() throws Exception {
  // Input: token(term, posInc, posLength, startOffset, endOffset)
  Token[] inputTokens = {
    token("hello", 1, 1, 0, 5), token("hole", 5, 1, 20, 24), token("fun", 1, 1, 25, 28),
  };
  TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, 28, inputTokens));

  String[] expectedTerms = {"hello", "hole", "fun"};
  int[] expectedStartOffsets = {0, 20, 25};
  int[] expectedEndOffsets = {5, 24, 28};
  int[] expectedPosIncrements = {1, 2, 1};
  int[] expectedPosLengths = {1, 1, 1};

  assertTokenStreamContents(
      flattened,
      expectedTerms,
      expectedStartOffsets,
      expectedEndOffsets,
      expectedPosIncrements,
      expectedPosLengths,
      28);
}
// Multiple nodes are missing in the alt path. The last edge shows up after the long edge and
// the short edge, which looks good, but the output graph isn't flat.
// assert disabled = nothing
// assert enabled = AssertionError
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
public void testAltPathLastStepLongHole() throws Exception {
  // Input graph: token(term, posInc, posLength, startOffset, endOffset)
  Token[] inputTokens = {
    token("abc", 1, 3, 0, 3), token("a", 0, 1, 0, 1), token("d", 3, 1, 3, 4)
  };
  TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, 4, inputTokens));

  String[] expectedTerms = {"abc", "a", "d"};
  int[] expectedStartOffsets = {0, 0, 3};
  int[] expectedEndOffsets = {1, 1, 4};
  int[] expectedPosIncrements = {1, 0, 1};
  int[] expectedPosLengths = {1, 1, 1};

  assertTokenStreamContents(
      flattened,
      expectedTerms,
      expectedStartOffsets,
      expectedEndOffsets,
      expectedPosIncrements,
      expectedPosLengths,
      4);
}
// LUCENE-8723
// The token stream ends without the last node of the alt path ever showing up.
// assert disabled = dropped token
// assert enabled = AssertionError: 2
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
public void testAltPathLastStepHoleWithoutEndToken() throws Exception {
  // Input graph: token(term, posInc, posLength, startOffset, endOffset)
  Token[] inputTokens = {
    token("abc", 1, 3, 0, 3), token("a", 0, 1, 0, 1), token("b", 1, 1, 1, 2)
  };
  TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, 2, inputTokens));

  String[] expectedTerms = {"abc", "a", "b"};
  int[] expectedStartOffsets = {0, 0, 1};
  int[] expectedEndOffsets = {1, 1, 2};
  int[] expectedPosIncrements = {1, 0, 1};
  int[] expectedPosLengths = {1, 1, 1};

  assertTokenStreamContents(
      flattened,
      expectedTerms,
      expectedStartOffsets,
      expectedEndOffsets,
      expectedPosIncrements,
      expectedPosLengths,
      2);
}
/**
 * Builds a CharsRef made of 2 or 3 randomly chosen tokens, joined with {@link
 * SynonymMap.Builder#join}.
 *
 * @param tokens vocabulary of tokens to choose from
 * @param charsRefBuilder builder handed to {@code SynonymMap.Builder.join} and used to produce
 *     the returned CharsRef
 * @param random source of randomness for both the token count and the token selection
 * @return CharsRef containing 2 or 3 tokens
 */
private CharsRef buildMultiTokenCharsRef(
    String[] tokens, CharsRefBuilder charsRefBuilder, Random random) {
  // nextInt(2) yields 0 or 1, so the phrase is 2 or 3 tokens long.
  int srcLen = random.nextInt(2) + 2;
  String[] srcTokens = new String[srcLen];
  for (int pos = 0; pos < srcLen; pos++) {
    // Draw from the supplied Random (not the test framework's random()) so the caller
    // fully controls the randomness used by this helper.
    srcTokens[pos] = tokens[random.nextInt(tokens.length)];
  }
  SynonymMap.Builder.join(srcTokens, charsRefBuilder);
  return charsRefBuilder.toCharsRef();
}
// Create a random graph then delete some edges to see if we can trip up FlattenGraphFilter.
// The graph comes from random synonym rules (SynonymGraphFilter); the edges are deleted by a
// StopFilter with a random stop word set.
// Is there some way we can do this and validate output nodes?
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
public void testRandomGraphs() throws Exception {
  String[] baseTokens = new String[] {"t1", "t2", "t3", "t4"};
  String[] synTokens = new String[] {"s1", "s2", "s3", "s4"};

  SynonymMap.Builder mapBuilder = new SynonymMap.Builder();
  CharsRefBuilder charRefBuilder = new CharsRefBuilder();
  Random random = random();

  // between 10 and 19 synonym entries
  int synCount = random.nextInt(10) + 10;
  for (int i = 0; i < synCount; i++) {
    int type = random.nextInt(4);
    CharsRef src;
    CharsRef dest;
    // The builder is cleared after every use so each CharsRef starts from an empty buffer.
    switch (type) {
      case 0:
        // 1:1
        src = charRefBuilder.append(baseTokens[random.nextInt(baseTokens.length)]).toCharsRef();
        charRefBuilder.clear();
        dest = charRefBuilder.append(synTokens[random.nextInt(synTokens.length)]).toCharsRef();
        charRefBuilder.clear();
        break;
      case 1:
        // many:1
        src = buildMultiTokenCharsRef(baseTokens, charRefBuilder, random);
        charRefBuilder.clear();
        dest = charRefBuilder.append(synTokens[random.nextInt(synTokens.length)]).toCharsRef();
        charRefBuilder.clear();
        break;
      case 2:
        // 1:many
        src = charRefBuilder.append(baseTokens[random.nextInt(baseTokens.length)]).toCharsRef();
        charRefBuilder.clear();
        dest = buildMultiTokenCharsRef(synTokens, charRefBuilder, random);
        charRefBuilder.clear();
        break;
      default:
        // many:many
        src = buildMultiTokenCharsRef(baseTokens, charRefBuilder, random);
        charRefBuilder.clear();
        dest = buildMultiTokenCharsRef(synTokens, charRefBuilder, random);
        charRefBuilder.clear();
    }
    mapBuilder.add(src, dest, true);
  }
  SynonymMap synMap = mapBuilder.build();

  // between 1 and 4 distinct stop words, drawn from the base and synonym vocabularies
  int stopWordCount = random.nextInt(4) + 1;
  CharArraySet stopWords = new CharArraySet(stopWordCount, true);
  while (stopWords.size() < stopWordCount) {
    // A single index spans both vocabularies: the low range picks a base token, the rest
    // picks a synonym token.
    int index = random.nextInt(baseTokens.length + synTokens.length);
    String[] tokenArray = baseTokens;
    if (index >= baseTokens.length) {
      index -= baseTokens.length;
      tokenArray = synTokens;
    }
    stopWords.add(tokenArray[index]);
  }

  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer in = new WhitespaceTokenizer();
          TokenStream result = new SynonymGraphFilter(in, synMap, true);
          result = new StopFilter(result, stopWords);
          result = new FlattenGraphFilter(result);
          return new TokenStreamComponents(in, result);
        }
      };

  try {
    // between 20 and 39 base tokens, joined with single spaces
    int tokenCount = random.nextInt(20) + 20;
    List<String> stringTokens = new ArrayList<>();
    while (stringTokens.size() < tokenCount) {
      stringTokens.add(baseTokens[random.nextInt(baseTokens.length)]);
    }
    String text = String.join(" ", stringTokens);

    checkAnalysisConsistency(random, a, false, text);
  } finally {
    // Release the analyzer even when the consistency check throws.
    a.close();
  }
}
// NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter
}