mirror of https://github.com/apache/lucene.git
LUCENE-9963 Add tests for alternate path failures in FlattenGraphFilter (#146)
Co-authored-by: Lawson <geoffrl@amazon.com>
This commit is contained in:
parent
3d833fdb66
commit
834041f286
|
@ -17,13 +17,22 @@
|
|||
|
||||
package org.apache.lucene.analysis.core;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CannedTokenStream;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
|
||||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
|
||||
public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
|
@ -314,5 +323,229 @@ public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
|
|||
11);
|
||||
}
|
||||
|
||||
// The end node the long path is supposed to flatten over doesn't exist
|
||||
// assert disabled = pos length of abc = 4
|
||||
// assert enabled = AssertionError: outputEndNode=3 vs inputTo=2
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
|
||||
public void testAltPathFirstStepHole() throws Exception {
|
||||
TokenStream in =
|
||||
new CannedTokenStream(
|
||||
0,
|
||||
3,
|
||||
new Token[] {token("abc", 1, 3, 0, 3), token("b", 1, 1, 1, 2), token("c", 1, 1, 2, 3)});
|
||||
|
||||
TokenStream out = new FlattenGraphFilter(in);
|
||||
|
||||
assertTokenStreamContents(
|
||||
out,
|
||||
new String[] {"abc", "b", "c"},
|
||||
new int[] {0, 1, 2},
|
||||
new int[] {3, 2, 3},
|
||||
new int[] {1, 1, 1},
|
||||
new int[] {3, 1, 1},
|
||||
3);
|
||||
}
|
||||
|
||||
// Last node in an alt path releases the long path. but it doesn't exist in this graph
|
||||
// pos length of abc = 1
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
|
||||
public void testAltPathLastStepHole() throws Exception {
|
||||
TokenStream in =
|
||||
new CannedTokenStream(
|
||||
0,
|
||||
4,
|
||||
new Token[] {
|
||||
token("abc", 1, 3, 0, 3),
|
||||
token("a", 0, 1, 0, 1),
|
||||
token("b", 1, 1, 1, 2),
|
||||
token("d", 2, 1, 3, 4)
|
||||
});
|
||||
|
||||
TokenStream out = new FlattenGraphFilter(in);
|
||||
|
||||
assertTokenStreamContents(
|
||||
out,
|
||||
new String[] {"abc", "a", "b", "d"},
|
||||
new int[] {0, 0, 1, 3},
|
||||
new int[] {1, 1, 2, 4},
|
||||
new int[] {1, 0, 1, 2},
|
||||
new int[] {3, 1, 1, 1},
|
||||
4);
|
||||
}
|
||||
|
||||
// Posinc >2 gets squashed to 2
|
||||
public void testLongHole() throws Exception {
|
||||
TokenStream in =
|
||||
new CannedTokenStream(
|
||||
0,
|
||||
28,
|
||||
new Token[] {
|
||||
token("hello", 1, 1, 0, 5), token("hole", 5, 1, 20, 24), token("fun", 1, 1, 25, 28),
|
||||
});
|
||||
|
||||
TokenStream out = new FlattenGraphFilter(in);
|
||||
|
||||
assertTokenStreamContents(
|
||||
out,
|
||||
new String[] {"hello", "hole", "fun"},
|
||||
new int[] {0, 20, 25},
|
||||
new int[] {5, 24, 28},
|
||||
new int[] {1, 2, 1},
|
||||
new int[] {1, 1, 1},
|
||||
28);
|
||||
}
|
||||
|
||||
// multiple nodes missing in the alt path. Last edge shows up after long edge and short edge,
|
||||
// which looks good but the output graph isn't flat.
|
||||
// assert disabled = nothing
|
||||
// assert enabled = AssertionError
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
|
||||
public void testAltPathLastStepLongHole() throws Exception {
|
||||
TokenStream in =
|
||||
new CannedTokenStream(
|
||||
0,
|
||||
4,
|
||||
new Token[] {token("abc", 1, 3, 0, 3), token("a", 0, 1, 0, 1), token("d", 3, 1, 3, 4)});
|
||||
|
||||
TokenStream out = new FlattenGraphFilter(in);
|
||||
|
||||
assertTokenStreamContents(
|
||||
out,
|
||||
new String[] {"abc", "a", "d"},
|
||||
new int[] {0, 0, 3},
|
||||
new int[] {1, 1, 4},
|
||||
new int[] {1, 0, 1},
|
||||
new int[] {1, 1, 1},
|
||||
4);
|
||||
}
|
||||
|
||||
// LUCENE-8723
|
||||
// Token stream ends without last node showing up
|
||||
// assert disabled = dropped token
|
||||
// assert enabled = AssertionError: 2
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
|
||||
public void testAltPathLastStepHoleWithoutEndToken() throws Exception {
|
||||
TokenStream in =
|
||||
new CannedTokenStream(
|
||||
0,
|
||||
2,
|
||||
new Token[] {token("abc", 1, 3, 0, 3), token("a", 0, 1, 0, 1), token("b", 1, 1, 1, 2)});
|
||||
|
||||
TokenStream out = new FlattenGraphFilter(in);
|
||||
|
||||
assertTokenStreamContents(
|
||||
out,
|
||||
new String[] {"abc", "a", "b"},
|
||||
new int[] {0, 0, 1},
|
||||
new int[] {1, 1, 2},
|
||||
new int[] {1, 0, 1},
|
||||
new int[] {1, 1, 1},
|
||||
2);
|
||||
}
|
||||
|
||||
/**
|
||||
* build CharsRef containing 2-4 tokens
|
||||
*
|
||||
* @param tokens vocabulary of tokens
|
||||
* @param charsRefBuilder CharsRefBuilder
|
||||
* @param random Random for selecting tokens
|
||||
* @return Charsref containing 2-4 tokens.
|
||||
*/
|
||||
private CharsRef buildMultiTokenCharsRef(
|
||||
String[] tokens, CharsRefBuilder charsRefBuilder, Random random) {
|
||||
int srcLen = random.nextInt(2) + 2;
|
||||
String[] srcTokens = new String[srcLen];
|
||||
for (int pos = 0; pos < srcLen; pos++) {
|
||||
srcTokens[pos] = tokens[random().nextInt(tokens.length)];
|
||||
}
|
||||
SynonymMap.Builder.join(srcTokens, charsRefBuilder);
|
||||
return charsRefBuilder.toCharsRef();
|
||||
}
|
||||
|
||||
// Create a random graph then delete some edges to see if we can trip up FlattenGraphFilter
// Is there some way we can do this and validate output nodes?
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
public void testRandomGraphs() throws Exception {
  String[] baseTokens = new String[] {"t1", "t2", "t3", "t4"};
  String[] synTokens = new String[] {"s1", "s2", "s3", "s4"};

  SynonymMap.Builder mapBuilder = new SynonymMap.Builder();
  CharsRefBuilder charRefBuilder = new CharsRefBuilder();
  Random random = random();

  // between 10 and 19 synonym entries (nextInt(10) yields 0-9)
  int synCount = random.nextInt(10) + 10;
  for (int i = 0; i < synCount; i++) {
    // Randomly choose the arity of each synonym rule: 1:1, many:1, 1:many, many:many.
    int type = random.nextInt(4);
    CharsRef src;
    CharsRef dest;
    switch (type) {
      case 0:
        // 1:1
        src = charRefBuilder.append(baseTokens[random.nextInt(baseTokens.length)]).toCharsRef();
        charRefBuilder.clear();
        dest = charRefBuilder.append(synTokens[random.nextInt(synTokens.length)]).toCharsRef();
        charRefBuilder.clear();
        break;
      case 1:
        // many:1
        src = buildMultiTokenCharsRef(baseTokens, charRefBuilder, random);
        charRefBuilder.clear();
        dest = charRefBuilder.append(synTokens[random.nextInt(synTokens.length)]).toCharsRef();
        charRefBuilder.clear();
        break;
      case 2:
        // 1:many
        src = charRefBuilder.append(baseTokens[random.nextInt(baseTokens.length)]).toCharsRef();
        charRefBuilder.clear();
        dest = buildMultiTokenCharsRef(synTokens, charRefBuilder, random);
        charRefBuilder.clear();
        break;
      default:
        // many:many
        src = buildMultiTokenCharsRef(baseTokens, charRefBuilder, random);
        charRefBuilder.clear();
        dest = buildMultiTokenCharsRef(synTokens, charRefBuilder, random);
        charRefBuilder.clear();
    }
    mapBuilder.add(src, dest, true);
  }

  SynonymMap synMap = mapBuilder.build();

  // Pick 1-4 distinct stop words from the combined base + synonym vocabulary;
  // removing them later punches holes in the token graph.
  int stopWordCount = random.nextInt(4) + 1;
  CharArraySet stopWords = new CharArraySet(stopWordCount, true);
  while (stopWords.size() < stopWordCount) {
    // Index into the concatenation of baseTokens and synTokens.
    int index = random.nextInt(baseTokens.length + synTokens.length);
    String[] tokenArray = baseTokens;
    if (index >= baseTokens.length) {
      index -= baseTokens.length;
      tokenArray = synTokens;
    }
    stopWords.add(tokenArray[index]);
  }

  // Pipeline under test: whitespace tokenizer -> synonym graph -> stop filter
  // (deletes edges) -> FlattenGraphFilter.
  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer in = new WhitespaceTokenizer();
          TokenStream result = new SynonymGraphFilter(in, synMap, true);
          result = new StopFilter(result, stopWords);
          result = new FlattenGraphFilter(result);
          return new TokenStreamComponents(in, result);
        }
      };

  // between 20 and 39 input tokens drawn from the base vocabulary
  int tokenCount = random.nextInt(20) + 20;
  List<String> stringTokens = new ArrayList<>();
  while (stringTokens.size() < tokenCount) {
    stringTokens.add(baseTokens[random.nextInt(baseTokens.length)]);
  }

  String text = String.join(" ", stringTokens);
  checkAnalysisConsistency(random, a, false, text);
}
|
||||
|
||||
// NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue