Improved percolator's random candidate query duel test and fixed bugs that were exposed by this:

* Duplicate leaf queries were not detected in a multi-level boolean query.
* Tracking fields for numeric range queries did not work properly.
* The sorting that was used to find the least restrictive clauses in
  a disjunction query did not work either.
Martijn van Groningen 2018-02-26 16:03:38 +01:00
parent bc8b3fc71c
commit bcfb7ab591
3 changed files with 199 additions and 98 deletions
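
For context, a minimal sketch (not part of this commit; the field name and helper are hypothetical) of how the percolator's candidate selection builds on Lucene's CoveringQuery. The bugs fixed here all over-counted the indexed minimum_should_match value; when that value exceeds the number of extraction clauses that can actually match, CoveringQuery never selects the percolator query as a candidate, producing false negatives:

import java.util.List;
import java.util.stream.Collectors;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CoveringQuery;
import org.apache.lucene.search.LongValuesSource;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;

class CandidateSelectionSketch {
    // One term query per term extracted from the document being percolated;
    // the per-query stored minimum_should_match value decides how many of
    // these clauses must match for the query to become a candidate.
    static Query candidateQuery(List<BytesRef> extractedTerms) {
        List<Query> clauses = extractedTerms.stream()
            .map(t -> (Query) new TermQuery(new Term("extracted_terms", t)))
            .collect(Collectors.toList());
        return new CoveringQuery(clauses, LongValuesSource.fromLongField("minimum_should_match"));
    }
}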

QueryAnalyzer.java

@@ -54,6 +54,7 @@ import org.elasticsearch.index.search.ESToParentBlockJoinQuery;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -61,6 +62,7 @@ import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.BiFunction;
import java.util.stream.Collectors;
import static java.util.stream.Collectors.toSet;
@@ -366,36 +368,38 @@ final class QueryAnalyzer {
Set<QueryExtraction> extractions = new HashSet<>();
Set<String> seenRangeFields = new HashSet<>();
for (Result result : results) {
QueryExtraction[] t = result.extractions.toArray(new QueryExtraction[1]);
if (result.extractions.size() == 1 && t[0].range != null) {
// In case of range queries each extraction does not simply increment the minimum_should_match
// for that percolator query like for a term based extraction, so that can lead to more false
// positives for percolator queries with range queries than term based queries.
// This is because of the way number fields are extracted from the document to be percolated.
// Per field a single range is extracted, and if a percolator query has two or more range queries
// on the same field, then the minimum should match can be higher than the number of clauses that can match in the CoveringQuery.
// Therefore right now the minimum should match is incremented once per number field when processing
// the percolator query at index time.
if (seenRangeFields.add(t[0].range.fieldName)) {
msm += 1;
}
} else {
// In case that there are duplicate query extractions we need to be careful with incrementing msm,
// because that could lead to valid matches not becoming candidate matches:
// query: (field:val1 AND field:val2) AND (field:val2 AND field:val3)
// doc: field: val1 val2 val3
// So let's be protective and decrease the msm:
int resultMsm = result.minimumShouldMatch;
for (QueryExtraction queryExtraction : result.extractions) {
if (extractions.contains(queryExtraction)) {
// To protect against negative msm:
// (sub results could consist of disjunctions and conjunctions, and
// then we do not know which extraction contributed to msm)
resultMsm = Math.max(0, resultMsm - 1);
// In case that there are duplicate query extractions we need to be careful with incrementing msm,
// because that could lead to valid matches not becoming candidate matches:
// query: (field:val1 AND field:val2) AND (field:val2 AND field:val3)
// doc: field: val1 val2 val3
// So let's be protective and decrease the msm:
int resultMsm = result.minimumShouldMatch;
for (QueryExtraction queryExtraction : result.extractions) {
if (queryExtraction.range != null) {
// In case of range queries each extraction does not simply increment the minimum_should_match
// for that percolator query like for a term based extraction, so that can lead to more false
// positives for percolator queries with range queries than term based queries.
// This is because of the way number fields are extracted from the document to be percolated.
// Per field a single range is extracted, and if a percolator query has two or more range queries
// on the same field, then the minimum should match can be higher than the number of clauses that can match in the CoveringQuery.
// Therefore right now the minimum should match is incremented once per number field when processing
// the percolator query at index time.
if (seenRangeFields.add(queryExtraction.range.fieldName)) {
resultMsm = 1;
} else {
resultMsm = 0;
}
}
msm += resultMsm;
if (extractions.contains(queryExtraction)) {
// To protect against negative msm:
// (sub results could consist of disjunctions and conjunctions, and
// then we do not know which extraction contributed to msm)
resultMsm = Math.max(0, resultMsm - 1);
}
}
msm += resultMsm;
verified &= result.verified;
matchAllDocs &= result.matchAllDocs;
extractions.addAll(result.extractions);
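
A standalone sketch of the range rule implemented above (the Extraction class is hypothetical, and duplicate handling is omitted): within a conjunction, each numeric field contributes at most 1 to msm no matter how many range clauses it carries, because only a single range per field is extracted from the percolated document:

import java.util.HashSet;
import java.util.List;
import java.util.Set;

class ConjunctionMsmSketch {
    static class Extraction {
        final String rangeField; // null for term extractions
        Extraction(String rangeField) { this.rangeField = rangeField; }
    }

    // Each numeric field adds at most 1 to the conjunction's msm; counting a
    // field twice would demand more matches than the document can supply.
    static int rangeMsm(List<Extraction> extractions) {
        Set<String> seenRangeFields = new HashSet<>();
        int msm = 0;
        for (Extraction e : extractions) {
            if (e.rangeField != null && seenRangeFields.add(e.rangeField)) {
                msm += 1;
            }
        }
        return msm;
    }
}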
@@ -518,8 +522,7 @@ final class QueryAnalyzer {
private static Result handleDisjunction(List<Query> disjunctions, int requiredShouldClauses, boolean otherClauses,
Version version) {
// Keep track of the msm for each clause:
int[] msmPerClause = new int[disjunctions.size()];
String[] rangeFieldNames = new String[disjunctions.size()];
List<DisjunctionClause> clauses = new ArrayList<>(disjunctions.size());
boolean verified = otherClauses == false;
if (version.before(Version.V_6_1_0)) {
verified &= requiredShouldClauses <= 1;
@@ -535,17 +538,14 @@
}
int resultMsm = subResult.minimumShouldMatch;
for (QueryExtraction extraction : subResult.extractions) {
if (terms.contains(extraction)) {
resultMsm = Math.max(1, resultMsm - 1);
if (terms.add(extraction) == false) {
resultMsm = Math.max(0, resultMsm - 1);
}
}
msmPerClause[i] = resultMsm;
terms.addAll(subResult.extractions);
QueryExtraction[] t = subResult.extractions.toArray(new QueryExtraction[1]);
if (subResult.extractions.size() == 1 && t[0].range != null) {
rangeFieldNames[i] = t[0].range.fieldName;
}
clauses.add(new DisjunctionClause(resultMsm, subResult.extractions.stream()
.filter(extraction -> extraction.range != null)
.map(extraction -> extraction.range.fieldName)
.collect(toSet())));
}
boolean matchAllDocs = numMatchAllClauses > 0 && numMatchAllClauses >= requiredShouldClauses;
@@ -554,15 +554,20 @@
Set<String> seenRangeFields = new HashSet<>();
// Figure out what the combined msm is for this disjunction:
// (sum the lowest required clauses, otherwise we're too strict and queries may not match)
Arrays.sort(msmPerClause);
int limit = Math.min(msmPerClause.length, Math.max(1, requiredShouldClauses));
clauses = clauses.stream()
.filter(o -> o.msm > 0)
.sorted(Comparator.comparingInt(o -> o.msm))
.collect(Collectors.toList());
int limit = Math.min(clauses.size(), Math.max(1, requiredShouldClauses));
for (int i = 0; i < limit; i++) {
if (rangeFieldNames[i] != null) {
if (seenRangeFields.add(rangeFieldNames[i])) {
msm += 1;
if (clauses.get(i).rangeFieldNames.isEmpty() == false) {
for (String rangeField: clauses.get(i).rangeFieldNames) {
if (seenRangeFields.add(rangeField)) {
msm += 1;
}
}
} else {
msm += msmPerClause[i];
msm += clauses.get(i).msm;
}
}
} else {
@@ -575,6 +580,17 @@
}
}
static class DisjunctionClause {
final int msm;
final Set<String> rangeFieldNames;
DisjunctionClause(int msm, Set<String> rangeFieldNames) {
this.msm = msm;
this.rangeFieldNames = rangeFieldNames;
}
}
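
The disjunction fix above boils down to the following rule (shown as a hypothetical standalone helper): sort the per-clause msm values ascending and sum only the smallest `requiredShouldClauses` of them, since any subset of that size may satisfy the disjunction; summing larger values would make the CoveringQuery too strict and reject valid candidates:

import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

class DisjunctionMsmSketch {
    // Sum the cheapest combination of clauses that can satisfy the
    // disjunction; clauses with msm == 0 are skipped, mirroring the
    // filter in handleDisjunction above.
    static int disjunctionMsm(List<Integer> msmPerClause, int requiredShouldClauses) {
        List<Integer> sorted = msmPerClause.stream()
            .filter(m -> m > 0)
            .sorted(Comparator.naturalOrder())
            .collect(Collectors.toList());
        int limit = Math.min(sorted.size(), Math.max(1, requiredShouldClauses));
        int msm = 0;
        for (int i = 0; i < limit; i++) {
            msm += sorted.get(i);
        }
        return msm;
    }
}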
static Set<QueryExtraction> selectBestExtraction(Set<QueryExtraction> extractions1, Set<QueryExtraction> extractions2) {
assert extractions1 != null || extractions2 != null;
if (extractions1 == null) {

CandidateQueryTests.java

@@ -28,6 +28,7 @@ import org.apache.lucene.document.HalfFloatPoint;
import org.apache.lucene.document.InetAddressPoint;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
@@ -36,7 +37,9 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queries.BlendedTermQuery;
@@ -44,12 +47,10 @@ import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.CoveringQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.FilterScorer;
import org.apache.lucene.search.FilteredDocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
@@ -76,11 +77,14 @@ import org.elasticsearch.common.CheckedFunction;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.geo.ShapeRelation;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.NumberFieldMapper;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESSingleNodeTestCase;
@@ -166,59 +170,53 @@ public class CandidateQueryTests extends ESSingleNodeTestCase {
public void testDuel() throws Exception {
int numFields = randomIntBetween(1, 3);
Map<String, List<String>> content = new HashMap<>();
Map<String, List<String>> stringContent = new HashMap<>();
for (int i = 0; i < numFields; i++) {
int numTokens = randomIntBetween(1, 64);
List<String> values = new ArrayList<>();
for (int j = 0; j < numTokens; j++) {
values.add(randomAlphaOfLength(8));
}
content.put("field" + i, values);
stringContent.put("field" + i, values);
}
List<String> fields = new ArrayList<>(content.keySet());
List<String> stringFields = new ArrayList<>(stringContent.keySet());
int numValues = randomIntBetween(16, 64);
List<Integer> intValues = new ArrayList<>(numValues);
for (int j = 0; j < numValues; j++) {
intValues.add(randomInt());
}
Collections.sort(intValues);
MappedFieldType intFieldType = mapperService.documentMapper("type").mappers()
.getMapper("int_field").fieldType();
List<Supplier<Query>> queryFunctions = new ArrayList<>();
queryFunctions.add(MatchNoDocsQuery::new);
queryFunctions.add(MatchAllDocsQuery::new);
queryFunctions.add(() -> new TermQuery(new Term("unknown_field", "value")));
String field1 = randomFrom(fields);
queryFunctions.add(() -> new TermQuery(new Term(field1, randomFrom(content.get(field1)))));
String field2 = randomFrom(fields);
queryFunctions.add(() -> new TermQuery(new Term(field2, randomFrom(content.get(field2)))));
queryFunctions.add(() -> new TermInSetQuery(field1, new BytesRef(randomFrom(content.get(field1))),
new BytesRef(randomFrom(content.get(field1)))));
queryFunctions.add(() -> new TermInSetQuery(field2, new BytesRef(randomFrom(content.get(field1))),
new BytesRef(randomFrom(content.get(field1)))));
queryFunctions.add(() -> {
BooleanQuery.Builder builder = new BooleanQuery.Builder();
int numClauses = randomIntBetween(1, 16);
for (int i = 0; i < numClauses; i++) {
if (rarely()) {
if (randomBoolean()) {
Occur occur = randomFrom(Arrays.asList(Occur.FILTER, Occur.MUST, Occur.SHOULD));
builder.add(new TermQuery(new Term("unknown_field", randomAlphaOfLength(8))), occur);
} else {
String field = randomFrom(fields);
builder.add(new TermQuery(new Term(field, randomFrom(content.get(field)))), Occur.MUST_NOT);
}
} else {
if (randomBoolean()) {
Occur occur = randomFrom(Arrays.asList(Occur.FILTER, Occur.MUST, Occur.SHOULD));
String field = randomFrom(fields);
builder.add(new TermQuery(new Term(field, randomFrom(content.get(field)))), occur);
} else {
builder.add(new TermQuery(new Term("unknown_field", randomAlphaOfLength(8))), Occur.MUST_NOT);
}
}
}
return builder.build();
});
String field1 = randomFrom(stringFields);
queryFunctions.add(() -> new TermQuery(new Term(field1, randomFrom(stringContent.get(field1)))));
String field2 = randomFrom(stringFields);
queryFunctions.add(() -> new TermQuery(new Term(field2, randomFrom(stringContent.get(field2)))));
queryFunctions.add(() -> intFieldType.termQuery(randomFrom(intValues), null));
queryFunctions.add(() -> intFieldType.termsQuery(Arrays.asList(randomFrom(intValues), randomFrom(intValues)), null));
queryFunctions.add(() -> intFieldType.rangeQuery(intValues.get(4), intValues.get(intValues.size() - 4), true,
true, ShapeRelation.WITHIN, null, null, null));
queryFunctions.add(() -> new TermInSetQuery(field1, new BytesRef(randomFrom(stringContent.get(field1))),
new BytesRef(randomFrom(stringContent.get(field1)))));
queryFunctions.add(() -> new TermInSetQuery(field2, new BytesRef(randomFrom(stringContent.get(field1))),
new BytesRef(randomFrom(stringContent.get(field1)))));
int numRandomBoolQueries = randomIntBetween(16, 32);
for (int i = 0; i < numRandomBoolQueries; i++) {
queryFunctions.add(() -> createRandomBooleanQuery(1, stringFields, stringContent, intFieldType, intValues));
}
queryFunctions.add(() -> {
int numClauses = randomIntBetween(1, 16);
List<Query> clauses = new ArrayList<>();
for (int i = 0; i < numClauses; i++) {
String field = randomFrom(fields);
clauses.add(new TermQuery(new Term(field, randomFrom(content.get(field)))));
String field = randomFrom(stringFields);
clauses.add(new TermQuery(new Term(field, randomFrom(stringContent.get(field)))));
}
return new DisjunctionMaxQuery(clauses, 0.01f);
});
@@ -237,14 +235,75 @@ public class CandidateQueryTests extends ESSingleNodeTestCase {
shardSearcher.setQueryCache(null);
Document document = new Document();
for (Map.Entry<String, List<String>> entry : content.entrySet()) {
for (Map.Entry<String, List<String>> entry : stringContent.entrySet()) {
String value = entry.getValue().stream().collect(Collectors.joining(" "));
document.add(new TextField(entry.getKey(), value, Field.Store.NO));
}
for (Integer intValue : intValues) {
List<Field> numberFields =
NumberFieldMapper.NumberType.INTEGER.createFields("int_field", intValue, true, true, false);
for (Field numberField : numberFields) {
document.add(numberField);
}
}
MemoryIndex memoryIndex = MemoryIndex.fromDocument(document, new WhitespaceAnalyzer());
duelRun(queryStore, memoryIndex, shardSearcher);
}
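
Stripped of randomization, the duel reduces to this comparison (a simplified sketch; the real duelRun also compares individual hits and scores): the candidate-based percolate query must return exactly the hits of a brute-force control that evaluates every stored query against the in-memory document:

import java.io.IOException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

class DuelSketch {
    static void duel(IndexSearcher shardSearcher, Query percolateQuery, Query controlQuery) throws IOException {
        TopDocs topDocs = shardSearcher.search(percolateQuery, 100);
        TopDocs controlTopDocs = shardSearcher.search(controlQuery, 100);
        if (topDocs.totalHits != controlTopDocs.totalHits) {
            // A difference here means candidate selection dropped or added queries.
            throw new AssertionError("candidate-based query diverged from control");
        }
    }
}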
private BooleanQuery createRandomBooleanQuery(int depth, List<String> fields, Map<String, List<String>> content,
MappedFieldType intFieldType, List<Integer> intValues) {
BooleanQuery.Builder builder = new BooleanQuery.Builder();
int numClauses = randomIntBetween(1, 16);
int numShouldClauses = 0;
boolean onlyShouldClauses = rarely();
for (int i = 0; i < numClauses; i++) {
Occur occur;
if (onlyShouldClauses) {
occur = Occur.SHOULD;
if (randomBoolean()) {
String field = randomFrom(fields);
builder.add(new TermQuery(new Term(field, randomFrom(content.get(field)))), occur);
} else {
builder.add(intFieldType.termQuery(randomFrom(intValues), null), occur);
}
} else if (rarely() && depth <= 3) {
occur = randomFrom(Arrays.asList(Occur.FILTER, Occur.MUST, Occur.SHOULD));
builder.add(createRandomBooleanQuery(depth + 1, fields, content, intFieldType, intValues), occur);
} else if (rarely()) {
if (randomBoolean()) {
occur = randomFrom(Arrays.asList(Occur.FILTER, Occur.MUST, Occur.SHOULD));
if (randomBoolean()) {
builder.add(new TermQuery(new Term("unknown_field", randomAlphaOfLength(8))), occur);
} else {
builder.add(intFieldType.termQuery(randomFrom(intValues), null), occur);
}
} else if (randomBoolean()) {
String field = randomFrom(fields);
builder.add(new TermQuery(new Term(field, randomFrom(content.get(field)))), occur = Occur.MUST_NOT);
} else {
builder.add(intFieldType.termQuery(randomFrom(intValues), null), occur = Occur.MUST_NOT);
}
} else {
if (randomBoolean()) {
occur = randomFrom(Arrays.asList(Occur.FILTER, Occur.MUST, Occur.SHOULD));
if (randomBoolean()) {
String field = randomFrom(fields);
builder.add(new TermQuery(new Term(field, randomFrom(content.get(field)))), occur);
} else {
builder.add(intFieldType.termQuery(randomFrom(intValues), null), occur);
}
} else {
builder.add(new TermQuery(new Term("unknown_field", randomAlphaOfLength(8))), occur = Occur.MUST_NOT);
}
}
if (occur == Occur.SHOULD) {
numShouldClauses++;
}
}
builder.setMinimumNumberShouldMatch(numShouldClauses);
return builder.build();
}
public void testDuelIdBased() throws Exception {
List<Function<String, Query>> queryFunctions = new ArrayList<>();
queryFunctions.add((id) -> new PrefixQuery(new Term("field", id)));
@@ -766,11 +825,11 @@ public class CandidateQueryTests extends ESSingleNodeTestCase {
Query percolateQuery = fieldType.percolateQuery("_name", queryStore,
Collections.singletonList(new BytesArray("{}")), percolateSearcher, Version.CURRENT);
Query query = requireScore ? percolateQuery : new ConstantScoreQuery(percolateQuery);
TopDocs topDocs = shardSearcher.search(query, 10);
TopDocs topDocs = shardSearcher.search(query, 100);
Query controlQuery = new ControlQuery(memoryIndex, queryStore);
controlQuery = requireScore ? controlQuery : new ConstantScoreQuery(controlQuery);
TopDocs controlTopDocs = shardSearcher.search(controlQuery, 10);
TopDocs controlTopDocs = shardSearcher.search(controlQuery, 100);
try {
assertThat(topDocs.totalHits, equalTo(controlTopDocs.totalHits));
@@ -793,22 +852,39 @@ public class CandidateQueryTests extends ESSingleNodeTestCase {
logger.error("controlTopDocs.scoreDocs.length={}", controlTopDocs.scoreDocs.length);
for (int i = 0; i < topDocs.scoreDocs.length; i++) {
logger.error("topDocs.scoreDocs[j].doc={}", topDocs.scoreDocs[i].doc);
logger.error("topDocs.scoreDocs[j].score={}", topDocs.scoreDocs[i].score);
logger.error("topDocs.scoreDocs[{}].doc={}", i, topDocs.scoreDocs[i].doc);
logger.error("topDocs.scoreDocs[{}].score={}", i, topDocs.scoreDocs[i].score);
}
for (int i = 0; i < controlTopDocs.scoreDocs.length; i++) {
logger.error("controlTopDocs.scoreDocs[j].doc={}", controlTopDocs.scoreDocs[i].doc);
logger.error("controlTopDocs.scoreDocs[j].score={}", controlTopDocs.scoreDocs[i].score);
logger.error("controlTopDocs.scoreDocs[{}].doc={}", i, controlTopDocs.scoreDocs[i].doc);
logger.error("controlTopDocs.scoreDocs[{}].score={}", i, controlTopDocs.scoreDocs[i].score);
// Additional stored information that is useful when debugging:
String queryToString = shardSearcher.doc(controlTopDocs.scoreDocs[i].doc).get("query_to_string");
logger.error("topDocs.scoreDocs[{}].query_to_string={}", i, queryToString);
NumericDocValues numericValues =
MultiDocValues.getNumericValues(shardSearcher.getIndexReader(), fieldType.minimumShouldMatchField.name());
boolean exact = numericValues.advanceExact(controlTopDocs.scoreDocs[i].doc);
if (exact) {
logger.error("controlTopDocs.scoreDocs[{}].minimum_should_match_field={}", i, numericValues.longValue());
} else {
// Some queries do not have a msm field. (e.g. unsupported queries)
logger.error("controlTopDocs.scoreDocs[{}].minimum_should_match_field=[NO_VALUE]", i);
}
}
throw ae;
}
}
private void addQuery(Query query, List<ParseContext.Document> docs) throws IOException {
private void addQuery(Query query, List<ParseContext.Document> docs) {
ParseContext.InternalParseContext parseContext = new ParseContext.InternalParseContext(Settings.EMPTY,
mapperService.documentMapperParser(), documentMapper, null, null);
fieldMapper.processQuery(query, parseContext);
docs.add(parseContext.doc());
ParseContext.Document queryDocument = parseContext.doc();
// Add to string representation of the query to make debugging easier:
queryDocument.add(new StoredField("query_to_string", query.toString()));
docs.add(queryDocument);
queries.add(query);
}
@@ -865,8 +941,6 @@ public class CandidateQueryTests extends ESSingleNodeTestCase {
final IndexSearcher percolatorIndexSearcher = memoryIndex.createSearcher();
return new Weight(this) {
float _score;
@Override
public void extractTerms(Set<Term> terms) {}
@@ -889,6 +963,7 @@ public class CandidateQueryTests extends ESSingleNodeTestCase {
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
float _score[] = new float[]{boost};
DocIdSetIterator allDocs = DocIdSetIterator.all(context.reader().maxDoc());
CheckedFunction<Integer, Query, IOException> leaf = queryStore.getQueries(context);
FilteredDocIdSetIterator memoryIndexIterator = new FilteredDocIdSetIterator(allDocs) {
@@ -900,7 +975,7 @@ public class CandidateQueryTests extends ESSingleNodeTestCase {
TopDocs topDocs = percolatorIndexSearcher.search(query, 1);
if (topDocs.totalHits > 0) {
if (needsScores) {
_score = topDocs.scoreDocs[0].score;
_score[0] = topDocs.scoreDocs[0].score;
}
return true;
} else {
@@ -911,11 +986,21 @@ public class CandidateQueryTests extends ESSingleNodeTestCase {
}
}
};
return new FilterScorer(new ConstantScoreScorer(this, 1f, memoryIndexIterator)) {
return new Scorer(this) {
@Override
public int docID() {
return memoryIndexIterator.docID();
}
@Override
public DocIdSetIterator iterator() {
return memoryIndexIterator;
}
@Override
public float score() throws IOException {
return _score;
return _score[0];
}
};
}
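
The switch from a FilterScorer with a mutable _score field to a plain Scorer relies on the one-element array idiom: locals captured by an anonymous inner class or lambda must be effectively final, so the array reference stays final while its contents can be updated from the iterator callback. A minimal illustration:

class CapturedScoreSketch {
    static float currentScore() {
        final float[] score = new float[]{1.0f};
        Runnable update = () -> score[0] = 2.0f; // legal: `score` itself never changes
        update.run();
        return score[0]; // 2.0f
    }
}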

PercolatorFieldMapperTests.java

@@ -922,7 +922,7 @@ public class PercolatorFieldMapperTests extends ESSingleNodeTestCase {
assertThat(values.get(3), equalTo("field\0value4"));
assertThat(values.get(4), equalTo("field\0value5"));
msm = doc.rootDoc().getFields(fieldType.minimumShouldMatchField.name())[0].numericValue().intValue();
assertThat(msm, equalTo(3));
assertThat(msm, equalTo(1));
}
private static byte[] subByteArray(byte[] source, int offset, int length) {