From bcfb7ab5916132181613c674f8e2b6423030eac5 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Mon, 26 Feb 2018 16:03:38 +0100 Subject: [PATCH] Improved percolator's random candidate query duel test and fixed bugs that were exposed by this: * Duplicates query leafs were not detected in a multi level boolean query * Tracking fields for numeric range queries did not work properly. * The sorting that was used to find the less restrictive clauses in disjunction query did not work too. --- .../percolator/QueryAnalyzer.java | 104 ++++++---- .../percolator/CandidateQueryTests.java | 191 +++++++++++++----- .../PercolatorFieldMapperTests.java | 2 +- 3 files changed, 199 insertions(+), 98 deletions(-) diff --git a/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java b/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java index 618dafc6e94..f07382e3dba 100644 --- a/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java +++ b/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java @@ -54,6 +54,7 @@ import org.elasticsearch.index.search.ESToParentBlockJoinQuery; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -61,6 +62,7 @@ import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.function.BiFunction; +import java.util.stream.Collectors; import static java.util.stream.Collectors.toSet; @@ -366,36 +368,38 @@ final class QueryAnalyzer { Set extractions = new HashSet<>(); Set seenRangeFields = new HashSet<>(); for (Result result : results) { - QueryExtraction[] t = result.extractions.toArray(new QueryExtraction[1]); - if (result.extractions.size() == 1 && t[0].range != null) { - // In case of range queries each extraction does not simply increment the minimum_should_match - // for that percolator query like for a term based extraction, so that can lead to more false - // positives for percolator queries with range queries than term based queries. - // The is because the way number fields are extracted from the document to be percolated. - // Per field a single range is extracted and if a percolator query has two or more range queries - // on the same field, then the minimum should match can be higher than clauses in the CoveringQuery. - // Therefore right now the minimum should match is incremented once per number field when processing - // the percolator query at index time. - if (seenRangeFields.add(t[0].range.fieldName)) { - msm += 1; - } - } else { - // In case that there are duplicate query extractions we need to be careful with incrementing msm, - // because that could lead to valid matches not becoming candidate matches: - // query: (field:val1 AND field:val2) AND (field:val2 AND field:val3) - // doc: field: val1 val2 val3 - // So lets be protective and decrease the msm: - int resultMsm = result.minimumShouldMatch; - for (QueryExtraction queryExtraction : result.extractions) { - if (extractions.contains(queryExtraction)) { - // To protect against negative msm: - // (sub results could consist out of disjunction and conjunction and - // then we do not know which extraction contributed to msm) - resultMsm = Math.max(0, resultMsm - 1); + // In case that there are duplicate query extractions we need to be careful with incrementing msm, + // because that could lead to valid matches not becoming candidate matches: + // query: (field:val1 AND field:val2) AND (field:val2 AND field:val3) + // doc: field: val1 val2 val3 + // So lets be protective and decrease the msm: + int resultMsm = result.minimumShouldMatch; + for (QueryExtraction queryExtraction : result.extractions) { + if (queryExtraction.range != null) { + // In case of range queries each extraction does not simply increment the minimum_should_match + // for that percolator query like for a term based extraction, so that can lead to more false + // positives for percolator queries with range queries than term based queries. + // The is because the way number fields are extracted from the document to be percolated. + // Per field a single range is extracted and if a percolator query has two or more range queries + // on the same field, then the minimum should match can be higher than clauses in the CoveringQuery. + // Therefore right now the minimum should match is incremented once per number field when processing + // the percolator query at index time. + if (seenRangeFields.add(queryExtraction.range.fieldName)) { + resultMsm = 1; + } else { + resultMsm = 0; } } - msm += resultMsm; + + if (extractions.contains(queryExtraction)) { + // To protect against negative msm: + // (sub results could consist out of disjunction and conjunction and + // then we do not know which extraction contributed to msm) + resultMsm = Math.max(0, resultMsm - 1); + } } + msm += resultMsm; + verified &= result.verified; matchAllDocs &= result.matchAllDocs; extractions.addAll(result.extractions); @@ -518,8 +522,7 @@ final class QueryAnalyzer { private static Result handleDisjunction(List disjunctions, int requiredShouldClauses, boolean otherClauses, Version version) { // Keep track of the msm for each clause: - int[] msmPerClause = new int[disjunctions.size()]; - String[] rangeFieldNames = new String[disjunctions.size()]; + List clauses = new ArrayList<>(disjunctions.size()); boolean verified = otherClauses == false; if (version.before(Version.V_6_1_0)) { verified &= requiredShouldClauses <= 1; @@ -535,17 +538,14 @@ final class QueryAnalyzer { } int resultMsm = subResult.minimumShouldMatch; for (QueryExtraction extraction : subResult.extractions) { - if (terms.contains(extraction)) { - resultMsm = Math.max(1, resultMsm - 1); + if (terms.add(extraction) == false) { + resultMsm = Math.max(0, resultMsm - 1); } } - msmPerClause[i] = resultMsm; - terms.addAll(subResult.extractions); - - QueryExtraction[] t = subResult.extractions.toArray(new QueryExtraction[1]); - if (subResult.extractions.size() == 1 && t[0].range != null) { - rangeFieldNames[i] = t[0].range.fieldName; - } + clauses.add(new DisjunctionClause(resultMsm, subResult.extractions.stream() + .filter(extraction -> extraction.range != null) + .map(extraction -> extraction.range.fieldName) + .collect(toSet()))); } boolean matchAllDocs = numMatchAllClauses > 0 && numMatchAllClauses >= requiredShouldClauses; @@ -554,15 +554,20 @@ final class QueryAnalyzer { Set seenRangeFields = new HashSet<>(); // Figure out what the combined msm is for this disjunction: // (sum the lowest required clauses, otherwise we're too strict and queries may not match) - Arrays.sort(msmPerClause); - int limit = Math.min(msmPerClause.length, Math.max(1, requiredShouldClauses)); + clauses = clauses.stream() + .filter(o -> o.msm > 0) + .sorted(Comparator.comparingInt(o -> o.msm)) + .collect(Collectors.toList()); + int limit = Math.min(clauses.size(), Math.max(1, requiredShouldClauses)); for (int i = 0; i < limit; i++) { - if (rangeFieldNames[i] != null) { - if (seenRangeFields.add(rangeFieldNames[i])) { - msm += 1; + if (clauses.get(i).rangeFieldNames.isEmpty() == false) { + for (String rangeField: clauses.get(i).rangeFieldNames) { + if (seenRangeFields.add(rangeField)) { + msm += 1; + } } } else { - msm += msmPerClause[i]; + msm += clauses.get(i).msm; } } } else { @@ -575,6 +580,17 @@ final class QueryAnalyzer { } } + static class DisjunctionClause { + + final int msm; + final Set rangeFieldNames; + + DisjunctionClause(int msm, Set rangeFieldNames) { + this.msm = msm; + this.rangeFieldNames = rangeFieldNames; + } + } + static Set selectBestExtraction(Set extractions1, Set extractions2) { assert extractions1 != null || extractions2 != null; if (extractions1 == null) { diff --git a/modules/percolator/src/test/java/org/elasticsearch/percolator/CandidateQueryTests.java b/modules/percolator/src/test/java/org/elasticsearch/percolator/CandidateQueryTests.java index 6af28b2bbfc..d622af232f2 100644 --- a/modules/percolator/src/test/java/org/elasticsearch/percolator/CandidateQueryTests.java +++ b/modules/percolator/src/test/java/org/elasticsearch/percolator/CandidateQueryTests.java @@ -28,6 +28,7 @@ import org.apache.lucene.document.HalfFloatPoint; import org.apache.lucene.document.InetAddressPoint; import org.apache.lucene.document.IntPoint; import org.apache.lucene.document.LongPoint; +import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; @@ -36,7 +37,9 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.MultiDocValues; import org.apache.lucene.index.NoMergePolicy; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.Term; import org.apache.lucene.index.memory.MemoryIndex; import org.apache.lucene.queries.BlendedTermQuery; @@ -44,12 +47,10 @@ import org.apache.lucene.queries.CommonTermsQuery; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.ConstantScoreQuery; -import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.CoveringQuery; import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.FilterScorer; import org.apache.lucene.search.FilteredDocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; @@ -76,11 +77,14 @@ import org.elasticsearch.common.CheckedFunction; import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.compress.CompressedXContent; +import org.elasticsearch.common.geo.ShapeRelation; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.index.IndexService; import org.elasticsearch.index.mapper.DocumentMapper; +import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.mapper.NumberFieldMapper; import org.elasticsearch.index.mapper.ParseContext; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESSingleNodeTestCase; @@ -166,59 +170,53 @@ public class CandidateQueryTests extends ESSingleNodeTestCase { public void testDuel() throws Exception { int numFields = randomIntBetween(1, 3); - Map> content = new HashMap<>(); + Map> stringContent = new HashMap<>(); for (int i = 0; i < numFields; i++) { int numTokens = randomIntBetween(1, 64); List values = new ArrayList<>(); for (int j = 0; j < numTokens; j++) { values.add(randomAlphaOfLength(8)); } - content.put("field" + i, values); + stringContent.put("field" + i, values); } - List fields = new ArrayList<>(content.keySet()); + List stringFields = new ArrayList<>(stringContent.keySet()); + + int numValues = randomIntBetween(16, 64); + List intValues = new ArrayList<>(numValues); + for (int j = 0; j < numValues; j++) { + intValues.add(randomInt()); + } + Collections.sort(intValues); + + MappedFieldType intFieldType = mapperService.documentMapper("type").mappers() + .getMapper("int_field").fieldType(); List> queryFunctions = new ArrayList<>(); queryFunctions.add(MatchNoDocsQuery::new); queryFunctions.add(MatchAllDocsQuery::new); queryFunctions.add(() -> new TermQuery(new Term("unknown_field", "value"))); - String field1 = randomFrom(fields); - queryFunctions.add(() -> new TermQuery(new Term(field1, randomFrom(content.get(field1))))); - String field2 = randomFrom(fields); - queryFunctions.add(() -> new TermQuery(new Term(field2, randomFrom(content.get(field2))))); - queryFunctions.add(() -> new TermInSetQuery(field1, new BytesRef(randomFrom(content.get(field1))), - new BytesRef(randomFrom(content.get(field1))))); - queryFunctions.add(() -> new TermInSetQuery(field2, new BytesRef(randomFrom(content.get(field1))), - new BytesRef(randomFrom(content.get(field1))))); - queryFunctions.add(() -> { - BooleanQuery.Builder builder = new BooleanQuery.Builder(); - int numClauses = randomIntBetween(1, 16); - for (int i = 0; i < numClauses; i++) { - if (rarely()) { - if (randomBoolean()) { - Occur occur = randomFrom(Arrays.asList(Occur.FILTER, Occur.MUST, Occur.SHOULD)); - builder.add(new TermQuery(new Term("unknown_field", randomAlphaOfLength(8))), occur); - } else { - String field = randomFrom(fields); - builder.add(new TermQuery(new Term(field, randomFrom(content.get(field)))), Occur.MUST_NOT); - } - } else { - if (randomBoolean()) { - Occur occur = randomFrom(Arrays.asList(Occur.FILTER, Occur.MUST, Occur.SHOULD)); - String field = randomFrom(fields); - builder.add(new TermQuery(new Term(field, randomFrom(content.get(field)))), occur); - } else { - builder.add(new TermQuery(new Term("unknown_field", randomAlphaOfLength(8))), Occur.MUST_NOT); - } - } - } - return builder.build(); - }); + String field1 = randomFrom(stringFields); + queryFunctions.add(() -> new TermQuery(new Term(field1, randomFrom(stringContent.get(field1))))); + String field2 = randomFrom(stringFields); + queryFunctions.add(() -> new TermQuery(new Term(field2, randomFrom(stringContent.get(field2))))); + queryFunctions.add(() -> intFieldType.termQuery(randomFrom(intValues), null)); + queryFunctions.add(() -> intFieldType.termsQuery(Arrays.asList(randomFrom(intValues), randomFrom(intValues)), null)); + queryFunctions.add(() -> intFieldType.rangeQuery(intValues.get(4), intValues.get(intValues.size() - 4), true, + true, ShapeRelation.WITHIN, null, null, null)); + queryFunctions.add(() -> new TermInSetQuery(field1, new BytesRef(randomFrom(stringContent.get(field1))), + new BytesRef(randomFrom(stringContent.get(field1))))); + queryFunctions.add(() -> new TermInSetQuery(field2, new BytesRef(randomFrom(stringContent.get(field1))), + new BytesRef(randomFrom(stringContent.get(field1))))); + int numRandomBoolQueries = randomIntBetween(16, 32); + for (int i = 0; i < numRandomBoolQueries; i++) { + queryFunctions.add(() -> createRandomBooleanQuery(1, stringFields, stringContent, intFieldType, intValues)); + } queryFunctions.add(() -> { int numClauses = randomIntBetween(1, 16); List clauses = new ArrayList<>(); for (int i = 0; i < numClauses; i++) { - String field = randomFrom(fields); - clauses.add(new TermQuery(new Term(field, randomFrom(content.get(field))))); + String field = randomFrom(stringFields); + clauses.add(new TermQuery(new Term(field, randomFrom(stringContent.get(field))))); } return new DisjunctionMaxQuery(clauses, 0.01f); }); @@ -237,14 +235,75 @@ public class CandidateQueryTests extends ESSingleNodeTestCase { shardSearcher.setQueryCache(null); Document document = new Document(); - for (Map.Entry> entry : content.entrySet()) { + for (Map.Entry> entry : stringContent.entrySet()) { String value = entry.getValue().stream().collect(Collectors.joining(" ")); document.add(new TextField(entry.getKey(), value, Field.Store.NO)); } + for (Integer intValue : intValues) { + List numberFields = + NumberFieldMapper.NumberType.INTEGER.createFields("int_field", intValue, true, true, false); + for (Field numberField : numberFields) { + document.add(numberField); + } + } MemoryIndex memoryIndex = MemoryIndex.fromDocument(document, new WhitespaceAnalyzer()); duelRun(queryStore, memoryIndex, shardSearcher); } + private BooleanQuery createRandomBooleanQuery(int depth, List fields, Map> content, + MappedFieldType intFieldType, List intValues) { + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + int numClauses = randomIntBetween(1, 16); + int numShouldClauses = 0; + boolean onlyShouldClauses = rarely(); + for (int i = 0; i < numClauses; i++) { + Occur occur; + if (onlyShouldClauses) { + occur = Occur.SHOULD; + if (randomBoolean()) { + String field = randomFrom(fields); + builder.add(new TermQuery(new Term(field, randomFrom(content.get(field)))), occur); + } else { + builder.add(intFieldType.termQuery(randomFrom(intValues), null), occur); + } + } else if (rarely() && depth <= 3) { + occur = randomFrom(Arrays.asList(Occur.FILTER, Occur.MUST, Occur.SHOULD)); + builder.add(createRandomBooleanQuery(depth + 1, fields, content, intFieldType, intValues), occur); + } else if (rarely()) { + if (randomBoolean()) { + occur = randomFrom(Arrays.asList(Occur.FILTER, Occur.MUST, Occur.SHOULD)); + if (randomBoolean()) { + builder.add(new TermQuery(new Term("unknown_field", randomAlphaOfLength(8))), occur); + } else { + builder.add(intFieldType.termQuery(randomFrom(intValues), null), occur); + } + } else if (randomBoolean()) { + String field = randomFrom(fields); + builder.add(new TermQuery(new Term(field, randomFrom(content.get(field)))), occur = Occur.MUST_NOT); + } else { + builder.add(intFieldType.termQuery(randomFrom(intValues), null), occur = Occur.MUST_NOT); + } + } else { + if (randomBoolean()) { + occur = randomFrom(Arrays.asList(Occur.FILTER, Occur.MUST, Occur.SHOULD)); + if (randomBoolean()) { + String field = randomFrom(fields); + builder.add(new TermQuery(new Term(field, randomFrom(content.get(field)))), occur); + } else { + builder.add(intFieldType.termQuery(randomFrom(intValues), null), occur); + } + } else { + builder.add(new TermQuery(new Term("unknown_field", randomAlphaOfLength(8))), occur = Occur.MUST_NOT); + } + } + if (occur == Occur.SHOULD) { + numShouldClauses++; + } + } + builder.setMinimumNumberShouldMatch(numShouldClauses); + return builder.build(); + } + public void testDuelIdBased() throws Exception { List> queryFunctions = new ArrayList<>(); queryFunctions.add((id) -> new PrefixQuery(new Term("field", id))); @@ -766,11 +825,11 @@ public class CandidateQueryTests extends ESSingleNodeTestCase { Query percolateQuery = fieldType.percolateQuery("_name", queryStore, Collections.singletonList(new BytesArray("{}")), percolateSearcher, Version.CURRENT); Query query = requireScore ? percolateQuery : new ConstantScoreQuery(percolateQuery); - TopDocs topDocs = shardSearcher.search(query, 10); + TopDocs topDocs = shardSearcher.search(query, 100); Query controlQuery = new ControlQuery(memoryIndex, queryStore); controlQuery = requireScore ? controlQuery : new ConstantScoreQuery(controlQuery); - TopDocs controlTopDocs = shardSearcher.search(controlQuery, 10); + TopDocs controlTopDocs = shardSearcher.search(controlQuery, 100); try { assertThat(topDocs.totalHits, equalTo(controlTopDocs.totalHits)); @@ -793,22 +852,39 @@ public class CandidateQueryTests extends ESSingleNodeTestCase { logger.error("controlTopDocs.scoreDocs.length={}", controlTopDocs.scoreDocs.length); for (int i = 0; i < topDocs.scoreDocs.length; i++) { - logger.error("topDocs.scoreDocs[j].doc={}", topDocs.scoreDocs[i].doc); - logger.error("topDocs.scoreDocs[j].score={}", topDocs.scoreDocs[i].score); + logger.error("topDocs.scoreDocs[{}].doc={}", i, topDocs.scoreDocs[i].doc); + logger.error("topDocs.scoreDocs[{}].score={}", i, topDocs.scoreDocs[i].score); } for (int i = 0; i < controlTopDocs.scoreDocs.length; i++) { - logger.error("controlTopDocs.scoreDocs[j].doc={}", controlTopDocs.scoreDocs[i].doc); - logger.error("controlTopDocs.scoreDocs[j].score={}", controlTopDocs.scoreDocs[i].score); + logger.error("controlTopDocs.scoreDocs[{}].doc={}", i, controlTopDocs.scoreDocs[i].doc); + logger.error("controlTopDocs.scoreDocs[{}].score={}", i, controlTopDocs.scoreDocs[i].score); + + // Additional stored information that is useful when debugging: + String queryToString = shardSearcher.doc(controlTopDocs.scoreDocs[i].doc).get("query_to_string"); + logger.error("topDocs.scoreDocs[{}].query_to_string={}", i, queryToString); + + NumericDocValues numericValues = + MultiDocValues.getNumericValues(shardSearcher.getIndexReader(), fieldType.minimumShouldMatchField.name()); + boolean exact = numericValues.advanceExact(controlTopDocs.scoreDocs[i].doc); + if (exact) { + logger.error("controlTopDocs.scoreDocs[{}].minimum_should_match_field={}", i, numericValues.longValue()); + } else { + // Some queries do not have a msm field. (e.g. unsupported queries) + logger.error("controlTopDocs.scoreDocs[{}].minimum_should_match_field=[NO_VALUE]", i); + } } throw ae; } } - private void addQuery(Query query, List docs) throws IOException { + private void addQuery(Query query, List docs) { ParseContext.InternalParseContext parseContext = new ParseContext.InternalParseContext(Settings.EMPTY, mapperService.documentMapperParser(), documentMapper, null, null); fieldMapper.processQuery(query, parseContext); - docs.add(parseContext.doc()); + ParseContext.Document queryDocument = parseContext.doc(); + // Add to string representation of the query to make debugging easier: + queryDocument.add(new StoredField("query_to_string", query.toString())); + docs.add(queryDocument); queries.add(query); } @@ -865,8 +941,6 @@ public class CandidateQueryTests extends ESSingleNodeTestCase { final IndexSearcher percolatorIndexSearcher = memoryIndex.createSearcher(); return new Weight(this) { - float _score; - @Override public void extractTerms(Set terms) {} @@ -889,6 +963,7 @@ public class CandidateQueryTests extends ESSingleNodeTestCase { @Override public Scorer scorer(LeafReaderContext context) throws IOException { + float _score[] = new float[]{boost}; DocIdSetIterator allDocs = DocIdSetIterator.all(context.reader().maxDoc()); CheckedFunction leaf = queryStore.getQueries(context); FilteredDocIdSetIterator memoryIndexIterator = new FilteredDocIdSetIterator(allDocs) { @@ -900,7 +975,7 @@ public class CandidateQueryTests extends ESSingleNodeTestCase { TopDocs topDocs = percolatorIndexSearcher.search(query, 1); if (topDocs.totalHits > 0) { if (needsScores) { - _score = topDocs.scoreDocs[0].score; + _score[0] = topDocs.scoreDocs[0].score; } return true; } else { @@ -911,11 +986,21 @@ public class CandidateQueryTests extends ESSingleNodeTestCase { } } }; - return new FilterScorer(new ConstantScoreScorer(this, 1f, memoryIndexIterator)) { + return new Scorer(this) { + + @Override + public int docID() { + return memoryIndexIterator.docID(); + } + + @Override + public DocIdSetIterator iterator() { + return memoryIndexIterator; + } @Override public float score() throws IOException { - return _score; + return _score[0]; } }; } diff --git a/modules/percolator/src/test/java/org/elasticsearch/percolator/PercolatorFieldMapperTests.java b/modules/percolator/src/test/java/org/elasticsearch/percolator/PercolatorFieldMapperTests.java index 843dbcae90f..1bd0dff132d 100644 --- a/modules/percolator/src/test/java/org/elasticsearch/percolator/PercolatorFieldMapperTests.java +++ b/modules/percolator/src/test/java/org/elasticsearch/percolator/PercolatorFieldMapperTests.java @@ -922,7 +922,7 @@ public class PercolatorFieldMapperTests extends ESSingleNodeTestCase { assertThat(values.get(3), equalTo("field\0value4")); assertThat(values.get(4), equalTo("field\0value5")); msm = doc.rootDoc().getFields(fieldType.minimumShouldMatchField.name())[0].numericValue().intValue(); - assertThat(msm, equalTo(3)); + assertThat(msm, equalTo(1)); } private static byte[] subByteArray(byte[] source, int offset, int length) {