SQL: fix multi full-text functions usage with aggregate functions (#47444)

* Skip functions involving full-text predicates when replacing multiple
aggregate functions with "stats" or "matrix_stats" aggregations.

(cherry picked from commit bb14ba83128dfb7a70f825ea08b1524072fb9ad0)
This commit is contained in:
Andrei Stefan 2019-10-04 16:05:55 +03:00 committed by Andrei Stefan
parent 2b16d7bcf8
commit a46f312ded
3 changed files with 144 additions and 2 deletions

View File

@ -141,3 +141,64 @@ SELECT emp_no, first_name, SCORE() as s FROM test_emp WHERE MATCH(first_name, 'E
emp_no:i | first_name:s | s:f emp_no:i | first_name:s | s:f
10076 |Erez |4.1053944 10076 |Erez |4.1053944
; ;
//
// Mixture of aggregate functions that triggers their promotion to "stats"/"matrix_stats" aggregations, combined with multiple full-text predicates in the WHERE clause
//
// MIN/MAX/COUNT(*) per gender; the filter ANDs a plain comparison, two OR-ed MATCH predicates and a QUERY predicate, and HAVING filters on the MAX alias.
multiAggWithCountMatchAndQuery
SELECT MIN(salary) min, MAX(salary) max, gender g, COUNT(*) c FROM "test_emp" WHERE languages > 0 AND (MATCH(gender, 'F') OR MATCH(gender, 'M')) AND QUERY('M*', 'default_field=last_name;lenient=true', 'fuzzy_rewrite=scoring_boolean') GROUP BY g HAVING max > 50000 ORDER BY gender;
min:i | max:i | g:s | c:l
---------------+---------------+---------------+---------------
37112 |69904 |F |3
32568 |70011 |M |8
;
// MIN/MAX/COUNT(*) per gender where the whole filter is just two OR-ed MATCH predicates on the same field; HAVING filters on the MAX alias.
multiAggWithCountAndMultiMatch
SELECT MIN(salary) min, MAX(salary) max, gender g, COUNT(*) c FROM "test_emp" WHERE MATCH(gender, 'F') OR MATCH(gender, 'M') GROUP BY g HAVING max > 50000 ORDER BY gender;
min:i | max:i | g:s | c:l
---------------+---------------+---------------+---------------
25976 |74572 |F |33
25945 |74999 |M |57
;
// MIN/MAX/ROUND(AVG)/COUNT(*) per gender; a single-field MATCH is OR-ed with a multi-field MATCH (boosted fields plus options) and groups are ordered by the COUNT alias, descending.
multiAggWithMultiMatchOrderByCount
SELECT MIN(salary) min, MAX(salary) max, ROUND(AVG(salary)) avg, gender g, COUNT(*) c FROM "test_emp" WHERE MATCH(gender, 'F') OR MATCH('first_name^3,last_name^5', 'geo hir', 'fuzziness=2;operator=or') GROUP BY g ORDER BY c DESC;
min:i | max:i | avg:d | g:s | c:l
---------------+---------------+---------------+---------------+---------------
25976 |74572 |50491 |F |33
32568 |32568 |32568 |M |1
;
// Same aggregates and multi-field MATCH as multiAggWithMultiMatchOrderByCount, but the single-field MATCH is additionally AND-ed with a plain comparison (languages > 4).
multiAggWithMultiMatchOrderByCountAndSimpleCondition
SELECT MIN(salary) min, MAX(salary) max, ROUND(AVG(salary)) avg, gender g, COUNT(*) c FROM "test_emp" WHERE (MATCH(gender, 'F') AND languages > 4) OR MATCH('first_name^3,last_name^5', 'geo hir', 'fuzziness=2;operator=or') GROUP BY g ORDER BY c DESC;
min:i | max:i | avg:d | g:s | c:l
---------------+---------------+---------------+---------------+---------------
32272 |66817 |48081 |F |11
32568 |32568 |32568 |M |1
;
// PERCENTILE and ROUND(PERCENTILE_RANK) next to MAX/MIN/COUNT(*) per languages value; the filter ORs two QUERY predicates with an IS NULL check, so the null group is part of the expected rows.
multiAggWithPercentileAndMultiQuery
SELECT languages, PERCENTILE(salary, 95) "95th", ROUND(PERCENTILE_RANK(salary, 65000)) AS rank, MAX(salary), MIN(salary), COUNT(*) c FROM test_emp WHERE QUERY('A*','default_field=first_name') OR QUERY('B*', 'default_field=first_name') OR languages IS NULL GROUP BY languages;
languages:bt | 95th:d | rank:d | MAX(salary):i | MIN(salary):i | c:l
---------------+---------------+---------------+---------------+---------------+---------------
null |74999 |74 |74999 |28336 |10
2 |44307 |100 |44307 |29175 |3
3 |65030 |100 |65030 |38376 |4
5 |66817 |100 |66817 |37137 |4
;
// KURTOSIS/SKEWNESS next to MAX/MIN/COUNT(*) per languages value, using the same filter as multiAggWithPercentileAndMultiQuery (two OR-ed QUERY predicates OR languages IS NULL).
multiAggWithStatsAndMatrixStatsAndMultiQuery
SELECT languages, KURTOSIS(salary) k, SKEWNESS(salary) s, MAX(salary), MIN(salary), COUNT(*) c FROM test_emp WHERE QUERY('A*','default_field=first_name') OR QUERY('B*', 'default_field=first_name') OR languages IS NULL GROUP BY languages;
languages:bt | k:d | s:d | MAX(salary):i | MIN(salary):i | c:l
---------------+------------------+-------------------+---------------+---------------+---------------
null |1.9161749939033146|0.1480828817161133 |74999 |28336 |10
2 |1.5000000000000002|0.484743245141609 |44307 |29175 |3
3 |1.0732551278666582|0.05483979801873433|65030 |38376 |4
5 |1.322529094661261 |0.24501477738153868|66817 |37137 |4
;

View File

@ -52,6 +52,7 @@ import org.elasticsearch.xpack.sql.expression.predicate.conditional.ArbitraryCon
import org.elasticsearch.xpack.sql.expression.predicate.conditional.Case; import org.elasticsearch.xpack.sql.expression.predicate.conditional.Case;
import org.elasticsearch.xpack.sql.expression.predicate.conditional.Coalesce; import org.elasticsearch.xpack.sql.expression.predicate.conditional.Coalesce;
import org.elasticsearch.xpack.sql.expression.predicate.conditional.IfConditional; import org.elasticsearch.xpack.sql.expression.predicate.conditional.IfConditional;
import org.elasticsearch.xpack.sql.expression.predicate.fulltext.FullTextPredicate;
import org.elasticsearch.xpack.sql.expression.predicate.logical.And; import org.elasticsearch.xpack.sql.expression.predicate.logical.And;
import org.elasticsearch.xpack.sql.expression.predicate.logical.Not; import org.elasticsearch.xpack.sql.expression.predicate.logical.Not;
import org.elasticsearch.xpack.sql.expression.predicate.logical.Or; import org.elasticsearch.xpack.sql.expression.predicate.logical.Or;
@ -488,11 +489,11 @@ public class Optimizer extends RuleExecutor<LogicalPlan> {
} }
} }
else if (e instanceof ScalarFunction) { else if (e instanceof ScalarFunction && false == Expressions.anyMatch(e.children(), c -> c instanceof FullTextPredicate)) {
ScalarFunction sf = (ScalarFunction) e; ScalarFunction sf = (ScalarFunction) e;
// if it's a unseen function check if the function children/arguments refers to any of the promoted aggs // if it's a unseen function check if the function children/arguments refers to any of the promoted aggs
if (!updatedScalarAttrs.containsKey(sf.functionId()) && e.anyMatch(c -> { if (newAggIds.isEmpty() == false && !updatedScalarAttrs.containsKey(sf.functionId()) && e.anyMatch(c -> {
Attribute a = Expressions.attribute(c); Attribute a = Expressions.attribute(c);
if (a instanceof FunctionAttribute) { if (a instanceof FunctionAttribute) {
return newAggIds.contains(((FunctionAttribute) a).functionId()); return newAggIds.contains(((FunctionAttribute) a).functionId());

View File

@ -23,10 +23,17 @@ import org.elasticsearch.xpack.sql.expression.function.Function;
import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunction; import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunction;
import org.elasticsearch.xpack.sql.expression.function.aggregate.Avg; import org.elasticsearch.xpack.sql.expression.function.aggregate.Avg;
import org.elasticsearch.xpack.sql.expression.function.aggregate.Count; import org.elasticsearch.xpack.sql.expression.function.aggregate.Count;
import org.elasticsearch.xpack.sql.expression.function.aggregate.ExtendedStats;
import org.elasticsearch.xpack.sql.expression.function.aggregate.First; import org.elasticsearch.xpack.sql.expression.function.aggregate.First;
import org.elasticsearch.xpack.sql.expression.function.aggregate.InnerAggregate;
import org.elasticsearch.xpack.sql.expression.function.aggregate.Last; import org.elasticsearch.xpack.sql.expression.function.aggregate.Last;
import org.elasticsearch.xpack.sql.expression.function.aggregate.Max; import org.elasticsearch.xpack.sql.expression.function.aggregate.Max;
import org.elasticsearch.xpack.sql.expression.function.aggregate.Min; import org.elasticsearch.xpack.sql.expression.function.aggregate.Min;
import org.elasticsearch.xpack.sql.expression.function.aggregate.Stats;
import org.elasticsearch.xpack.sql.expression.function.aggregate.StddevPop;
import org.elasticsearch.xpack.sql.expression.function.aggregate.Sum;
import org.elasticsearch.xpack.sql.expression.function.aggregate.SumOfSquares;
import org.elasticsearch.xpack.sql.expression.function.aggregate.VarPop;
import org.elasticsearch.xpack.sql.expression.function.scalar.Cast; import org.elasticsearch.xpack.sql.expression.function.scalar.Cast;
import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.DayName; import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.DayName;
import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.DayOfMonth; import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.DayOfMonth;
@ -57,7 +64,12 @@ import org.elasticsearch.xpack.sql.expression.predicate.conditional.IfNull;
import org.elasticsearch.xpack.sql.expression.predicate.conditional.Iif; import org.elasticsearch.xpack.sql.expression.predicate.conditional.Iif;
import org.elasticsearch.xpack.sql.expression.predicate.conditional.Least; import org.elasticsearch.xpack.sql.expression.predicate.conditional.Least;
import org.elasticsearch.xpack.sql.expression.predicate.conditional.NullIf; import org.elasticsearch.xpack.sql.expression.predicate.conditional.NullIf;
import org.elasticsearch.xpack.sql.expression.predicate.fulltext.FullTextPredicate;
import org.elasticsearch.xpack.sql.expression.predicate.fulltext.MatchQueryPredicate;
import org.elasticsearch.xpack.sql.expression.predicate.fulltext.MultiMatchQueryPredicate;
import org.elasticsearch.xpack.sql.expression.predicate.fulltext.StringQueryPredicate;
import org.elasticsearch.xpack.sql.expression.predicate.logical.And; import org.elasticsearch.xpack.sql.expression.predicate.logical.And;
import org.elasticsearch.xpack.sql.expression.predicate.logical.BinaryLogic;
import org.elasticsearch.xpack.sql.expression.predicate.logical.Not; import org.elasticsearch.xpack.sql.expression.predicate.logical.Not;
import org.elasticsearch.xpack.sql.expression.predicate.logical.Or; import org.elasticsearch.xpack.sql.expression.predicate.logical.Or;
import org.elasticsearch.xpack.sql.expression.predicate.nulls.IsNotNull; import org.elasticsearch.xpack.sql.expression.predicate.nulls.IsNotNull;
@ -87,6 +99,8 @@ import org.elasticsearch.xpack.sql.optimizer.Optimizer.ConstantFolding;
import org.elasticsearch.xpack.sql.optimizer.Optimizer.FoldNull; import org.elasticsearch.xpack.sql.optimizer.Optimizer.FoldNull;
import org.elasticsearch.xpack.sql.optimizer.Optimizer.PropagateEquals; import org.elasticsearch.xpack.sql.optimizer.Optimizer.PropagateEquals;
import org.elasticsearch.xpack.sql.optimizer.Optimizer.PruneDuplicateFunctions; import org.elasticsearch.xpack.sql.optimizer.Optimizer.PruneDuplicateFunctions;
import org.elasticsearch.xpack.sql.optimizer.Optimizer.ReplaceAggsWithExtendedStats;
import org.elasticsearch.xpack.sql.optimizer.Optimizer.ReplaceAggsWithStats;
import org.elasticsearch.xpack.sql.optimizer.Optimizer.ReplaceFoldableAttributes; import org.elasticsearch.xpack.sql.optimizer.Optimizer.ReplaceFoldableAttributes;
import org.elasticsearch.xpack.sql.optimizer.Optimizer.ReplaceMinMaxWithTopHits; import org.elasticsearch.xpack.sql.optimizer.Optimizer.ReplaceMinMaxWithTopHits;
import org.elasticsearch.xpack.sql.optimizer.Optimizer.RewritePivot; import org.elasticsearch.xpack.sql.optimizer.Optimizer.RewritePivot;
@ -1522,4 +1536,70 @@ public class OptimizerTests extends ESTestCase {
assertEquals(column, in.value()); assertEquals(column, in.value());
assertEquals(Arrays.asList(L(1), L(2)), in.list()); assertEquals(Arrays.asList(L(1), L(2)), in.list());
} }
/**
 * Checks that two aggregates over the same field are still promoted to a shared {@link Stats} or {@link ExtendedStats}
 * when the plan underneath filters on a logical combination of two full-text predicates.
 * <p>
 * Covers queries like
 * SELECT MIN(agg_field), MAX(agg_field) FROM table WHERE MATCH(match_field,'A') AND/OR QUERY('match_field:A')
 * or
 * SELECT STDDEV_POP(agg_field), VAR_POP(agg_field) FROM table WHERE MATCH(match_field,'A') AND/OR QUERY('match_field:A')
 * <p>
 * The test is randomized: the two predicates (which may be the same instance), the logical operator, the rule under
 * test and the pair of aggregates are all picked at random. Whatever is picked, both aggregates must come back as
 * {@link InnerAggregate}s wrapping the original function, and the {@link Filter} child must be left unchanged.
 */
public void testAggregatesPromoteToStats_WithFullTextPredicatesConditions() {
    FieldAttribute matchField = new FieldAttribute(EMPTY, "match_field", new EsField("match_field", DataType.TEXT, emptyMap(), true));
    FieldAttribute aggField = new FieldAttribute(EMPTY, "agg_field", new EsField("agg_field", DataType.INTEGER, emptyMap(), true));

    // one instance of every full-text predicate type; two of them are drawn (with repetition) for the condition
    FullTextPredicate matchPredicate = new MatchQueryPredicate(EMPTY, matchField, "A", StringUtils.EMPTY);
    FullTextPredicate multiMatchPredicate = new MultiMatchQueryPredicate(EMPTY, "match_field", "A", StringUtils.EMPTY);
    FullTextPredicate stringQueryPredicate = new StringQueryPredicate(EMPTY, "match_field:A", StringUtils.EMPTY);
    List<FullTextPredicate> predicates = Arrays.asList(matchPredicate, multiMatchPredicate, stringQueryPredicate);

    FullTextPredicate left = randomFrom(predicates);
    FullTextPredicate right = randomFrom(predicates);

    BinaryLogic or = new Or(EMPTY, left, right);
    BinaryLogic and = new And(EMPTY, left, right);
    BinaryLogic condition = randomFrom(or, and);
    Filter filter = new Filter(EMPTY, FROM(), condition);

    // pick which rule is exercised and a matching pool of aggregate functions
    List<AggregateFunction> aggregates;
    boolean isSimpleStats = randomBoolean();
    if (isSimpleStats) {
        aggregates = Arrays.asList(new Avg(EMPTY, aggField), new Sum(EMPTY, aggField), new Min(EMPTY, aggField),
            new Max(EMPTY, aggField));
    } else {
        aggregates = Arrays.asList(new StddevPop(EMPTY, aggField), new SumOfSquares(EMPTY, aggField), new VarPop(EMPTY, aggField));
    }

    // two distinct aggregates are drawn from the pool
    AggregateFunction firstAggregate = randomFrom(aggregates);
    AggregateFunction secondAggregate = randomValueOtherThan(firstAggregate, () -> randomFrom(aggregates));
    Aggregate aggregatePlan = new Aggregate(EMPTY, filter, Collections.singletonList(matchField),
        Arrays.asList(firstAggregate, secondAggregate));

    LogicalPlan result;
    if (isSimpleStats) {
        result = new ReplaceAggsWithStats().apply(aggregatePlan);
    } else {
        result = new ReplaceAggsWithExtendedStats().apply(aggregatePlan);
    }

    assertTrue(result instanceof Aggregate);
    Aggregate resultAgg = (Aggregate) result;
    assertEquals(2, resultAgg.aggregates().size());

    // both aggregates must have been rewritten, each one still wrapping the function it replaced
    assertTrue(resultAgg.aggregates().get(0) instanceof InnerAggregate);
    assertTrue(resultAgg.aggregates().get(1) instanceof InnerAggregate);
    InnerAggregate resultFirstAgg = (InnerAggregate) resultAgg.aggregates().get(0);
    InnerAggregate resultSecondAgg = (InnerAggregate) resultAgg.aggregates().get(1);

    // assertEquals takes (expected, actual); keeping that order makes failure messages report the right values
    assertEquals(firstAggregate, resultFirstAgg.inner());
    assertEquals(secondAggregate, resultSecondAgg.inner());

    if (isSimpleStats) {
        assertTrue(resultFirstAgg.outer() instanceof Stats);
        assertTrue(resultSecondAgg.outer() instanceof Stats);
        assertEquals(aggField, ((Stats) resultFirstAgg.outer()).field());
        assertEquals(aggField, ((Stats) resultSecondAgg.outer()).field());
    } else {
        assertTrue(resultFirstAgg.outer() instanceof ExtendedStats);
        assertTrue(resultSecondAgg.outer() instanceof ExtendedStats);
        assertEquals(aggField, ((ExtendedStats) resultFirstAgg.outer()).field());
        assertEquals(aggField, ((ExtendedStats) resultSecondAgg.outer()).field());
    }

    // the full-text filter below the aggregate must not be touched by the rule
    assertTrue(resultAgg.child() instanceof Filter);
    assertEquals(filter, resultAgg.child());
}
} }