From 64d638a386fd752056ea18193e96b7cff81de409 Mon Sep 17 00:00:00 2001 From: Slim Bouguerra Date: Tue, 4 Aug 2015 09:07:47 -0500 Subject: [PATCH] optimize makeMatcher --- docs/content/querying/aggregations.md | 2 +- docs/content/querying/filters.md | 5 +- .../column/SimpleDictionaryEncodedColumn.java | 1 + .../segment/filter/ExtractionFilter.java | 42 +- .../query/groupby/GroupByQueryRunnerTest.java | 510 ++++++++++-------- 5 files changed, 312 insertions(+), 248 deletions(-) diff --git a/docs/content/querying/aggregations.md b/docs/content/querying/aggregations.md index 4c487f4fda5..4ccd3c774ea 100644 --- a/docs/content/querying/aggregations.md +++ b/docs/content/querying/aggregations.md @@ -185,7 +185,7 @@ A filtered aggregator wraps any given aggregator, but only aggregates the values This makes it possible to compute the results of a filtered and an unfiltered aggregation simultaneously, without having to issue multiple queries, and use both results as part of post-aggregations. -*Limitations:* The filtered aggregator currently only supports 'or', 'and', 'selector' and 'not' filters, i.e. matching one or multiple dimensions against a single value. +*Limitations:* The filtered aggregator currently only supports 'or', 'and', 'selector', 'not' and 'extraction' filters, i.e. matching one or multiple dimensions against a single value. *Note:* If only the filtered results are required, consider putting the filter on the query itself, which will be much faster since it does not require scanning all the data. 
diff --git a/docs/content/querying/filters.md b/docs/content/querying/filters.md index 9191af86127..bea6b31abee 100644 --- a/docs/content/querying/filters.md +++ b/docs/content/querying/filters.md @@ -105,10 +105,7 @@ The following matches dimension values in `[product_1, product_3, product_5]` fo "product_5": "bar_1", "product_3": "bar_1" } - }, - "replaceMissingValueWith": "", - "retainMissingValue": false, - "injective": false + } } } } diff --git a/processing/src/main/java/io/druid/segment/column/SimpleDictionaryEncodedColumn.java b/processing/src/main/java/io/druid/segment/column/SimpleDictionaryEncodedColumn.java index 9a7419f8929..0eb913eee5d 100644 --- a/processing/src/main/java/io/druid/segment/column/SimpleDictionaryEncodedColumn.java +++ b/processing/src/main/java/io/druid/segment/column/SimpleDictionaryEncodedColumn.java @@ -72,6 +72,7 @@ public class SimpleDictionaryEncodedColumn @Override public String lookupName(int id) { + //Empty to Null will ensure that null and empty are equivalent for extraction function return Strings.emptyToNull(cachedLookups.get(id)); } diff --git a/processing/src/main/java/io/druid/segment/filter/ExtractionFilter.java b/processing/src/main/java/io/druid/segment/filter/ExtractionFilter.java index 98e16ea9095..443def751d7 100644 --- a/processing/src/main/java/io/druid/segment/filter/ExtractionFilter.java +++ b/processing/src/main/java/io/druid/segment/filter/ExtractionFilter.java @@ -31,6 +31,8 @@ import io.druid.segment.DimensionSelector; import io.druid.segment.data.Indexed; import io.druid.segment.data.IndexedInts; +import java.util.BitSet; +import java.util.Iterator; import java.util.List; /** @@ -50,8 +52,36 @@ public class ExtractionFilter implements Filter private List makeFilters(BitmapIndexSelector selector) { - final Indexed allDimVals = selector.getDimensionValues(dimension); + Indexed allDimVals = selector.getDimensionValues(dimension); final List filters = Lists.newArrayList(); + if (allDimVals == null) { + 
allDimVals = new Indexed() + { + @Override + public Iterator iterator() + { + return null; + } + + @Override + public Class getClazz() + { + return null; + } + + @Override + public int size() { return 1; } + + @Override + public String get(int index) { return null;} + + @Override + public int indexOf(String value) + { + return 0; + } + }; + } if (allDimVals != null) { for (int i = 0; i < allDimVals.size(); i++) { String dimVal = allDimVals.get(i); @@ -59,8 +89,6 @@ public class ExtractionFilter implements Filter filters.add(new SelectorFilter(dimension, dimVal)); } } - } else if (value.equals(Strings.nullToEmpty(fn.apply(null)))) { - filters.add(new SelectorFilter(dimension, null)); } return filters; } @@ -98,6 +126,12 @@ public class ExtractionFilter implements Filter if (dimensionSelector == null) { return new BooleanValueMatcher(value.equals(Strings.nullToEmpty(fn.apply(null)))); } else { + final BitSet bitSetOfIds = new BitSet(dimensionSelector.getValueCardinality()); + for (int i = 0; i < dimensionSelector.getValueCardinality(); i++) { + if (value.equals(Strings.nullToEmpty(fn.apply(dimensionSelector.lookupName(i))))) { + bitSetOfIds.set(i); + } + } return new ValueMatcher() { @Override @@ -106,7 +140,7 @@ public class ExtractionFilter implements Filter final IndexedInts row = dimensionSelector.getRow(); final int size = row.size(); for (int i = 0; i < size; ++i) { - if (value.equals(Strings.nullToEmpty(fn.apply(dimensionSelector.lookupName(row.get(i)))))) { + if (bitSetOfIds.get(row.get(i))) { return true; } } diff --git a/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java b/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java index 8b00079e923..fa5f99064e2 100644 --- a/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java +++ b/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java @@ -47,6 +47,7 @@ import io.druid.query.TestQueryRunners; import 
io.druid.query.aggregation.AggregatorFactory; import io.druid.query.aggregation.DoubleMaxAggregatorFactory; import io.druid.query.aggregation.DoubleSumAggregatorFactory; +import io.druid.query.aggregation.FilteredAggregatorFactory; import io.druid.query.aggregation.JavaScriptAggregatorFactory; import io.druid.query.aggregation.LongSumAggregatorFactory; import io.druid.query.aggregation.PostAggregator; @@ -245,243 +246,6 @@ public class GroupByQueryRunnerTest TestHelper.assertExpectedObjects(expectedResults, results, ""); } - @Test - public void testGroupByWithExtractionDimFilterOptimazitionWithEmptyResult() - { - Map extractionMap = new HashMap<>(); - extractionMap.put("automotive", "automotive0"); - extractionMap.put("business", "business0"); - extractionMap.put("entertainment", "entertainment0"); - extractionMap.put("health", "health0"); - extractionMap.put("mezzanine", "mezzanine0"); - extractionMap.put("news", "news0"); - extractionMap.put("premium", "premium0"); - extractionMap.put("technology", "technology0"); - extractionMap.put("travel", "travel0"); - - - MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap); - LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true); - - List dimFilters = Lists.newArrayList( - new ExtractionDimFilter("quality", "Missing_value", lookupExtractionFn, null), - new ExtractionDimFilter("quality", "business0", lookupExtractionFn, null), - new SelectorDimFilter("quality", "entertainment"), - new SelectorDimFilter("quality", "health"), - new ExtractionDimFilter("quality", "mezzanine0", lookupExtractionFn, null), - new ExtractionDimFilter("quality", "news0", lookupExtractionFn, null), - new SelectorDimFilter("quality", "premium"), - new SelectorDimFilter("quality", "technology"), - new SelectorDimFilter("quality", "travel") - ); - - - GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource) - 
.setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) - .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("quality", "alias"))) - .setAggregatorSpecs( - Arrays.asList(QueryRunnerTestHelper.rowsCount, new LongSumAggregatorFactory("idx", "index"))) - .setGranularity(QueryRunnerTestHelper.dayGran) - .setDimFilter(Druids.newOrDimFilterBuilder().fields(dimFilters).build()) - .build(); - List expectedResults = Arrays.asList( - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "business", "rows", 1L, "idx", 118L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "entertainment", "rows", 1L, "idx", 158L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "health", "rows", 1L, "idx", 120L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "mezzanine", "rows", 3L, "idx", 2870L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "news", "rows", 1L, "idx", 121L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "premium", "rows", 3L, "idx", 2900L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "technology", "rows", 1L, "idx", 78L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "travel", "rows", 1L, "idx", 119L), - - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "business", "rows", 1L, "idx", 112L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "entertainment", "rows", 1L, "idx", 166L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "health", "rows", 1L, "idx", 113L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "mezzanine", "rows", 3L, "idx", 2447L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "news", "rows", 1L, "idx", 114L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "premium", "rows", 3L, "idx", 2505L), - 
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "technology", "rows", 1L, "idx", 97L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "travel", "rows", 1L, "idx", 126L)); - - Iterable results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); - TestHelper.assertExpectedObjects(expectedResults, results, ""); - - } - - @Test - public void testGroupByWithExtractionDimFilterOptimazitionOneToOne() - { - Map extractionMap = new HashMap<>(); - extractionMap.put("automotive", "automotive0"); - extractionMap.put("business", "business0"); - extractionMap.put("entertainment", "entertainment0"); - extractionMap.put("health", "health0"); - extractionMap.put("mezzanine", "mezzanine0"); - extractionMap.put("news", "news0"); - extractionMap.put("premium", "premium0"); - extractionMap.put("technology", "technology0"); - extractionMap.put("travel", "travel0"); - - - MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap); - LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true); - - List dimFilters = Lists.newArrayList( - new ExtractionDimFilter("quality", "automotive0", lookupExtractionFn, null), - new ExtractionDimFilter("quality", "business0", lookupExtractionFn, null), - new SelectorDimFilter("quality", "entertainment"), - new SelectorDimFilter("quality", "health"), - new ExtractionDimFilter("quality", "mezzanine0", lookupExtractionFn, null), - new ExtractionDimFilter("quality", "news0", lookupExtractionFn, null), - new SelectorDimFilter("quality", "premium"), - new SelectorDimFilter("quality", "technology"), - new SelectorDimFilter("quality", "travel") - ); - - - GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource) - .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) - .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("quality", "alias"))) - .setAggregatorSpecs( - 
Arrays.asList(QueryRunnerTestHelper.rowsCount, new LongSumAggregatorFactory("idx", "index"))) - .setGranularity(QueryRunnerTestHelper.dayGran) - .setDimFilter(Druids.newOrDimFilterBuilder().fields(dimFilters).build()) - .build(); - List expectedResults = Arrays.asList( - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "automotive", "rows", 1L, "idx", 135L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "business", "rows", 1L, "idx", 118L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "entertainment", "rows", 1L, "idx", 158L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "health", "rows", 1L, "idx", 120L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "mezzanine", "rows", 3L, "idx", 2870L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "news", "rows", 1L, "idx", 121L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "premium", "rows", 3L, "idx", 2900L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "technology", "rows", 1L, "idx", 78L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "travel", "rows", 1L, "idx", 119L), - - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "automotive", "rows", 1L, "idx", 147L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "business", "rows", 1L, "idx", 112L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "entertainment", "rows", 1L, "idx", 166L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "health", "rows", 1L, "idx", 113L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "mezzanine", "rows", 3L, "idx", 2447L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "news", "rows", 1L, "idx", 114L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", 
"premium", "rows", 3L, "idx", 2505L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "technology", "rows", 1L, "idx", 97L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "travel", "rows", 1L, "idx", 126L)); - - Iterable results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); - TestHelper.assertExpectedObjects(expectedResults, results, ""); - - } - - @Test - public void testGroupByWithExtractionDimFilterOptimazitionManyToOne() - { - Map extractionMap = new HashMap<>(); - extractionMap.put("mezzanine", "newsANDmezzanine"); - extractionMap.put("news", "newsANDmezzanine"); - - MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap); - LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true); - GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource) - .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) - .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("quality", "alias"))) - .setAggregatorSpecs( - Arrays.asList(QueryRunnerTestHelper.rowsCount, new LongSumAggregatorFactory("idx", "index"))) - .setGranularity(QueryRunnerTestHelper.dayGran) - .setDimFilter(new ExtractionDimFilter("quality", "newsANDmezzanine", lookupExtractionFn, null)) - .build(); - List expectedResults = Arrays.asList( - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "mezzanine", "rows", 3L, "idx", 2870L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "news", "rows", 1L, "idx", 121L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "mezzanine", "rows", 3L, "idx", 2447L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "news", "rows", 1L, "idx", 114L)); - - Iterable results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); - TestHelper.assertExpectedObjects(expectedResults, results, ""); - } - - @Test - public 
void testGroupByWithExtractionDimFilterCaseNullValue() - { - Map extractionMap = new HashMap<>(); - extractionMap.put("automotive", "automotive0"); - extractionMap.put("business", "business0"); - extractionMap.put("entertainment", "entertainment0"); - extractionMap.put("health", "health0"); - extractionMap.put("mezzanine", ""); - extractionMap.put("news", null); - extractionMap.put("premium", "premium0"); - extractionMap.put("technology", "technology0"); - extractionMap.put("travel", "travel0"); - - - MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap); - LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true); - GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource) - .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) - .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("quality", "alias"))) - .setAggregatorSpecs( - Arrays.asList(QueryRunnerTestHelper.rowsCount, new LongSumAggregatorFactory("idx", "index"))) - .setGranularity(QueryRunnerTestHelper.dayGran) - .setDimFilter(new ExtractionDimFilter("quality", "", lookupExtractionFn, null)) - .build(); - List expectedResults = Arrays.asList( - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "mezzanine", "rows", 3L, "idx", 2870L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "news", "rows", 1L, "idx", 121L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "mezzanine", "rows", 3L, "idx", 2447L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "news", "rows", 1L, "idx", 114L)); - - Iterable results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); - TestHelper.assertExpectedObjects(expectedResults, results, ""); - } - - @Test public void testGroupByWithExtractionDimFilterWhenValueNotThere() - { - Map extractionMap = new HashMap<>(); - extractionMap.put("mezzanine", ""); - 
extractionMap.put("news", null); - - MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap); - LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true); - - GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource) - .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) - .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("quality", "alias"))) - .setAggregatorSpecs( - Arrays.asList(QueryRunnerTestHelper.rowsCount, new LongSumAggregatorFactory("idx", "index")) - ) - .setGranularity(QueryRunnerTestHelper.dayGran) - .setDimFilter(new ExtractionDimFilter("quality", "NOT_THERE", lookupExtractionFn, null)).build(); - List expectedResults = Arrays.asList(); - - Iterable results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); - TestHelper.assertExpectedObjects(expectedResults, results, ""); - } - - - @Test public void testGroupByWithExtractionDimFilterNullDims() - { - Map extractionMap = new HashMap<>(); - extractionMap.put("", "EMPTY"); - - MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap); - LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true); - - GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource) - .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) - .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("null_column", "alias"))) - .setAggregatorSpecs( - Arrays.asList(QueryRunnerTestHelper.rowsCount, new LongSumAggregatorFactory("idx", "index"))) - .setGranularity(QueryRunnerTestHelper.dayGran) - .setDimFilter(new ExtractionDimFilter("null_column", "EMPTY", lookupExtractionFn, null)).build(); - List expectedResults = Arrays - .asList(GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", null, "rows", 13L, "idx", 6619L), - GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", null, 
"rows", 13L, "idx", 5827L)); - - Iterable results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); - TestHelper.assertExpectedObjects(expectedResults, results, ""); - } - @Test public void testGroupByWithRebucketRename() { @@ -558,7 +322,6 @@ public class GroupByQueryRunnerTest } - @Test public void testGroupByWithSimpleRenameRetainMissingNonInjective() { @@ -731,7 +494,10 @@ public class GroupByQueryRunnerTest .setDimensions( Lists.newArrayList( new ExtractionDimensionSpec( - "quality", "alias", new LookupExtractionFn(new MapLookupExtractor(map), false, "MISSING", true), null + "quality", + "alias", + new LookupExtractionFn(new MapLookupExtractor(map), false, "MISSING", true), + null ) ) ) @@ -4024,4 +3790,270 @@ public class GroupByQueryRunnerTest TestHelper.assertExpectedObjects(bySegmentResults, theRunner.run(fullQuery, Maps.newHashMap()), ""); exec.shutdownNow(); } + + // Extraction Filters testing + + @Test + public void testGroupByWithExtractionDimFilter() + { + Map extractionMap = new HashMap<>(); + extractionMap.put("automotive", "automotiveAndBusinessAndNewsAndMezzanine"); + extractionMap.put("business", "automotiveAndBusinessAndNewsAndMezzanine"); + extractionMap.put("mezzanine", "automotiveAndBusinessAndNewsAndMezzanine"); + extractionMap.put("news", "automotiveAndBusinessAndNewsAndMezzanine"); + + MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap); + LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true); + + List dimFilters = Lists.newArrayList( + new ExtractionDimFilter("quality", "automotiveAndBusinessAndNewsAndMezzanine", lookupExtractionFn, null), + new SelectorDimFilter("quality", "entertainment"), + new SelectorDimFilter("quality", "health"), + new SelectorDimFilter("quality", "premium"), + new SelectorDimFilter("quality", "technology"), + new SelectorDimFilter("quality", "travel") + ); + + GroupByQuery query = 
GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource) + .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) + .setDimensions( + Lists.newArrayList( + new DefaultDimensionSpec( + "quality", + "alias" + ) + ) + ) + .setAggregatorSpecs( + Arrays.asList( + QueryRunnerTestHelper.rowsCount, + new LongSumAggregatorFactory("idx", "index") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .setDimFilter(Druids.newOrDimFilterBuilder().fields(dimFilters).build()) + .build(); + List expectedResults = Arrays.asList( + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "automotive", "rows", 1L, "idx", 135L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "business", "rows", 1L, "idx", 118L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "entertainment", "rows", 1L, "idx", 158L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "health", "rows", 1L, "idx", 120L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "mezzanine", "rows", 3L, "idx", 2870L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "news", "rows", 1L, "idx", 121L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "premium", "rows", 3L, "idx", 2900L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "technology", "rows", 1L, "idx", 78L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "travel", "rows", 1L, "idx", 119L), + + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "automotive", "rows", 1L, "idx", 147L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "business", "rows", 1L, "idx", 112L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "entertainment", "rows", 1L, "idx", 166L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "health", "rows", 1L, "idx", 113L), + 
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "mezzanine", "rows", 3L, "idx", 2447L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "news", "rows", 1L, "idx", 114L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "premium", "rows", 3L, "idx", 2505L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "technology", "rows", 1L, "idx", 97L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "travel", "rows", 1L, "idx", 126L) + ); + + Iterable results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); + TestHelper.assertExpectedObjects(expectedResults, results, ""); + + } + + @Test + public void testGroupByWithExtractionDimFilterCaseMappingValueIsNullOrEmpty() + { + Map extractionMap = new HashMap<>(); + extractionMap.put("automotive", "automotive0"); + extractionMap.put("business", "business0"); + extractionMap.put("entertainment", "entertainment0"); + extractionMap.put("health", "health0"); + extractionMap.put("mezzanine", null); + extractionMap.put("news", ""); + extractionMap.put("premium", "premium0"); + extractionMap.put("technology", "technology0"); + extractionMap.put("travel", "travel0"); + + MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap); + LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true); + GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource) + .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) + .setDimensions( + Lists.newArrayList( + new DefaultDimensionSpec( + "quality", + "alias" + ) + ) + ) + .setAggregatorSpecs( + Arrays.asList( + QueryRunnerTestHelper.rowsCount, + new LongSumAggregatorFactory("idx", "index") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .setDimFilter(new ExtractionDimFilter("quality", "", lookupExtractionFn, null)) + .build(); + List expectedResults = Arrays.asList( 
+ GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "mezzanine", "rows", 3L, "idx", 2870L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "news", "rows", 1L, "idx", 121L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "mezzanine", "rows", 3L, "idx", 2447L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "news", "rows", 1L, "idx", 114L) + ); + + Iterable results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); + TestHelper.assertExpectedObjects(expectedResults, results, ""); + } + + @Test + public void testGroupByWithExtractionDimFilterWhenSearchValueNotInTheMap() + { + Map extractionMap = new HashMap<>(); + MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap); + LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true); + + GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource) + .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) + .setDimensions( + Lists.newArrayList( + new DefaultDimensionSpec( + "quality", + "alias" + ) + ) + ) + .setAggregatorSpecs( + Arrays.asList( + QueryRunnerTestHelper.rowsCount, + new LongSumAggregatorFactory("idx", "index") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .setDimFilter( + new ExtractionDimFilter( + "quality", + "NOT_THERE", + lookupExtractionFn, + null + ) + ).build(); + List expectedResults = Arrays.asList(); + + Iterable results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); + TestHelper.assertExpectedObjects(expectedResults, results, ""); + } + + + @Test + public void testGroupByWithExtractionDimFilterKeyisNull() + { + Map extractionMap = new HashMap<>(); + extractionMap.put("", "NULLorEMPTY"); + + MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap); + LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, 
null, true); + + GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource) + .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) + .setDimensions( + Lists.newArrayList( + new DefaultDimensionSpec( + "null_column", + "alias" + ) + ) + ) + .setAggregatorSpecs( + Arrays.asList( + QueryRunnerTestHelper.rowsCount, + new LongSumAggregatorFactory("idx", "index") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .setDimFilter( + new ExtractionDimFilter( + "null_column", + "NULLorEMPTY", + lookupExtractionFn, + null + ) + ).build(); + List expectedResults = Arrays + .asList( + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", null, "rows", 13L, "idx", 6619L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", null, "rows", 13L, "idx", 5827L) + ); + + Iterable results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); + TestHelper.assertExpectedObjects(expectedResults, results, ""); + } + + @Test + public void testGroupByWithAggregatorFilterAndExtractionFunction() + { + Map extractionMap = new HashMap<>(); + extractionMap.put("automotive", "automotive0"); + extractionMap.put("business", "business0"); + extractionMap.put("entertainment", "entertainment0"); + extractionMap.put("health", "health0"); + extractionMap.put("mezzanine", "mezzanineANDnews"); + extractionMap.put("news", "mezzanineANDnews"); + extractionMap.put("premium", "premium0"); + extractionMap.put("technology", "technology0"); + extractionMap.put("travel", "travel0"); + + MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap); + LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, "missing", true); + DimFilter filter = new ExtractionDimFilter("quality","mezzanineANDnews",lookupExtractionFn,null); + GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource) + .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) + 
.setDimensions( + Lists.newArrayList( + new DefaultDimensionSpec( + "quality", + "alias" + ) + ) + ) + .setAggregatorSpecs( + Arrays.asList( + new FilteredAggregatorFactory(QueryRunnerTestHelper.rowsCount, filter), + (AggregatorFactory) new FilteredAggregatorFactory( + new LongSumAggregatorFactory( + "idx", + "index" + ), filter + ) + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .build(); + List expectedResults = Arrays.asList( + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "automotive", "rows", 0L, "idx", 0L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "business", "rows", 0L, "idx", 0L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "entertainment", "rows", 0L, "idx", 0L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "health", "rows", 0L, "idx", 0L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "mezzanine", "rows", 3L, "idx", 2870L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "news", "rows", 1L, "idx", 121L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "premium", "rows", 0L, "idx", 0L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "technology", "rows", 0L, "idx", 0L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "travel", "rows", 0L, "idx", 0L), + + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "automotive", "rows", 0L, "idx", 0L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "business", "rows", 0L, "idx", 0L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "entertainment", "rows", 0L, "idx", 0L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "health", "rows", 0L, "idx", 0L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "mezzanine", "rows", 3L, "idx", 2447L), + 
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "news", "rows", 1L, "idx", 114L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "premium", "rows", 0L, "idx", 0L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "technology", "rows", 0L, "idx", 0L), + GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "travel", "rows", 0L, "idx", 0L) + ); + + Iterable results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); + TestHelper.assertExpectedObjects(expectedResults, results, ""); + + } + }