Fix extractionFilter by implementing make matcher

Fix getBitmapIndex to consider the case were dim is null
Unit Test for exractionFn with empty result and null_column
UT for TopN queries with Extraction filter
refactor in Extractiuon fileter makematcher for realtime segment and clean code in b/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java
fix to make sure that empty string are converted to null
This commit is contained in:
Slim Bouguerra 2015-07-24 09:54:12 -05:00
parent 65654ddbf9
commit dda0790a60
4 changed files with 335 additions and 13 deletions

View File

@ -17,6 +17,7 @@
package io.druid.segment.column;
import com.google.common.base.Strings;
import com.metamx.common.guava.CloseQuietly;
import io.druid.segment.data.CachingIndexed;
import io.druid.segment.data.IndexedInts;
@ -71,7 +72,7 @@ public class SimpleDictionaryEncodedColumn
@Override
public String lookupName(int id)
{
return cachedLookups.get(id);
return Strings.emptyToNull(cachedLookups.get(id));
}
@Override

View File

@ -17,17 +17,19 @@
package io.druid.segment.filter;
import com.google.common.base.Predicate;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.metamx.collections.bitmap.ImmutableBitmap;
import com.metamx.collections.bitmap.WrappedImmutableConciseBitmap;
import io.druid.query.extraction.ExtractionFn;
import io.druid.query.filter.BitmapIndexSelector;
import io.druid.query.filter.Filter;
import io.druid.query.filter.ValueMatcher;
import io.druid.query.filter.ValueMatcherFactory;
import io.druid.segment.ColumnSelectorFactory;
import io.druid.segment.DimensionSelector;
import io.druid.segment.data.Indexed;
import it.uniroma3.mat.extendedset.intset.ImmutableConciseSet;
import io.druid.segment.data.IndexedInts;
import java.util.List;
@ -54,15 +56,20 @@ public class ExtractionFilter implements Filter
{
final Indexed<String> allDimVals = selector.getDimensionValues(dimension);
final List<Filter> filters = Lists.newArrayList();
if (allDimVals != null) {
for (int i = 0; i < allDimVals.size(); i++) {
if (allDimVals != null)
{
for (int i = 0; i < allDimVals.size(); i++)
{
String dimVal = allDimVals.get(i);
if (value.equals(fn.apply(dimVal))) {
if (value.equals(fn.apply(dimVal)))
{
filters.add(new SelectorFilter(dimension, dimVal));
}
}
} else if (value.equals(fn.apply(null)))
{
filters.add(new SelectorFilter(dimension, null));
}
return filters;
}
@ -79,13 +86,39 @@ public class ExtractionFilter implements Filter
@Override
public ValueMatcher makeMatcher(ValueMatcherFactory factory)
{
throw new UnsupportedOperationException();
return factory.makeValueMatcher(dimension, new Predicate<String>()
{
@Override public boolean apply(String input)
{
// Assuming that a null/absent/empty dimension are equivalent from the druid perspective
return value.equals(fn.apply(Strings.emptyToNull(input)));
}
});
}
@Override
public ValueMatcher makeMatcher(ColumnSelectorFactory factory)
public ValueMatcher makeMatcher(ColumnSelectorFactory columnSelectorFactory)
{
throw new UnsupportedOperationException();
final DimensionSelector dimensionSelector = columnSelectorFactory.makeDimensionSelector(dimension, null);
if (dimensionSelector == null) {
return new BooleanValueMatcher(Strings.isNullOrEmpty(fn.apply(value)));
} else {
return new ValueMatcher()
{
@Override
public boolean matches()
{
final IndexedInts row = dimensionSelector.getRow();
final int size = row.size();
for (int i = 0; i < size; ++i) {
if (value.equals(fn.apply(dimensionSelector.lookupName(row.get(i))))) {
return true;
}
}
return false;
}
};
}
}
}

View File

@ -36,6 +36,7 @@ import io.druid.granularity.QueryGranularity;
import io.druid.jackson.DefaultObjectMapper;
import io.druid.query.BySegmentResultValue;
import io.druid.query.BySegmentResultValueClass;
import io.druid.query.Druids;
import io.druid.query.FinalizeResultsQueryRunner;
import io.druid.query.Query;
import io.druid.query.QueryRunner;
@ -63,6 +64,7 @@ import io.druid.query.extraction.MapLookupExtractor;
import io.druid.query.extraction.RegexDimExtractionFn;
import io.druid.query.extraction.TimeFormatExtractionFn;
import io.druid.query.filter.DimFilter;
import io.druid.query.filter.ExtractionDimFilter;
import io.druid.query.filter.JavaScriptDimFilter;
import io.druid.query.filter.OrDimFilter;
import io.druid.query.filter.RegexDimFilter;
@ -243,6 +245,205 @@ public class GroupByQueryRunnerTest
TestHelper.assertExpectedObjects(expectedResults, results, "");
}
@Test
public void testGroupByWithExtractionDimFilterOptimazitionWithEmptyResult()
{
Map<String, String> extractionMap = new HashMap<>();
extractionMap.put("automotive", "automotive0");
extractionMap.put("business", "business0");
extractionMap.put("entertainment", "entertainment0");
extractionMap.put("health", "health0");
extractionMap.put("mezzanine", "mezzanine0");
extractionMap.put("news", "news0");
extractionMap.put("premium", "premium0");
extractionMap.put("technology", "technology0");
extractionMap.put("travel", "travel0");
MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap);
LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true);
List<DimFilter> dimFilters = Lists.<DimFilter>newArrayList(
new ExtractionDimFilter("quality", "Missing_value", lookupExtractionFn, null),
new ExtractionDimFilter("quality", "business0", lookupExtractionFn, null),
new SelectorDimFilter("quality", "entertainment"),
new SelectorDimFilter("quality", "health"),
new ExtractionDimFilter("quality", "mezzanine0", lookupExtractionFn, null),
new ExtractionDimFilter("quality", "news0", lookupExtractionFn, null),
new SelectorDimFilter("quality", "premium"),
new SelectorDimFilter("quality", "technology"),
new SelectorDimFilter("quality", "travel")
);
GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource)
.setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird)
.setDimensions(Lists.<DimensionSpec>newArrayList(new DefaultDimensionSpec("quality", "alias")))
.setAggregatorSpecs(
Arrays.asList(QueryRunnerTestHelper.rowsCount, new LongSumAggregatorFactory("idx", "index")))
.setGranularity(QueryRunnerTestHelper.dayGran)
.setDimFilter(Druids.newOrDimFilterBuilder().fields(dimFilters).build())
.build();
List<Row> expectedResults = Arrays.asList(
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "business", "rows", 1L, "idx", 118L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "entertainment", "rows", 1L, "idx", 158L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "health", "rows", 1L, "idx", 120L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "mezzanine", "rows", 3L, "idx", 2870L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "news", "rows", 1L, "idx", 121L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "premium", "rows", 3L, "idx", 2900L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "technology", "rows", 1L, "idx", 78L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "travel", "rows", 1L, "idx", 119L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "business", "rows", 1L, "idx", 112L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "entertainment", "rows", 1L, "idx", 166L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "health", "rows", 1L, "idx", 113L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "mezzanine", "rows", 3L, "idx", 2447L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "news", "rows", 1L, "idx", 114L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "premium", "rows", 3L, "idx", 2505L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "technology", "rows", 1L, "idx", 97L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "travel", "rows", 1L, "idx", 126L));
Iterable<Row> results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query);
TestHelper.assertExpectedObjects(expectedResults, results, "");
}
@Test
public void testGroupByWithExtractionDimFilterOptimazitionOneToOne()
{
Map<String, String> extractionMap = new HashMap<>();
extractionMap.put("automotive", "automotive0");
extractionMap.put("business", "business0");
extractionMap.put("entertainment", "entertainment0");
extractionMap.put("health", "health0");
extractionMap.put("mezzanine", "mezzanine0");
extractionMap.put("news", "news0");
extractionMap.put("premium", "premium0");
extractionMap.put("technology", "technology0");
extractionMap.put("travel", "travel0");
MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap);
LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true);
List<DimFilter> dimFilters = Lists.<DimFilter>newArrayList(
new ExtractionDimFilter("quality", "automotive0", lookupExtractionFn, null),
new ExtractionDimFilter("quality", "business0", lookupExtractionFn, null),
new SelectorDimFilter("quality", "entertainment"),
new SelectorDimFilter("quality", "health"),
new ExtractionDimFilter("quality", "mezzanine0", lookupExtractionFn, null),
new ExtractionDimFilter("quality", "news0", lookupExtractionFn, null),
new SelectorDimFilter("quality", "premium"),
new SelectorDimFilter("quality", "technology"),
new SelectorDimFilter("quality", "travel")
);
GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource)
.setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird)
.setDimensions(Lists.<DimensionSpec>newArrayList(new DefaultDimensionSpec("quality", "alias")))
.setAggregatorSpecs(
Arrays.asList(QueryRunnerTestHelper.rowsCount, new LongSumAggregatorFactory("idx", "index")))
.setGranularity(QueryRunnerTestHelper.dayGran)
.setDimFilter(Druids.newOrDimFilterBuilder().fields(dimFilters).build())
.build();
List<Row> expectedResults = Arrays.asList(
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "automotive", "rows", 1L, "idx", 135L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "business", "rows", 1L, "idx", 118L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "entertainment", "rows", 1L, "idx", 158L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "health", "rows", 1L, "idx", 120L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "mezzanine", "rows", 3L, "idx", 2870L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "news", "rows", 1L, "idx", 121L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "premium", "rows", 3L, "idx", 2900L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "technology", "rows", 1L, "idx", 78L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "travel", "rows", 1L, "idx", 119L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "automotive", "rows", 1L, "idx", 147L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "business", "rows", 1L, "idx", 112L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "entertainment", "rows", 1L, "idx", 166L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "health", "rows", 1L, "idx", 113L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "mezzanine", "rows", 3L, "idx", 2447L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "news", "rows", 1L, "idx", 114L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "premium", "rows", 3L, "idx", 2505L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "technology", "rows", 1L, "idx", 97L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "travel", "rows", 1L, "idx", 126L));
Iterable<Row> results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query);
TestHelper.assertExpectedObjects(expectedResults, results, "");
}
@Test
public void testGroupByWithExtractionDimFilterOptimazitionManyToOne()
{
Map<String, String> extractionMap = new HashMap<>();
extractionMap.put("mezzanine", "newsANDmezzanine");
extractionMap.put("news", "newsANDmezzanine");
MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap);
LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true);
GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource)
.setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird)
.setDimensions(Lists.<DimensionSpec>newArrayList(new DefaultDimensionSpec("quality", "alias")))
.setAggregatorSpecs(
Arrays.asList(QueryRunnerTestHelper.rowsCount, new LongSumAggregatorFactory("idx", "index")))
.setGranularity(QueryRunnerTestHelper.dayGran)
.setDimFilter(new ExtractionDimFilter("quality", "newsANDmezzanine", lookupExtractionFn, null))
.build();
List<Row> expectedResults = Arrays.asList(
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "mezzanine", "rows", 3L, "idx", 2870L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", "news", "rows", 1L, "idx", 121L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "mezzanine", "rows", 3L, "idx", 2447L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", "news", "rows", 1L, "idx", 114L));
Iterable<Row> results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query);
TestHelper.assertExpectedObjects(expectedResults, results, "");
}
@Test public void testGroupByWithExtractionDimFilterEmptyResult()
{
Map<String, String> extractionMap = new HashMap<>();
extractionMap.put("mezzanine", "mezzanine0");
MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap);
LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true);
GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource)
.setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird)
.setDimensions(Lists.<DimensionSpec>newArrayList(new DefaultDimensionSpec("quality", "alias")))
.setAggregatorSpecs(
Arrays.asList(QueryRunnerTestHelper.rowsCount, new LongSumAggregatorFactory("idx", "index")))
.setGranularity(QueryRunnerTestHelper.dayGran)
.setDimFilter(new ExtractionDimFilter("quality", "NOT_THERE", lookupExtractionFn, null)).build();
List<Row> expectedResults = Arrays.asList();
Iterable<Row> results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query);
TestHelper.assertExpectedObjects(expectedResults, results, "");
}
@Test public void testGroupByWithExtractionDimFilterNullDims()
{
Map<String, String> extractionMap = new HashMap<>();
extractionMap.put("", "EMPTY");
MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap);
LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true);
GroupByQuery query = GroupByQuery.builder().setDataSource(QueryRunnerTestHelper.dataSource)
.setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird)
.setDimensions(Lists.<DimensionSpec>newArrayList(new DefaultDimensionSpec("null_column", "alias")))
.setAggregatorSpecs(
Arrays.asList(QueryRunnerTestHelper.rowsCount, new LongSumAggregatorFactory("idx", "index")))
.setGranularity(QueryRunnerTestHelper.dayGran)
.setDimFilter(new ExtractionDimFilter("null_column", "EMPTY", lookupExtractionFn, null)).build();
List<Row> expectedResults = Arrays
.asList(GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-01", "alias", null, "rows", 13L, "idx", 6619L),
GroupByQueryRunnerTestHelper.createExpectedRow("2011-04-02", "alias", null, "rows", 13L, "idx", 5827L));
Iterable<Row> results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query);
TestHelper.assertExpectedObjects(expectedResults, results, "");
}
@Test
public void testGroupByWithRebucketRename()
{

View File

@ -41,6 +41,7 @@ import io.druid.query.TestQueryRunners;
import io.druid.query.aggregation.AggregatorFactory;
import io.druid.query.aggregation.DoubleMaxAggregatorFactory;
import io.druid.query.aggregation.DoubleMinAggregatorFactory;
import io.druid.query.aggregation.FilteredAggregatorFactory;
import io.druid.query.aggregation.PostAggregator;
import io.druid.query.aggregation.cardinality.CardinalityAggregatorFactory;
import io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory;
@ -54,6 +55,7 @@ import io.druid.query.extraction.RegexDimExtractionFn;
import io.druid.query.extraction.TimeFormatExtractionFn;
import io.druid.query.filter.AndDimFilter;
import io.druid.query.filter.DimFilter;
import io.druid.query.filter.ExtractionDimFilter;
import io.druid.query.filter.SelectorDimFilter;
import io.druid.query.spec.MultipleIntervalSegmentSpec;
import io.druid.query.timeseries.TimeseriesQuery;
@ -158,9 +160,7 @@ public class TopNQueryRunnerTest
QueryRunnerTestHelper.NoopIntervalChunkingQueryRunnerDecorator()
);
final QueryRunner<Result<TopNResultValue>> mergeRunner = chest.mergeResults(runner);
return mergeRunner.run(
query, context
);
return mergeRunner.run(query, context);
}
@Test
@ -3149,4 +3149,91 @@ public class TopNQueryRunnerTest
);
TestHelper.assertExpectedResults(expectedResults, runner.run(query, new HashMap<String, Object>()));
}
@Test
public void testTopNWithExtractionFilter()
{
Map<String, String> extractionMap = new HashMap<>();
extractionMap.put("spot", "spot0");
MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap);
LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true);
TopNQuery query = new TopNQueryBuilder().dataSource(QueryRunnerTestHelper.dataSource)
.granularity(QueryRunnerTestHelper.allGran)
.dimension(QueryRunnerTestHelper.marketDimension)
.metric("rows")
.threshold(3)
.intervals(QueryRunnerTestHelper.firstToThird)
.aggregators(QueryRunnerTestHelper.commonAggregators)
.postAggregators(Arrays.<PostAggregator>asList(QueryRunnerTestHelper.addRowsIndexConstant))
.filters(new ExtractionDimFilter(QueryRunnerTestHelper.marketDimension, "spot0", lookupExtractionFn, null))
.build();
List<Result<TopNResultValue>> expectedResults = Arrays.asList(
new Result<>(
new DateTime("2011-04-01T00:00:00.000Z"),
new TopNResultValue(
Arrays.<Map<String, Object>>asList(
ImmutableMap.<String, Object>of(
QueryRunnerTestHelper.marketDimension, "spot",
"rows", 18L,
"index", 2231.8768157958984D,
"addRowsIndexConstant", 2250.8768157958984D,
"uniques", QueryRunnerTestHelper.UNIQUES_9
)
)
)
)
);
assertExpectedResults(expectedResults, query);
}
@Test
public void testTopNWithExtractionFilterNoExistingValue()
{
Map<String, String> extractionMap = new HashMap<>();
extractionMap.put("","NULL");
MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap);
LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true);
DimFilter extractionFilter = new ExtractionDimFilter("null_column", "NULL", lookupExtractionFn, null);
TopNQueryBuilder topNQueryBuilder = new TopNQueryBuilder()
.dataSource(QueryRunnerTestHelper.dataSource)
.granularity(QueryRunnerTestHelper.allGran)
.dimension("null_column")
.metric(QueryRunnerTestHelper.indexMetric)
.threshold(4)
.intervals(QueryRunnerTestHelper.fullOnInterval)
.aggregators(Lists.newArrayList(Iterables.concat(QueryRunnerTestHelper.commonAggregators, Lists.newArrayList(
new FilteredAggregatorFactory(new DoubleMaxAggregatorFactory("maxIndex", "index"),
extractionFilter),
//new DoubleMaxAggregatorFactory("maxIndex", "index"),
new DoubleMinAggregatorFactory("minIndex", "index")))))
.postAggregators(Arrays.<PostAggregator>asList(QueryRunnerTestHelper.addRowsIndexConstant));
TopNQuery topNQueryWithNULLValueExtraction = topNQueryBuilder
.filters(extractionFilter)
.build();
Map<String, Object> map = Maps.newHashMap();
map.put("null_column", null);
map.put("rows", 1209L);
map.put("index", 503332.5071372986D);
map.put("addRowsIndexConstant", 504542.5071372986D);
map.put("uniques", QueryRunnerTestHelper.UNIQUES_9);
map.put("maxIndex", 1870.06103515625D);
map.put("minIndex", 59.02102279663086D);
List<Result<TopNResultValue>> expectedResults = Arrays.asList(
new Result<>(
new DateTime("2011-01-12T00:00:00.000Z"),
new TopNResultValue(
Arrays.asList(
map
)
)
)
);
assertExpectedResults(expectedResults, topNQueryWithNULLValueExtraction);
}
}