support cascade execution of extraction filters in extraction dimension spec

Keuntae Park 2016-01-06 17:15:08 +09:00
parent f6a1a4ae20
commit 238dd3be3c
5 changed files with 462 additions and 3 deletions

View File

@@ -308,6 +308,39 @@ This allows distinguishing between a null dimension and a lookup resulting in a
For example, specifying `{"":"bar","bat":"baz"}` with dimension values `[null, "foo", "bat"]` and replacing missing values with `"oof"` will yield results of `["bar", "oof", "baz"]`.
Omitting the empty string key will cause the missing value to take over. For example, specifying `{"bat":"baz"}` with dimension values `[null, "foo", "bat"]` and replacing missing values with `"oof"` will yield results of `["oof", "oof", "baz"]`.
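A sketch of how the second mapping above might be written as a map-backed lookup extraction function (field names follow the lookup extraction function described earlier on this page; treat the exact spelling as illustrative):
```json
{
  "type" : "lookup",
  "lookup" : { "type" : "map", "map" : { "bat" : "baz" } },
  "retainMissingValue" : false,
  "replaceMissingValueWith" : "oof"
}
```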
### Cascade Extraction Function
Provides chained execution of extraction functions.
The `extractionFns` property contains an array of extraction functions, which are executed in array index order.
An example chaining the [regular expression extraction function](#regular-expression-extraction-function), the [javascript extraction function](#javascript-extraction-function), and the [substring extraction function](#substring-extraction-function) follows.
```json
{
  "type" : "cascade",
  "extractionFns": [
    {
      "type" : "regex",
      "expr" : "/([^/]+)/",
      "replaceMissingValues": false,
      "replaceMissingValuesWith": null
    },
    {
      "type" : "javascript",
      "function" : "function(str) { return \"the \".concat(str) }"
    },
    {
      "type" : "substring",
      "index" : 0, "length" : 7
    }
  ]
}
```
It transforms dimension values with the specified extraction functions, applied in the order given.
For example, `'/druid/prod/historical'` is transformed to `'the dru'`: the regular expression extraction function first transforms it to `'druid'`, the javascript extraction function then transforms it to `'the druid'`, and finally the substring extraction function transforms it to `'the dru'`.
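Since the cascade is just another extraction function, it can be plugged into an extraction dimension spec like any other. A sketch of such a spec (the dimension name `server` and output name `serverPrefix` are illustrative):
```json
{
  "type" : "extraction",
  "dimension" : "server",
  "outputName" : "serverPrefix",
  "extractionFn" : {
    "type" : "cascade",
    "extractionFns" : [
      { "type" : "regex", "expr" : "/([^/]+)/", "replaceMissingValues" : false, "replaceMissingValuesWith" : null },
      { "type" : "javascript", "function" : "function(str) { return \"the \".concat(str) }" },
      { "type" : "substring", "index" : 0, "length" : 7 }
    ]
  }
}
```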
### Filtering DimensionSpecs
These are only valid for multi-valued dimensions. If you have a row in Druid that has a multi-valued dimension with values ["v1", "v2", "v3"] and you send a groupBy/topN query grouping by that dimension with a [query filter](filter.html) for value "v1", the response will contain 3 rows, for "v1", "v2" and "v3". This behavior might be unintuitive for some use cases.
@@ -317,7 +350,7 @@ Then groupBy/topN processing pipeline "explodes" all multi-valued dimensions res
In addition to "query filter" which efficiently selects the rows to be processed, you can use the filtering dimension spec to filter for specific values within the values of a multi-valued dimension. These dimensionSpecs take a delegate DimensionSpec and a filtering criteria. From the "exploded" rows, only rows matching the given filtering criteria are returned in the query result.
The following filtered dimension spec acts as a whiltelist or blacklist for values as per the "isWhitelist" attribute value.
The following filtered dimension spec acts as a whitelist or blacklist for values as per the "isWhitelist" attribute value.
```json
{ "type" : "listFiltered", "delegate" : <dimensionSpec>, "values": <array of strings>, "isWhitelist": <optional attribute for true/false, default is true> }
```
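For instance, a whitelist keeping only "v1" and "v3" from the example above might look as follows (the delegate dimension name `tags` is illustrative, and `isWhitelist` is omitted because it defaults to true):
```json
{
  "type" : "listFiltered",
  "delegate" : { "type" : "default", "dimension" : "tags", "outputName" : "tags" },
  "values" : ["v1", "v3"]
}
```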

View File

@@ -207,7 +207,7 @@ The `simpleJson` lookupParseSpec does not take any parameters. It is simply a li
```json
"namespaceParseSpec":{
"type": "simpleJson"
"format": "simpleJson"
}
```

View File

@@ -0,0 +1,236 @@
/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.druid.query.extraction;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.primitives.Bytes;

import java.util.Arrays;

/**
 * Chains several ExtractionFns together; each function receives the output of the previous one,
 * in the order given by the "extractionFns" array.
 */
public class CascadeExtractionFn implements ExtractionFn
{
  private static final byte CACHE_TYPE_ID = 0x9;

  private final ExtractionFn extractionFns[];
  private final ChainedExtractionFn chainedExtractionFn;

  // fallback used when the extractionFns array is empty; always maps input to null
  private final ChainedExtractionFn DEFAULT_CHAINED_EXTRACTION_FN = new ChainedExtractionFn(
      new ExtractionFn()
      {
        public byte[] getCacheKey()
        {
          return new byte[0];
        }

        public String apply(Object value)
        {
          return null;
        }

        public String apply(String value)
        {
          return null;
        }

        public String apply(long value)
        {
          return null;
        }

        public boolean preservesOrdering()
        {
          return false;
        }

        public ExtractionType getExtractionType()
        {
          return ExtractionType.MANY_TO_ONE;
        }

        @Override
        public String toString()
        {
          return "nullExtractionFn{}";
        }
      },
      null
  );

  @JsonCreator
  public CascadeExtractionFn(
      @JsonProperty("extractionFns") ExtractionFn[] extractionFn
  )
  {
    Preconditions.checkArgument(extractionFn != null, "extractionFns should not be null");
    this.extractionFns = extractionFn;
    if (extractionFns.length == 0) {
      this.chainedExtractionFn = DEFAULT_CHAINED_EXTRACTION_FN;
    } else {
      // build the chain back to front so that the first array element is applied first
      ChainedExtractionFn root = null;
      for (int idx = 0; idx < extractionFns.length; idx++) {
        Preconditions.checkArgument(extractionFns[idx] != null, "empty function is not allowed");
        root = new ChainedExtractionFn(extractionFns[idx], root);
      }
      this.chainedExtractionFn = root;
    }
  }

  @JsonProperty
  public ExtractionFn[] getExtractionFns()
  {
    return extractionFns;
  }

  @Override
  public byte[] getCacheKey()
  {
    byte[] cacheKey = new byte[] {CACHE_TYPE_ID};
    return Bytes.concat(cacheKey, chainedExtractionFn.getCacheKey());
  }

  @Override
  public String apply(Object value)
  {
    return chainedExtractionFn.apply(value);
  }

  @Override
  public String apply(String value)
  {
    return chainedExtractionFn.apply(value);
  }

  @Override
  public String apply(long value)
  {
    return chainedExtractionFn.apply(value);
  }

  @Override
  public boolean preservesOrdering()
  {
    return chainedExtractionFn.preservesOrdering();
  }

  @Override
  public ExtractionType getExtractionType()
  {
    return chainedExtractionFn.getExtractionType();
  }

  @Override
  public boolean equals(Object o)
  {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }

    CascadeExtractionFn that = (CascadeExtractionFn) o;

    if (!Arrays.equals(extractionFns, that.extractionFns)) {
      return false;
    }
    if (!chainedExtractionFn.equals(that.chainedExtractionFn)) {
      return false;
    }

    return true;
  }

  @Override
  public int hashCode()
  {
    return chainedExtractionFn.hashCode();
  }

  @Override
  public String toString()
  {
    return "CascadeExtractionFn{" +
           "extractionFns=[" + chainedExtractionFn.toString() + "]}";
  }

  // singly-linked chain node: "child" is applied first, then "fn"
  private class ChainedExtractionFn
  {
    private final ExtractionFn fn;
    private final ChainedExtractionFn child;

    public ChainedExtractionFn(ExtractionFn fn, ChainedExtractionFn child)
    {
      this.fn = fn;
      this.child = child;
    }

    public byte[] getCacheKey()
    {
      byte[] fnCacheKey = fn.getCacheKey();
      return (child != null) ? Bytes.concat(fnCacheKey, child.getCacheKey()) : fnCacheKey;
    }

    public String apply(Object value)
    {
      return fn.apply((child != null) ? child.apply(value) : value);
    }

    public String apply(String value)
    {
      return fn.apply((child != null) ? child.apply(value) : value);
    }

    public String apply(long value)
    {
      return fn.apply((child != null) ? child.apply(value) : value);
    }

    public boolean preservesOrdering()
    {
      boolean childPreservesOrdering = (child == null) || child.preservesOrdering();
      return fn.preservesOrdering() && childPreservesOrdering;
    }

    public ExtractionType getExtractionType()
    {
      if (child != null && child.getExtractionType() == ExtractionType.MANY_TO_ONE) {
        return ExtractionType.MANY_TO_ONE;
      } else {
        return fn.getExtractionType();
      }
    }

    public boolean equals(Object o)
    {
      if (this == o) {
        return true;
      }
      if (o == null || getClass() != o.getClass()) {
        return false;
      }

      ChainedExtractionFn that = (ChainedExtractionFn) o;

      if (!fn.equals(that.fn)) {
        return false;
      }
      if (child != null && !child.equals(that.child)) {
        return false;
      }

      return true;
    }

    public int hashCode()
    {
      int result = fn.hashCode();
      if (child != null) {
        result = 31 * result + child.hashCode();
      }
      return result;
    }

    public String toString()
    {
      return (child != null)
             ? Joiner.on(",").join(child.toString(), fn.toString())
             : fn.toString();
    }
  }
}

View File

@@ -34,7 +34,8 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo;
@JsonSubTypes.Type(name = "timeFormat", value = TimeFormatExtractionFn.class),
@JsonSubTypes.Type(name = "identity", value = IdentityExtractionFn.class),
@JsonSubTypes.Type(name = "lookup", value = LookupExtractionFn.class),
@JsonSubTypes.Type(name = "substring", value = SubstringDimExtractionFn.class)
@JsonSubTypes.Type(name = "substring", value = SubstringDimExtractionFn.class),
@JsonSubTypes.Type(name = "cascade", value = CascadeExtractionFn.class)
})
/**
* An ExtractionFn is a function that can be used to transform the values of a column (typically a dimension)

View File

@@ -0,0 +1,189 @@
/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.druid.query.extraction;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;
import io.druid.jackson.DefaultObjectMapper;
import org.junit.Assert;
import org.junit.Test;

import java.util.Arrays;
import java.util.Set;

public class CascadeExtractionFnTest
{
  private static final String[] paths = {
      "/druid/prod/historical",
      "/druid/prod/broker",
      "/druid/prod/coordinator",
      "/druid/demo/historical",
      "/druid/demo/broker",
      "/druid/demo/coordinator",
      "/dash/aloe",
      "/dash/baloo"
  };

  private final String regex = "/([^/]+)/";
  private final String function = "function(str) { return \"the \".concat(str) }";

  private final RegexDimExtractionFn regexDimExtractionFn = new RegexDimExtractionFn(regex, false, null);
  private final JavaScriptExtractionFn javascriptExtractionFn = new JavaScriptExtractionFn(function, true);
  private final SubstringDimExtractionFn substringDimExtractionFn = new SubstringDimExtractionFn(0, 7);

  private final String regexDimExtractionFnJson = "{ \"type\" : \"regex\", \"expr\" : \"/([^/]+)/\" , " +
                                                  "\"replaceMissingValues\": false, \"replaceMissingValuesWith\": null}";
  private final String javascriptExtractionFnJson =
      "{ \"type\" : \"javascript\", \"function\" : \"function(str) { return \\\"the \\\".concat(str) }\" }";
  private final String substringDimExtractionFnJson = "{ \"type\" : \"substring\", \"index\" : 0, \"length\" : 7 }";

  @Test
  public void testRegexAndJavascriptAndSubstring()
  {
    ExtractionFn[] fns = new ExtractionFn[3];
    fns[0] = regexDimExtractionFn;
    fns[1] = javascriptExtractionFn;
    fns[2] = substringDimExtractionFn;

    CascadeExtractionFn cascadeExtractionFn = new CascadeExtractionFn(fns);

    Set<String> extracted = Sets.newLinkedHashSet();
    for (String path : paths) {
      extracted.add(cascadeExtractionFn.apply(path));
    }

    Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("the dru", "the das"));
    Assert.assertEquals(expected, extracted);
  }

  @Test
  public void testGetCacheKey()
  {
    ExtractionFn[] fns1 = new ExtractionFn[2];
    fns1[0] = javascriptExtractionFn;
    fns1[1] = regexDimExtractionFn;
    CascadeExtractionFn cascadeExtractionFn1 = new CascadeExtractionFn(fns1);

    ExtractionFn[] fns2 = new ExtractionFn[2];
    fns2[0] = regexDimExtractionFn;
    fns2[1] = javascriptExtractionFn;
    CascadeExtractionFn cascadeExtractionFn2 = new CascadeExtractionFn(fns2);

    ExtractionFn[] fns3 = new ExtractionFn[3];
    fns3[0] = regexDimExtractionFn;
    fns3[1] = javascriptExtractionFn;
    fns3[2] = substringDimExtractionFn;
    CascadeExtractionFn cascadeExtractionFn3 = new CascadeExtractionFn(fns3);

    Assert.assertFalse(Arrays.equals(cascadeExtractionFn1.getCacheKey(), cascadeExtractionFn2.getCacheKey()));
    Assert.assertFalse(Arrays.equals(cascadeExtractionFn1.getCacheKey(), cascadeExtractionFn3.getCacheKey()));
    Assert.assertFalse(Arrays.equals(cascadeExtractionFn2.getCacheKey(), cascadeExtractionFn3.getCacheKey()));
  }

  @Test
  public void testHashCode()
  {
    ExtractionFn[] fns1 = new ExtractionFn[2];
    fns1[0] = javascriptExtractionFn;
    fns1[1] = regexDimExtractionFn;
    CascadeExtractionFn cascadeExtractionFn1 = new CascadeExtractionFn(fns1);

    ExtractionFn[] fns2 = new ExtractionFn[2];
    fns2[0] = regexDimExtractionFn;
    fns2[1] = javascriptExtractionFn;
    CascadeExtractionFn cascadeExtractionFn2 = new CascadeExtractionFn(fns2);

    ExtractionFn[] fns3 = new ExtractionFn[3];
    fns3[0] = regexDimExtractionFn;
    fns3[1] = javascriptExtractionFn;
    fns3[2] = substringDimExtractionFn;
    CascadeExtractionFn cascadeExtractionFn3 = new CascadeExtractionFn(fns3);

    Assert.assertNotEquals(cascadeExtractionFn1.hashCode(), cascadeExtractionFn2.hashCode());
    Assert.assertNotEquals(cascadeExtractionFn1.hashCode(), cascadeExtractionFn3.hashCode());
    Assert.assertNotEquals(cascadeExtractionFn2.hashCode(), cascadeExtractionFn3.hashCode());
  }

  @Test
  public void testPreservesOrdering()
  {
    ExtractionFn[] fns1 = new ExtractionFn[1];
    fns1[0] = substringDimExtractionFn;
    CascadeExtractionFn cascadeExtractionFn1 = new CascadeExtractionFn(fns1);

    ExtractionFn[] fns2 = new ExtractionFn[2];
    fns2[0] = regexDimExtractionFn;
    fns2[1] = substringDimExtractionFn;
    CascadeExtractionFn cascadeExtractionFn2 = new CascadeExtractionFn(fns2);

    ExtractionFn[] fns3 = new ExtractionFn[2];
    fns3[0] = substringDimExtractionFn;
    fns3[1] = javascriptExtractionFn;
    CascadeExtractionFn cascadeExtractionFn3 = new CascadeExtractionFn(fns3);

    Assert.assertTrue(cascadeExtractionFn1.preservesOrdering());
    Assert.assertFalse(cascadeExtractionFn2.preservesOrdering());
    Assert.assertFalse(cascadeExtractionFn3.preservesOrdering());
  }

  @Test
  public void testGetExtractionType()
  {
    ExtractionFn[] fns1 = new ExtractionFn[1];
    fns1[0] = javascriptExtractionFn;
    CascadeExtractionFn cascadeExtractionFn1 = new CascadeExtractionFn(fns1);

    ExtractionFn[] fns2 = new ExtractionFn[2];
    fns2[0] = regexDimExtractionFn;
    fns2[1] = javascriptExtractionFn;
    CascadeExtractionFn cascadeExtractionFn2 = new CascadeExtractionFn(fns2);

    Assert.assertTrue(cascadeExtractionFn1.getExtractionType() == ExtractionFn.ExtractionType.ONE_TO_ONE);
    Assert.assertTrue(cascadeExtractionFn2.getExtractionType() == ExtractionFn.ExtractionType.MANY_TO_ONE);
  }

  @Test
  public void testSerde() throws Exception
  {
    final ObjectMapper objectMapper = new DefaultObjectMapper();
    final String json = "{\"type\" : \"cascade\", \"extractionFns\": [" +
                        regexDimExtractionFnJson + "," + javascriptExtractionFnJson + "," + substringDimExtractionFnJson + "]}";

    CascadeExtractionFn cascadeExtractionFn = (CascadeExtractionFn) objectMapper.readValue(json, ExtractionFn.class);

    RegexDimExtractionFn regexDimExtractionFn =
        (RegexDimExtractionFn) objectMapper.readValue(regexDimExtractionFnJson, ExtractionFn.class);
    JavaScriptExtractionFn javascriptExtractionFn =
        (JavaScriptExtractionFn) objectMapper.readValue(javascriptExtractionFnJson, ExtractionFn.class);
    SubstringDimExtractionFn substringDimExtractionFn =
        (SubstringDimExtractionFn) objectMapper.readValue(substringDimExtractionFnJson, ExtractionFn.class);

    Assert.assertEquals(regexDimExtractionFn, cascadeExtractionFn.getExtractionFns()[0]);
    Assert.assertEquals(javascriptExtractionFn, cascadeExtractionFn.getExtractionFns()[1]);
    Assert.assertEquals(substringDimExtractionFn, cascadeExtractionFn.getExtractionFns()[2]);

    Assert.assertEquals(
        cascadeExtractionFn,
        objectMapper.readValue(
            objectMapper.writeValueAsBytes(cascadeExtractionFn),
            ExtractionFn.class
        )
    );
  }
}