mirror of https://github.com/apache/druid.git
support cascade execution of extraction filters in extraction dimension spec
This commit is contained in:
parent
f6a1a4ae20
commit
238dd3be3c
|
@ -308,6 +308,39 @@ This allows distinguishing between a null dimension and a lookup resulting in a
|
|||
For example, specifying `{"":"bar","bat":"baz"}` with dimension values `[null, "foo", "bat"]` and replacing missing values with `"oof"` will yield results of `["bar", "oof", "baz"]`.
|
||||
Omitting the empty string key will cause the missing value to take over. For example, specifying `{"bat":"baz"}` with dimension values `[null, "foo", "bat"]` and replacing missing values with `"oof"` will yield results of `["oof", "oof", "baz"]`.
|
||||
|
||||
### Cascade Extraction Function
|
||||
|
||||
Provides chained execution of extraction functions.
|
||||
|
||||
The `extractionFns` property contains an array of extraction functions, which are executed in array index order.
|
||||
|
||||
An example of chaining a [regular expression extraction function](#regular-expression-extraction-function), a [javascript extraction function](#javascript-extraction-function), and a [substring extraction function](#substring-extraction-function) is as follows.
|
||||
|
||||
```json
|
||||
{
|
||||
"type" : "cascade",
|
||||
"extractionFns": [
|
||||
{
|
||||
"type" : "regex",
|
||||
"expr" : "/([^/]+)/",
|
||||
"replaceMissingValues": false,
|
||||
"replaceMissingValuesWith": null
|
||||
},
|
||||
{
|
||||
"type" : "javascript",
|
||||
"function" : "function(str) { return \"the \".concat(str) }"
|
||||
},
|
||||
{
|
||||
"type" : "substring",
|
||||
"index" : 0, "length" : 7
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
It transforms dimension values by applying the specified extraction functions in the order in which they are listed.
|
||||
For example, `'/druid/prod/historical'` is transformed to `'the dru'`: the regular expression extraction function first transforms it to `'druid'`, then the javascript extraction function transforms that to `'the druid'`, and finally the substring extraction function transforms that to `'the dru'`.
|
||||
|
||||
### Filtering DimensionSpecs
|
||||
|
||||
These are only valid for multi-valued dimensions. If you have a row in druid that has a multi-valued dimension with values ["v1", "v2", "v3"] and you send a groupBy/topN query grouping by that dimension with a [query filter](filter.html) for value "v1", the response will contain 3 rows containing "v1", "v2" and "v3". This behavior might be unintuitive for some use cases.
|
||||
|
@ -317,7 +350,7 @@ Then groupBy/topN processing pipeline "explodes" all multi-valued dimensions res
|
|||
|
||||
In addition to "query filter" which efficiently selects the rows to be processed, you can use the filtering dimension spec to filter for specific values within the values of a multi-valued dimension. These dimensionSpecs take a delegate DimensionSpec and a filtering criteria. From the "exploded" rows, only rows matching the given filtering criteria are returned in the query result.
|
||||
|
||||
The following filtered dimension spec acts as a whiltelist or blacklist for values as per the "isWhitelist" attribute value.
|
||||
The following filtered dimension spec acts as a whitelist or blacklist for values as per the "isWhitelist" attribute value.
|
||||
```json
|
||||
{ "type" : "listFiltered", "delegate" : <dimensionSpec>, "values": <array of strings>, "isWhitelist": <optional attribute for true/false, default is true> }
|
||||
```
|
||||
|
|
|
@ -207,7 +207,7 @@ The `simpleJson` lookupParseSpec does not take any parameters. It is simply a li
|
|||
|
||||
```json
|
||||
"namespaceParseSpec":{
|
||||
"type": "simpleJson"
|
||||
"format": "simpleJson"
|
||||
}
|
||||
```
|
||||
|
||||
|
|
|
@ -0,0 +1,236 @@
|
|||
/*
|
||||
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. Metamarkets licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package io.druid.query.extraction;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonCreator;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.primitives.Bytes;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class CascadeExtractionFn implements ExtractionFn
|
||||
{
|
||||
private static final byte CACHE_TYPE_ID = 0x9;
|
||||
|
||||
private final ExtractionFn extractionFns[];
|
||||
private final ChainedExtractionFn chainedExtractionFn;
|
||||
private final ChainedExtractionFn DEFAULT_CHAINED_EXTRACTION_FN = new ChainedExtractionFn(
|
||||
new ExtractionFn() {
|
||||
public byte[] getCacheKey() {
|
||||
return new byte[0];
|
||||
}
|
||||
|
||||
public String apply(Object value) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public String apply(String value) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public String apply(long value) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public boolean preservesOrdering() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public ExtractionType getExtractionType() {
|
||||
return ExtractionType.MANY_TO_ONE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "nullExtractionFn{}";
|
||||
}
|
||||
},
|
||||
null
|
||||
);
|
||||
|
||||
@JsonCreator
|
||||
public CascadeExtractionFn(
|
||||
@JsonProperty("extractionFns") ExtractionFn[] extractionFn
|
||||
)
|
||||
{
|
||||
Preconditions.checkArgument(extractionFn != null, "extractionFns should not be null");
|
||||
this.extractionFns = extractionFn;
|
||||
if (extractionFns.length == 0) {
|
||||
this.chainedExtractionFn = DEFAULT_CHAINED_EXTRACTION_FN;
|
||||
} else {
|
||||
ChainedExtractionFn root = null;
|
||||
for (int idx = 0; idx < extractionFns.length; idx++) {
|
||||
Preconditions.checkArgument(extractionFns[idx] != null, "empty function is not allowed");
|
||||
root = new ChainedExtractionFn(extractionFns[idx], root);
|
||||
}
|
||||
this.chainedExtractionFn = root;
|
||||
}
|
||||
}
|
||||
|
||||
@JsonProperty
|
||||
public ExtractionFn[] getExtractionFns() {
|
||||
return extractionFns;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] getCacheKey() {
|
||||
byte[] cacheKey = new byte[] {CACHE_TYPE_ID};
|
||||
|
||||
return Bytes.concat(cacheKey, chainedExtractionFn.getCacheKey());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String apply(Object value) {
|
||||
return chainedExtractionFn.apply(value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String apply(String value){
|
||||
return chainedExtractionFn.apply(value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String apply(long value){
|
||||
return chainedExtractionFn.apply(value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean preservesOrdering(){
|
||||
return chainedExtractionFn.preservesOrdering();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExtractionType getExtractionType(){
|
||||
return chainedExtractionFn.getExtractionType();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o)
|
||||
{
|
||||
if (this == o) {
|
||||
return true;
|
||||
}
|
||||
if (o == null || getClass() != o.getClass()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
CascadeExtractionFn that = (CascadeExtractionFn) o;
|
||||
|
||||
if (!Arrays.equals(extractionFns, that.extractionFns)) {
|
||||
return false;
|
||||
}
|
||||
if (!chainedExtractionFn.equals(that.chainedExtractionFn)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode()
|
||||
{
|
||||
return chainedExtractionFn.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "CascadeExtractionFn{" +
|
||||
"extractionFns=[" + chainedExtractionFn.toString() + "]}";
|
||||
}
|
||||
|
||||
private class ChainedExtractionFn {
|
||||
private final ExtractionFn fn;
|
||||
private final ChainedExtractionFn child;
|
||||
|
||||
public ChainedExtractionFn(ExtractionFn fn, ChainedExtractionFn child) {
|
||||
this.fn = fn;
|
||||
this.child = child;
|
||||
}
|
||||
|
||||
public byte[] getCacheKey() {
|
||||
byte[] fnCacheKey = fn.getCacheKey();
|
||||
|
||||
return (child != null) ? Bytes.concat(fnCacheKey, child.getCacheKey()) : fnCacheKey;
|
||||
}
|
||||
|
||||
public String apply(Object value) {
|
||||
return fn.apply((child != null) ? child.apply(value) : value);
|
||||
}
|
||||
|
||||
public String apply(String value){
|
||||
return fn.apply((child != null) ? child.apply(value) : value);
|
||||
}
|
||||
|
||||
public String apply(long value){
|
||||
return fn.apply((child != null) ? child.apply(value) : value);
|
||||
}
|
||||
|
||||
public boolean preservesOrdering(){
|
||||
boolean childPreservesOrdering = (child == null) || child.preservesOrdering();
|
||||
return fn.preservesOrdering() && childPreservesOrdering;
|
||||
}
|
||||
|
||||
public ExtractionType getExtractionType(){
|
||||
if (child != null && child.getExtractionType() == ExtractionType.MANY_TO_ONE) {
|
||||
return ExtractionType.MANY_TO_ONE;
|
||||
} else {
|
||||
return fn.getExtractionType();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean equals(Object o)
|
||||
{
|
||||
if (this == o) {
|
||||
return true;
|
||||
}
|
||||
if (o == null || getClass() != o.getClass()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ChainedExtractionFn that = (ChainedExtractionFn) o;
|
||||
|
||||
if (!fn.equals(that.fn)) {
|
||||
return false;
|
||||
}
|
||||
if (child != null && !child.equals(that.child)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public int hashCode()
|
||||
{
|
||||
int result = fn.hashCode();
|
||||
if (child != null) {
|
||||
result = 31 * result + child.hashCode();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return (child != null)
|
||||
? Joiner.on(",").join(child.toString(), fn.toString())
|
||||
: fn.toString();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -34,7 +34,8 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo;
|
|||
@JsonSubTypes.Type(name = "timeFormat", value = TimeFormatExtractionFn.class),
|
||||
@JsonSubTypes.Type(name = "identity", value = IdentityExtractionFn.class),
|
||||
@JsonSubTypes.Type(name = "lookup", value = LookupExtractionFn.class),
|
||||
@JsonSubTypes.Type(name = "substring", value = SubstringDimExtractionFn.class)
|
||||
@JsonSubTypes.Type(name = "substring", value = SubstringDimExtractionFn.class),
|
||||
@JsonSubTypes.Type(name = "cascade", value = CascadeExtractionFn.class)
|
||||
})
|
||||
/**
|
||||
* An ExtractionFn is a function that can be used to transform the values of a column (typically a dimension)
|
||||
|
|
|
@ -0,0 +1,189 @@
|
|||
/*
|
||||
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. Metamarkets licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package io.druid.query.extraction;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.Sets;
|
||||
import io.druid.jackson.DefaultObjectMapper;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
|
||||
public class CascadeExtractionFnTest {
|
||||
private static final String[] paths = {
|
||||
"/druid/prod/historical",
|
||||
"/druid/prod/broker",
|
||||
"/druid/prod/coordinator",
|
||||
"/druid/demo/historical",
|
||||
"/druid/demo/broker",
|
||||
"/druid/demo/coordinator",
|
||||
"/dash/aloe",
|
||||
"/dash/baloo"
|
||||
};
|
||||
|
||||
private final String regex = "/([^/]+)/";
|
||||
private final String function = "function(str) { return \"the \".concat(str) }";
|
||||
private final RegexDimExtractionFn regexDimExtractionFn = new RegexDimExtractionFn(regex, false, null);
|
||||
private final JavaScriptExtractionFn javascriptExtractionFn = new JavaScriptExtractionFn(function, true);
|
||||
private final SubstringDimExtractionFn substringDimExtractionFn = new SubstringDimExtractionFn(0, 7);
|
||||
private final String regexDimExtractionFnJson = "{ \"type\" : \"regex\", \"expr\" : \"/([^/]+)/\" , " +
|
||||
"\"replaceMissingValues\": false, \"replaceMissingValuesWith\": null}";
|
||||
private final String javascriptExtractionFnJson =
|
||||
"{ \"type\" : \"javascript\", \"function\" : \"function(str) { return \\\"the \\\".concat(str) }\" }";
|
||||
private final String substringDimExtractionFnJson = "{ \"type\" : \"substring\", \"index\" : 0, \"length\" : 7 }";
|
||||
|
||||
@Test
|
||||
public void testRegexAndJavascriptAndSubstring()
|
||||
{
|
||||
ExtractionFn[] fns = new ExtractionFn[3];
|
||||
fns[0] = regexDimExtractionFn;
|
||||
fns[1] = javascriptExtractionFn;
|
||||
fns[2] = substringDimExtractionFn;
|
||||
|
||||
CascadeExtractionFn cascadeExtractionFn = new CascadeExtractionFn(fns);
|
||||
|
||||
Set<String> extracted = Sets.newLinkedHashSet();
|
||||
for (String path: paths) {
|
||||
extracted.add(cascadeExtractionFn.apply(path));
|
||||
}
|
||||
|
||||
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("the dru", "the das"));
|
||||
Assert.assertEquals(expected, extracted);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetCacheKey()
|
||||
{
|
||||
ExtractionFn[] fns1 = new ExtractionFn[2];
|
||||
fns1[0] = javascriptExtractionFn;
|
||||
fns1[1] = regexDimExtractionFn;
|
||||
CascadeExtractionFn cascadeExtractionFn1 = new CascadeExtractionFn(fns1);
|
||||
|
||||
ExtractionFn[] fns2 = new ExtractionFn[2];
|
||||
fns2[0] = regexDimExtractionFn;
|
||||
fns2[1] = javascriptExtractionFn;
|
||||
CascadeExtractionFn cascadeExtractionFn2 = new CascadeExtractionFn(fns2);
|
||||
|
||||
ExtractionFn[] fns3 = new ExtractionFn[3];
|
||||
fns3[0] = regexDimExtractionFn;
|
||||
fns3[1] = javascriptExtractionFn;
|
||||
fns3[2] = substringDimExtractionFn;
|
||||
CascadeExtractionFn cascadeExtractionFn3 = new CascadeExtractionFn(fns3);
|
||||
|
||||
|
||||
Assert.assertFalse(Arrays.equals(cascadeExtractionFn1.getCacheKey(), cascadeExtractionFn2.getCacheKey()));
|
||||
Assert.assertFalse(Arrays.equals(cascadeExtractionFn1.getCacheKey(), cascadeExtractionFn3.getCacheKey()));
|
||||
Assert.assertFalse(Arrays.equals(cascadeExtractionFn2.getCacheKey(), cascadeExtractionFn3.getCacheKey()));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHashCode()
|
||||
{
|
||||
ExtractionFn[] fns1 = new ExtractionFn[2];
|
||||
fns1[0] = javascriptExtractionFn;
|
||||
fns1[1] = regexDimExtractionFn;
|
||||
CascadeExtractionFn cascadeExtractionFn1 = new CascadeExtractionFn(fns1);
|
||||
|
||||
ExtractionFn[] fns2 = new ExtractionFn[2];
|
||||
fns2[0] = regexDimExtractionFn;
|
||||
fns2[1] = javascriptExtractionFn;
|
||||
CascadeExtractionFn cascadeExtractionFn2 = new CascadeExtractionFn(fns2);
|
||||
|
||||
ExtractionFn[] fns3 = new ExtractionFn[3];
|
||||
fns3[0] = regexDimExtractionFn;
|
||||
fns3[1] = javascriptExtractionFn;
|
||||
fns3[2] = substringDimExtractionFn;
|
||||
CascadeExtractionFn cascadeExtractionFn3 = new CascadeExtractionFn(fns3);
|
||||
|
||||
Assert.assertNotEquals(cascadeExtractionFn1.hashCode(), cascadeExtractionFn2.hashCode());
|
||||
Assert.assertNotEquals(cascadeExtractionFn1.hashCode(), cascadeExtractionFn3.hashCode());
|
||||
Assert.assertNotEquals(cascadeExtractionFn2.hashCode(), cascadeExtractionFn3.hashCode());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPreservesOrdering()
|
||||
{
|
||||
ExtractionFn[] fns1 = new ExtractionFn[1];
|
||||
fns1[0] = substringDimExtractionFn;
|
||||
CascadeExtractionFn cascadeExtractionFn1 = new CascadeExtractionFn(fns1);
|
||||
|
||||
ExtractionFn[] fns2 = new ExtractionFn[2];
|
||||
fns2[0] = regexDimExtractionFn;
|
||||
fns2[1] = substringDimExtractionFn;
|
||||
CascadeExtractionFn cascadeExtractionFn2 = new CascadeExtractionFn(fns2);
|
||||
|
||||
ExtractionFn[] fns3 = new ExtractionFn[2];
|
||||
fns3[0] = substringDimExtractionFn;
|
||||
fns3[1] = javascriptExtractionFn;
|
||||
CascadeExtractionFn cascadeExtractionFn3 = new CascadeExtractionFn(fns3);
|
||||
|
||||
Assert.assertTrue(cascadeExtractionFn1.preservesOrdering());
|
||||
Assert.assertFalse(cascadeExtractionFn2.preservesOrdering());
|
||||
Assert.assertFalse(cascadeExtractionFn3.preservesOrdering());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetExtractionType()
|
||||
{
|
||||
ExtractionFn[] fns1 = new ExtractionFn[1];
|
||||
fns1[0] = javascriptExtractionFn;
|
||||
CascadeExtractionFn cascadeExtractionFn1 = new CascadeExtractionFn(fns1);
|
||||
|
||||
ExtractionFn[] fns2 = new ExtractionFn[2];
|
||||
fns2[0] = regexDimExtractionFn;
|
||||
fns2[1] = javascriptExtractionFn;
|
||||
CascadeExtractionFn cascadeExtractionFn2 = new CascadeExtractionFn(fns2);
|
||||
|
||||
Assert.assertTrue(cascadeExtractionFn1.getExtractionType() == ExtractionFn.ExtractionType.ONE_TO_ONE);
|
||||
Assert.assertTrue(cascadeExtractionFn2.getExtractionType() == ExtractionFn.ExtractionType.MANY_TO_ONE);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSerde() throws Exception
|
||||
{
|
||||
final ObjectMapper objectMapper = new DefaultObjectMapper();
|
||||
|
||||
final String json = "{\"type\" : \"cascade\", \"extractionFns\": ["+
|
||||
regexDimExtractionFnJson + "," + javascriptExtractionFnJson + "," + substringDimExtractionFnJson + "]}";
|
||||
|
||||
CascadeExtractionFn cascadeExtractionFn = (CascadeExtractionFn) objectMapper.readValue(json, ExtractionFn.class);
|
||||
RegexDimExtractionFn regexDimExtractionFn =
|
||||
(RegexDimExtractionFn) objectMapper.readValue(regexDimExtractionFnJson, ExtractionFn.class);
|
||||
JavaScriptExtractionFn javascriptExtractionFn =
|
||||
(JavaScriptExtractionFn) objectMapper.readValue(javascriptExtractionFnJson, ExtractionFn.class);
|
||||
SubstringDimExtractionFn substringDimExtractionFn =
|
||||
(SubstringDimExtractionFn) objectMapper.readValue(substringDimExtractionFnJson, ExtractionFn.class);
|
||||
|
||||
Assert.assertEquals(regexDimExtractionFn, cascadeExtractionFn.getExtractionFns()[0]);
|
||||
Assert.assertEquals(javascriptExtractionFn, cascadeExtractionFn.getExtractionFns()[1]);
|
||||
Assert.assertEquals(substringDimExtractionFn, cascadeExtractionFn.getExtractionFns()[2]);
|
||||
|
||||
Assert.assertEquals(
|
||||
cascadeExtractionFn,
|
||||
objectMapper.readValue(
|
||||
objectMapper.writeValueAsBytes(cascadeExtractionFn),
|
||||
ExtractionFn.class
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue