From 238dd3be3c1973ed27eeb10adcebefa17605429f Mon Sep 17 00:00:00 2001 From: Keuntae Park Date: Wed, 6 Jan 2016 17:15:08 +0900 Subject: [PATCH] support cascade execution of extraction filters in extraction dimension spec --- docs/content/querying/dimensionspecs.md | 35 ++- docs/content/querying/lookups.md | 2 +- .../query/extraction/CascadeExtractionFn.java | 236 ++++++++++++++++++ .../druid/query/extraction/ExtractionFn.java | 3 +- .../extraction/CascadeExtractionFnTest.java | 189 ++++++++++++++ 5 files changed, 462 insertions(+), 3 deletions(-) create mode 100644 processing/src/main/java/io/druid/query/extraction/CascadeExtractionFn.java create mode 100644 processing/src/test/java/io/druid/query/extraction/CascadeExtractionFnTest.java diff --git a/docs/content/querying/dimensionspecs.md b/docs/content/querying/dimensionspecs.md index b2cbd6c1aba..8879290db43 100644 --- a/docs/content/querying/dimensionspecs.md +++ b/docs/content/querying/dimensionspecs.md @@ -308,6 +308,39 @@ This allows distinguishing between a null dimension and a lookup resulting in a For example, specifying `{"":"bar","bat":"baz"}` with dimension values `[null, "foo", "bat"]` and replacing missing values with `"oof"` will yield results of `["bar", "oof", "baz"]`. Omitting the empty string key will cause the missing value to take over. For example, specifying `{"bat":"baz"}` with dimension values `[null, "foo", "bat"]` and replacing missing values with `"oof"` will yield results of `["oof", "oof", "baz"]`. +### Cascade Extraction Function + +Provides chained execution of extraction functions. + +The `extractionFns` property contains an array of extraction functions, which are executed in array index order. + +An example of chaining a [regular expression extraction function](#regular-expression-extraction-function), a [javascript extraction function](#javascript-extraction-function), and a [substring extraction function](#substring-extraction-function) is as follows.
+ +```json +{ + "type" : "cascade", + "extractionFns": [ + { + "type" : "regex", + "expr" : "/([^/]+)/", + "replaceMissingValues": false, + "replaceMissingValuesWith": null + }, + { + "type" : "javascript", + "function" : "function(str) { return \"the \".concat(str) }" + }, + { + "type" : "substring", + "index" : 0, "length" : 7 + } + ] +} +``` + +It transforms dimension values by applying the specified extraction functions in the order listed. +For example, `'/druid/prod/historical'` is transformed to `'the dru'`: the regular expression extraction function first transforms it to `'druid'`, then the javascript extraction function transforms it to `'the druid'`, and lastly the substring extraction function transforms it to `'the dru'`. + ### Filtering DimensionSpecs These are only valid for multi-valued dimensions. If you have a row in druid that has a multi-valued dimension with values ["v1", "v2", "v3"] and you send a groupBy/topN query grouping by that dimension with [query filter](filter.html) for value "v1". In the response you will get 3 rows containing "v1", "v2" and "v3". This behavior might be unintuitive for some use cases. @@ -317,7 +350,7 @@ Then groupBy/topN processing pipeline "explodes" all multi-valued dimensions res In addition to "query filter" which efficiently selects the rows to be processed, you can use the filtering dimension spec to filter for specific values within the values of a multi-valued dimension. These dimensionSpecs take a delegate DimensionSpec and a filtering criteria. From the "exploded" rows, only rows matching the given filtering criteria are returned in the query result. -The following filtered dimension spec acts as a whiltelist or blacklist for values as per the "isWhitelist" attribute value. +The following filtered dimension spec acts as a whitelist or blacklist for values as per the "isWhitelist" attribute value.
```json { "type" : "listFiltered", "delegate" : <dimensionSpec>, "values": <array of strings>, "isWhitelist": <optional attribute for true/false, default is true> } ``` diff --git a/docs/content/querying/lookups.md b/docs/content/querying/lookups.md index 6e378698621..812ef3757e0 100644 --- a/docs/content/querying/lookups.md +++ b/docs/content/querying/lookups.md @@ -207,7 +207,7 @@ The `simpleJson` lookupParseSpec does not take any parameters. It is simply a li ```json "namespaceParseSpec":{ - "type": "simpleJson" + "format": "simpleJson" } ``` diff --git a/processing/src/main/java/io/druid/query/extraction/CascadeExtractionFn.java b/processing/src/main/java/io/druid/query/extraction/CascadeExtractionFn.java new file mode 100644 index 00000000000..a7dc5d13f6e --- /dev/null +++ b/processing/src/main/java/io/druid/query/extraction/CascadeExtractionFn.java @@ -0,0 +1,236 @@ +/* + * Licensed to Metamarkets Group Inc. (Metamarkets) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Metamarkets licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */
+
+package io.druid.query.extraction;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import com.google.common.primitives.Bytes;
+
+import java.util.Arrays;
+
+/**
+ * An {@link ExtractionFn} that applies a sequence of extraction functions in array index order,
+ * feeding the output of each function into the next one.
+ *
+ * An empty array is accepted; the resulting cascade maps every input to {@code null}.
+ */
+public class CascadeExtractionFn implements ExtractionFn
+{
+  private static final byte CACHE_TYPE_ID = 0x9;
+
+  /**
+   * Chain used when no extraction functions are supplied. It is a single shared instance so that two
+   * empty cascades compare equal and hash alike (the wrapped anonymous function only has identity
+   * equality).
+   */
+  private static final ChainedExtractionFn DEFAULT_CHAINED_EXTRACTION_FN = new ChainedExtractionFn(
+      new ExtractionFn()
+      {
+        @Override
+        public byte[] getCacheKey()
+        {
+          return new byte[0];
+        }
+
+        @Override
+        public String apply(Object value)
+        {
+          return null;
+        }
+
+        @Override
+        public String apply(String value)
+        {
+          return null;
+        }
+
+        @Override
+        public String apply(long value)
+        {
+          return null;
+        }
+
+        @Override
+        public boolean preservesOrdering()
+        {
+          return false;
+        }
+
+        @Override
+        public ExtractionType getExtractionType()
+        {
+          return ExtractionType.MANY_TO_ONE;
+        }
+
+        @Override
+        public String toString()
+        {
+          return "nullExtractionFn{}";
+        }
+      },
+      null
+  );
+
+  private final ExtractionFn[] extractionFns;
+  private final ChainedExtractionFn chainedExtractionFn;
+
+  /**
+   * @param extractionFn functions to apply, first element first; must not be null and must not
+   *                     contain null elements
+   *
+   * @throws IllegalArgumentException if the array or any of its elements is null
+   */
+  @JsonCreator
+  public CascadeExtractionFn(
+      @JsonProperty("extractionFns") ExtractionFn[] extractionFn
+  )
+  {
+    Preconditions.checkArgument(extractionFn != null, "extractionFns should not be null");
+    this.extractionFns = extractionFn;
+    if (extractionFns.length == 0) {
+      this.chainedExtractionFn = DEFAULT_CHAINED_EXTRACTION_FN;
+    } else {
+      // Each new link wraps the chain built so far, so the last array element ends up outermost
+      // and is therefore applied last.
+      ChainedExtractionFn root = null;
+      for (ExtractionFn fn : extractionFns) {
+        Preconditions.checkArgument(fn != null, "empty function is not allowed");
+        root = new ChainedExtractionFn(fn, root);
+      }
+      this.chainedExtractionFn = root;
+    }
+  }
+
+  @JsonProperty
+  public ExtractionFn[] getExtractionFns()
+  {
+    return extractionFns;
+  }
+
+  /**
+   * Returns the cascade type id followed by the concatenated cache keys of the chained functions.
+   */
+  @Override
+  public byte[] getCacheKey()
+  {
+    byte[] cacheKey = new byte[]{CACHE_TYPE_ID};
+
+    return Bytes.concat(cacheKey, chainedExtractionFn.getCacheKey());
+  }
+
+  @Override
+  public String apply(Object value)
+  {
+    return chainedExtractionFn.apply(value);
+  }
+
+  @Override
+  public String apply(String value)
+  {
+    return chainedExtractionFn.apply(value);
+  }
+
+  @Override
+  public String apply(long value)
+  {
+    return chainedExtractionFn.apply(value);
+  }
+
+  /**
+   * Ordering is preserved only if every function in the chain preserves it.
+   */
+  @Override
+  public boolean preservesOrdering()
+  {
+    return chainedExtractionFn.preservesOrdering();
+  }
+
+  @Override
+  public ExtractionType getExtractionType()
+  {
+    return chainedExtractionFn.getExtractionType();
+  }
+
+  @Override
+  public boolean equals(Object o)
+  {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+
+    CascadeExtractionFn that = (CascadeExtractionFn) o;
+
+    if (!Arrays.equals(extractionFns, that.extractionFns)) {
+      return false;
+    }
+    if (!chainedExtractionFn.equals(that.chainedExtractionFn)) {
+      return false;
+    }
+
+    return true;
+  }
+
+  @Override
+  public int hashCode()
+  {
+    return chainedExtractionFn.hashCode();
+  }
+
+  @Override
+  public String toString()
+  {
+    return "CascadeExtractionFn{" +
+           "extractionFns=[" + chainedExtractionFn.toString() + "]}";
+  }
+
+  /**
+   * One link of the cascade: applies {@code child} (if any) first and then {@code fn} to its result.
+   * Declared static because it needs no state from the enclosing instance.
+   */
+  private static class ChainedExtractionFn
+  {
+    private final ExtractionFn fn;
+    private final ChainedExtractionFn child;
+
+    public ChainedExtractionFn(ExtractionFn fn, ChainedExtractionFn child)
+    {
+      this.fn = fn;
+      this.child = child;
+    }
+
+    public byte[] getCacheKey()
+    {
+      byte[] fnCacheKey = fn.getCacheKey();
+
+      return (child != null) ? Bytes.concat(fnCacheKey, child.getCacheKey()) : fnCacheKey;
+    }
+
+    // The apply overloads call fn separately in each branch. Passing a single mixed-type
+    // conditional expression as the argument would always bind to fn.apply(Object), so the
+    // innermost function would never see its apply(long) overload.
+
+    public String apply(Object value)
+    {
+      return (child != null) ? fn.apply(child.apply(value)) : fn.apply(value);
+    }
+
+    public String apply(String value)
+    {
+      return (child != null) ? fn.apply(child.apply(value)) : fn.apply(value);
+    }
+
+    public String apply(long value)
+    {
+      return (child != null) ? fn.apply(child.apply(value)) : fn.apply(value);
+    }
+
+    public boolean preservesOrdering()
+    {
+      boolean childPreservesOrdering = (child == null) || child.preservesOrdering();
+      return fn.preservesOrdering() && childPreservesOrdering;
+    }
+
+    public ExtractionType getExtractionType()
+    {
+      // Once any earlier link is many-to-one, the whole chain is many-to-one.
+      if (child != null && child.getExtractionType() == ExtractionType.MANY_TO_ONE) {
+        return ExtractionType.MANY_TO_ONE;
+      } else {
+        return fn.getExtractionType();
+      }
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+      if (this == o) {
+        return true;
+      }
+      if (o == null || getClass() != o.getClass()) {
+        return false;
+      }
+
+      ChainedExtractionFn that = (ChainedExtractionFn) o;
+
+      if (!fn.equals(that.fn)) {
+        return false;
+      }
+      // A chain without a child only equals another chain without a child; this keeps equals
+      // symmetric for chains of different length.
+      return (child != null) ? child.equals(that.child) : that.child == null;
+    }
+
+    @Override
+    public int hashCode()
+    {
+      int result = fn.hashCode();
+      if (child != null) {
+        result = 31 * result + child.hashCode();
+      }
+      return result;
+    }
+
+    @Override
+    public String toString()
+    {
+      return (child != null)
+             ? Joiner.on(",").join(child.toString(), fn.toString())
+             : fn.toString();
+    }
+  }
+}
diff --git a/processing/src/main/java/io/druid/query/extraction/ExtractionFn.java b/processing/src/main/java/io/druid/query/extraction/ExtractionFn.java
index f32aefc252d..800bc8e4054 100644
--- a/processing/src/main/java/io/druid/query/extraction/ExtractionFn.java
+++ b/processing/src/main/java/io/druid/query/extraction/ExtractionFn.java
@@ -34,7 +34,8 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo;
     @JsonSubTypes.Type(name = "timeFormat", value = TimeFormatExtractionFn.class),
     @JsonSubTypes.Type(name = "identity", value = IdentityExtractionFn.class),
     @JsonSubTypes.Type(name = "lookup", value = LookupExtractionFn.class),
-    @JsonSubTypes.Type(name = "substring", value = SubstringDimExtractionFn.class)
+    @JsonSubTypes.Type(name = "substring", value = SubstringDimExtractionFn.class),
+    @JsonSubTypes.Type(name = "cascade", value = CascadeExtractionFn.class)
 })
 /**
  * An ExtractionFn is a function that can
be used to transform the values of a column (typically a dimension) diff --git a/processing/src/test/java/io/druid/query/extraction/CascadeExtractionFnTest.java b/processing/src/test/java/io/druid/query/extraction/CascadeExtractionFnTest.java new file mode 100644 index 00000000000..0799fd4b292 --- /dev/null +++ b/processing/src/test/java/io/druid/query/extraction/CascadeExtractionFnTest.java @@ -0,0 +1,189 @@ +/* + * Licensed to Metamarkets Group Inc. (Metamarkets) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Metamarkets licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package io.druid.query.extraction;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Sets;
+import io.druid.jackson.DefaultObjectMapper;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.Set;
+
+/**
+ * Tests for {@link CascadeExtractionFn}: chained application, cache key and hash code sensitivity to
+ * the function order, ordering/extraction-type propagation, and JSON round-tripping.
+ */
+public class CascadeExtractionFnTest
+{
+  private static final String[] paths = {
+      "/druid/prod/historical",
+      "/druid/prod/broker",
+      "/druid/prod/coordinator",
+      "/druid/demo/historical",
+      "/druid/demo/broker",
+      "/druid/demo/coordinator",
+      "/dash/aloe",
+      "/dash/baloo"
+  };
+
+  private final String regex = "/([^/]+)/";
+  private final String function = "function(str) { return \"the \".concat(str) }";
+  private final RegexDimExtractionFn regexDimExtractionFn = new RegexDimExtractionFn(regex, false, null);
+  private final JavaScriptExtractionFn javascriptExtractionFn = new JavaScriptExtractionFn(function, true);
+  private final SubstringDimExtractionFn substringDimExtractionFn = new SubstringDimExtractionFn(0, 7);
+  private final String regexDimExtractionFnJson = "{ \"type\" : \"regex\", \"expr\" : \"/([^/]+)/\" , " +
+      "\"replaceMissingValues\": false, \"replaceMissingValuesWith\": null}";
+  private final String javascriptExtractionFnJson =
+      "{ \"type\" : \"javascript\", \"function\" : \"function(str) { return \\\"the \\\".concat(str) }\" }";
+  private final String substringDimExtractionFnJson = "{ \"type\" : \"substring\", \"index\" : 0, \"length\" : 7 }";
+
+  /**
+   * Builds a cascade that applies the given functions in argument order.
+   */
+  private static CascadeExtractionFn cascadeOf(ExtractionFn... fns)
+  {
+    return new CascadeExtractionFn(fns);
+  }
+
+  @Test
+  public void testRegexAndJavascriptAndSubstring()
+  {
+    CascadeExtractionFn cascadeExtractionFn = cascadeOf(
+        regexDimExtractionFn,
+        javascriptExtractionFn,
+        substringDimExtractionFn
+    );
+
+    // Linked sets keep first-seen order, so the comparison also checks the order of distinct results.
+    Set<String> extracted = Sets.newLinkedHashSet();
+    for (String path : paths) {
+      extracted.add(cascadeExtractionFn.apply(path));
+    }
+
+    Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("the dru", "the das"));
+    Assert.assertEquals(expected, extracted);
+  }
+
+  @Test
+  public void testGetCacheKey()
+  {
+    CascadeExtractionFn cascadeExtractionFn1 = cascadeOf(javascriptExtractionFn, regexDimExtractionFn);
+    CascadeExtractionFn cascadeExtractionFn2 = cascadeOf(regexDimExtractionFn, javascriptExtractionFn);
+    CascadeExtractionFn cascadeExtractionFn3 = cascadeOf(
+        regexDimExtractionFn,
+        javascriptExtractionFn,
+        substringDimExtractionFn
+    );
+
+    // Different order or different length must yield different cache keys.
+    Assert.assertFalse(Arrays.equals(cascadeExtractionFn1.getCacheKey(), cascadeExtractionFn2.getCacheKey()));
+    Assert.assertFalse(Arrays.equals(cascadeExtractionFn1.getCacheKey(), cascadeExtractionFn3.getCacheKey()));
+    Assert.assertFalse(Arrays.equals(cascadeExtractionFn2.getCacheKey(), cascadeExtractionFn3.getCacheKey()));
+  }
+
+  @Test
+  public void testHashCode()
+  {
+    CascadeExtractionFn cascadeExtractionFn1 = cascadeOf(javascriptExtractionFn, regexDimExtractionFn);
+    CascadeExtractionFn cascadeExtractionFn2 = cascadeOf(regexDimExtractionFn, javascriptExtractionFn);
+    CascadeExtractionFn cascadeExtractionFn3 = cascadeOf(
+        regexDimExtractionFn,
+        javascriptExtractionFn,
+        substringDimExtractionFn
+    );
+
+    Assert.assertNotEquals(cascadeExtractionFn1.hashCode(), cascadeExtractionFn2.hashCode());
+    Assert.assertNotEquals(cascadeExtractionFn1.hashCode(), cascadeExtractionFn3.hashCode());
+    Assert.assertNotEquals(cascadeExtractionFn2.hashCode(), cascadeExtractionFn3.hashCode());
+  }
+
+  @Test
+  public void testPreservesOrdering()
+  {
+    CascadeExtractionFn cascadeExtractionFn1 = cascadeOf(substringDimExtractionFn);
+    CascadeExtractionFn cascadeExtractionFn2 = cascadeOf(regexDimExtractionFn, substringDimExtractionFn);
+    CascadeExtractionFn cascadeExtractionFn3 = cascadeOf(substringDimExtractionFn, javascriptExtractionFn);
+
+    Assert.assertTrue(cascadeExtractionFn1.preservesOrdering());
+    Assert.assertFalse(cascadeExtractionFn2.preservesOrdering());
+    Assert.assertFalse(cascadeExtractionFn3.preservesOrdering());
+  }
+
+  @Test
+  public void testGetExtractionType()
+  {
+    CascadeExtractionFn cascadeExtractionFn1 = cascadeOf(javascriptExtractionFn);
+    CascadeExtractionFn cascadeExtractionFn2 = cascadeOf(regexDimExtractionFn, javascriptExtractionFn);
+
+    Assert.assertEquals(ExtractionFn.ExtractionType.ONE_TO_ONE, cascadeExtractionFn1.getExtractionType());
+    Assert.assertEquals(ExtractionFn.ExtractionType.MANY_TO_ONE, cascadeExtractionFn2.getExtractionType());
+  }
+
+  @Test
+  public void testSerde() throws Exception
+  {
+    final ObjectMapper objectMapper = new DefaultObjectMapper();
+
+    final String json = "{\"type\" : \"cascade\", \"extractionFns\": ["+
+        regexDimExtractionFnJson + "," + javascriptExtractionFnJson + "," + substringDimExtractionFnJson + "]}";
+
+    CascadeExtractionFn cascadeExtractionFn = (CascadeExtractionFn) objectMapper.readValue(json, ExtractionFn.class);
+    RegexDimExtractionFn expectedRegexFn =
+        (RegexDimExtractionFn) objectMapper.readValue(regexDimExtractionFnJson, ExtractionFn.class);
+    JavaScriptExtractionFn expectedJavascriptFn =
+        (JavaScriptExtractionFn) objectMapper.readValue(javascriptExtractionFnJson, ExtractionFn.class);
+    SubstringDimExtractionFn expectedSubstringFn =
+        (SubstringDimExtractionFn) objectMapper.readValue(substringDimExtractionFnJson, ExtractionFn.class);
+
+    Assert.assertEquals(expectedRegexFn, cascadeExtractionFn.getExtractionFns()[0]);
+    Assert.assertEquals(expectedJavascriptFn, cascadeExtractionFn.getExtractionFns()[1]);
+    Assert.assertEquals(expectedSubstringFn, cascadeExtractionFn.getExtractionFns()[2]);
+
+    // Serializing and reading back must produce an equal cascade.
+    Assert.assertEquals(
+        cascadeExtractionFn,
+        objectMapper.readValue(
+            objectMapper.writeValueAsBytes(cascadeExtractionFn),
+            ExtractionFn.class
+        )
+    );
+  }
+}