Merge pull request #2285 from fjy/stringformat

fixed #1873, add ability to express CONCAT as an extractionFn
This commit is contained in:
Fangjin Yang 2016-01-19 11:23:12 -08:00
commit 1b359d6a47
15 changed files with 289 additions and 64 deletions

View File

@ -341,6 +341,16 @@ Example for chaining [regular expression extraction function](#regular-expressio
It will transform dimension values with specified extraction functions in the order named. It will transform dimension values with specified extraction functions in the order named.
For example, `'/druid/prod/historical'` is transformed to `'the dru'` as regular expression extraction function first transforms it to `'druid'` and then, javascript extraction function transforms it to `'the druid'`, and lastly, substring extraction function transforms it to `'the dru'`. For example, `'/druid/prod/historical'` is transformed to `'the dru'` as regular expression extraction function first transforms it to `'druid'` and then, javascript extraction function transforms it to `'the druid'`, and lastly, substring extraction function transforms it to `'the dru'`.
### String Format Extraction Function
Returns the dimension value formatted according to the given format string.
```json
{ "type" : "stringFormat", "format" : <sprintf_expression> }
```
For example, if you want to concatenate "[" before and "]" after the actual dimension value, specify `"[%s]"` as the format string.
### Filtering DimensionSpecs ### Filtering DimensionSpecs
These are only valid for multi-valued dimensions. If you have a row in druid that has a multi-valued dimension with values ["v1", "v2", "v3"] and you send a groupBy/topN query grouping by that dimension with [query filter](filter.html) for value "v1". In the response you will get 3 rows containing "v1", "v2" and "v3". This behavior might be unintuitive for some use cases. These are only valid for multi-valued dimensions. If you have a row in druid that has a multi-valued dimension with values ["v1", "v2", "v3"] and you send a groupBy/topN query grouping by that dimension with [query filter](filter.html) for value "v1". In the response you will get 3 rows containing "v1", "v2" and "v3". This behavior might be unintuitive for some use cases.

View File

@ -29,38 +29,44 @@ import java.util.Arrays;
public class CascadeExtractionFn implements ExtractionFn public class CascadeExtractionFn implements ExtractionFn
{ {
private static final byte CACHE_TYPE_ID = 0x9;
private final ExtractionFn extractionFns[]; private final ExtractionFn extractionFns[];
private final ChainedExtractionFn chainedExtractionFn; private final ChainedExtractionFn chainedExtractionFn;
private final ChainedExtractionFn DEFAULT_CHAINED_EXTRACTION_FN = new ChainedExtractionFn( private final ChainedExtractionFn DEFAULT_CHAINED_EXTRACTION_FN = new ChainedExtractionFn(
new ExtractionFn() { new ExtractionFn()
public byte[] getCacheKey() { {
public byte[] getCacheKey()
{
return new byte[0]; return new byte[0];
} }
public String apply(Object value) { public String apply(Object value)
{
return null; return null;
} }
public String apply(String value) { public String apply(String value)
{
return null; return null;
} }
public String apply(long value) { public String apply(long value)
{
return null; return null;
} }
public boolean preservesOrdering() { public boolean preservesOrdering()
{
return false; return false;
} }
public ExtractionType getExtractionType() { public ExtractionType getExtractionType()
{
return ExtractionType.MANY_TO_ONE; return ExtractionType.MANY_TO_ONE;
} }
@Override @Override
public String toString() { public String toString()
{
return "nullExtractionFn{}"; return "nullExtractionFn{}";
} }
}, },
@ -78,48 +84,55 @@ public class CascadeExtractionFn implements ExtractionFn
this.chainedExtractionFn = DEFAULT_CHAINED_EXTRACTION_FN; this.chainedExtractionFn = DEFAULT_CHAINED_EXTRACTION_FN;
} else { } else {
ChainedExtractionFn root = null; ChainedExtractionFn root = null;
for (int idx = 0; idx < extractionFns.length; idx++) { for (ExtractionFn fn : extractionFn) {
Preconditions.checkArgument(extractionFns[idx] != null, "empty function is not allowed"); Preconditions.checkArgument(fn != null, "empty function is not allowed");
root = new ChainedExtractionFn(extractionFns[idx], root); root = new ChainedExtractionFn(fn, root);
} }
this.chainedExtractionFn = root; this.chainedExtractionFn = root;
} }
} }
@JsonProperty @JsonProperty
public ExtractionFn[] getExtractionFns() { public ExtractionFn[] getExtractionFns()
{
return extractionFns; return extractionFns;
} }
@Override @Override
public byte[] getCacheKey() { public byte[] getCacheKey()
byte[] cacheKey = new byte[] {CACHE_TYPE_ID}; {
byte[] cacheKey = new byte[]{ExtractionCacheHelper.CACHE_TYPE_ID_CASCADE};
return Bytes.concat(cacheKey, chainedExtractionFn.getCacheKey()); return Bytes.concat(cacheKey, chainedExtractionFn.getCacheKey());
} }
@Override @Override
public String apply(Object value) { public String apply(Object value)
{
return chainedExtractionFn.apply(value); return chainedExtractionFn.apply(value);
} }
@Override @Override
public String apply(String value){ public String apply(String value)
{
return chainedExtractionFn.apply(value); return chainedExtractionFn.apply(value);
} }
@Override @Override
public String apply(long value){ public String apply(long value)
{
return chainedExtractionFn.apply(value); return chainedExtractionFn.apply(value);
} }
@Override @Override
public boolean preservesOrdering(){ public boolean preservesOrdering()
{
return chainedExtractionFn.preservesOrdering(); return chainedExtractionFn.preservesOrdering();
} }
@Override @Override
public ExtractionType getExtractionType(){ public ExtractionType getExtractionType()
{
return chainedExtractionFn.getExtractionType(); return chainedExtractionFn.getExtractionType();
} }
@ -152,44 +165,53 @@ public class CascadeExtractionFn implements ExtractionFn
} }
@Override @Override
public String toString() { public String toString()
{
return "CascadeExtractionFn{" + return "CascadeExtractionFn{" +
"extractionFns=[" + chainedExtractionFn.toString() + "]}"; "extractionFns=[" + chainedExtractionFn.toString() + "]}";
} }
private class ChainedExtractionFn { private class ChainedExtractionFn
{
private final ExtractionFn fn; private final ExtractionFn fn;
private final ChainedExtractionFn child; private final ChainedExtractionFn child;
public ChainedExtractionFn(ExtractionFn fn, ChainedExtractionFn child) { public ChainedExtractionFn(ExtractionFn fn, ChainedExtractionFn child)
{
this.fn = fn; this.fn = fn;
this.child = child; this.child = child;
} }
public byte[] getCacheKey() { public byte[] getCacheKey()
{
byte[] fnCacheKey = fn.getCacheKey(); byte[] fnCacheKey = fn.getCacheKey();
return (child != null) ? Bytes.concat(fnCacheKey, child.getCacheKey()) : fnCacheKey; return (child != null) ? Bytes.concat(fnCacheKey, child.getCacheKey()) : fnCacheKey;
} }
public String apply(Object value) { public String apply(Object value)
{
return fn.apply((child != null) ? child.apply(value) : value); return fn.apply((child != null) ? child.apply(value) : value);
} }
public String apply(String value){ public String apply(String value)
{
return fn.apply((child != null) ? child.apply(value) : value); return fn.apply((child != null) ? child.apply(value) : value);
} }
public String apply(long value){ public String apply(long value)
{
return fn.apply((child != null) ? child.apply(value) : value); return fn.apply((child != null) ? child.apply(value) : value);
} }
public boolean preservesOrdering(){ public boolean preservesOrdering()
{
boolean childPreservesOrdering = (child == null) || child.preservesOrdering(); boolean childPreservesOrdering = (child == null) || child.preservesOrdering();
return fn.preservesOrdering() && childPreservesOrdering; return fn.preservesOrdering() && childPreservesOrdering;
} }
public ExtractionType getExtractionType(){ public ExtractionType getExtractionType()
{
if (child != null && child.getExtractionType() == ExtractionType.MANY_TO_ONE) { if (child != null && child.getExtractionType() == ExtractionType.MANY_TO_ONE) {
return ExtractionType.MANY_TO_ONE; return ExtractionType.MANY_TO_ONE;
} else { } else {
@ -227,7 +249,8 @@ public class CascadeExtractionFn implements ExtractionFn
return result; return result;
} }
public String toString() { public String toString()
{
return (child != null) return (child != null)
? Joiner.on(",").join(child.toString(), fn.toString()) ? Joiner.on(",").join(child.toString(), fn.toString())
: fn.toString(); : fn.toString();

View File

@ -24,7 +24,7 @@ public abstract class DimExtractionFn implements ExtractionFn
@Override @Override
public String apply(Object value) public String apply(Object value)
{ {
return apply(value.toString()); return apply(value == null ? null : value.toString());
} }
@Override @Override

View File

@ -0,0 +1,38 @@
/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.druid.query.extraction;
/**
 * Holder for the one-byte type identifiers that extraction functions write as the first byte
 * of their cache keys.
 *
 * Keeping every identifier in one place makes it easy to see which values are already taken
 * when a new extraction function is added. This class only carries constants, so it is final
 * and cannot be instantiated.
 */
public final class ExtractionCacheHelper
{
  public static final byte CACHE_TYPE_ID_TIME_DIM = 0x0;
  public static final byte CACHE_TYPE_ID_REGEX = 0x1;
  public static final byte CACHE_TYPE_ID_MATCHING_DIM = 0x2;
  public static final byte CACHE_TYPE_ID_SEARCH_QUERY = 0x3;
  public static final byte CACHE_TYPE_ID_JAVASCRIPT = 0x4;
  public static final byte CACHE_TYPE_ID_TIME_FORMAT = 0x5;
  public static final byte CACHE_TYPE_ID_IDENTITY = 0x6;
  public static final byte CACHE_TYPE_ID_LOOKUP = 0x7;
  public static final byte CACHE_TYPE_ID_SUBSTRING = 0x8;
  public static final byte CACHE_TYPE_ID_CASCADE = 0x9;
  public static final byte CACHE_TYPE_ID_STRING_FORMAT = 0xA;

  private ExtractionCacheHelper()
  {
    // Constants holder; never instantiated.
  }
}

View File

@ -35,7 +35,8 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo;
@JsonSubTypes.Type(name = "identity", value = IdentityExtractionFn.class), @JsonSubTypes.Type(name = "identity", value = IdentityExtractionFn.class),
@JsonSubTypes.Type(name = "lookup", value = LookupExtractionFn.class), @JsonSubTypes.Type(name = "lookup", value = LookupExtractionFn.class),
@JsonSubTypes.Type(name = "substring", value = SubstringDimExtractionFn.class), @JsonSubTypes.Type(name = "substring", value = SubstringDimExtractionFn.class),
@JsonSubTypes.Type(name = "cascade", value = CascadeExtractionFn.class) @JsonSubTypes.Type(name = "cascade", value = CascadeExtractionFn.class),
@JsonSubTypes.Type(name = "stringFormat", value = StringFormatExtractionFn.class)
}) })
/** /**
* An ExtractionFn is a function that can be used to transform the values of a column (typically a dimension) * An ExtractionFn is a function that can be used to transform the values of a column (typically a dimension)
@ -56,7 +57,7 @@ public interface ExtractionFn
/** /**
* The "extraction" function. This should map a value into some other String value. * The "extraction" function. This should map a value into some other String value.
* * <p>
* In order to maintain the "null and empty string are equivalent" semantics that Druid provides, the * In order to maintain the "null and empty string are equivalent" semantics that Druid provides, the
* empty string is considered invalid output for this method and should instead return null. This is * empty string is considered invalid output for this method and should instead return null. This is
* a contract on the method rather than enforced at a lower level in order to eliminate a global check * a contract on the method rather than enforced at a lower level in order to eliminate a global check
@ -74,7 +75,7 @@ public interface ExtractionFn
/** /**
* Offers information on whether the extraction will preserve the original ordering of the values. * Offers information on whether the extraction will preserve the original ordering of the values.
* <p/> * <p>
* Some optimizations of queries is possible if ordering is preserved. Null values *do* count towards * Some optimizations of queries is possible if ordering is preserved. Null values *do* count towards
* ordering. * ordering.
* *

View File

@ -23,8 +23,6 @@ import com.google.common.base.Strings;
public class IdentityExtractionFn implements ExtractionFn public class IdentityExtractionFn implements ExtractionFn
{ {
private static final byte CACHE_TYPE_ID = 0x6;
private static final IdentityExtractionFn instance = new IdentityExtractionFn(); private static final IdentityExtractionFn instance = new IdentityExtractionFn();
private IdentityExtractionFn() private IdentityExtractionFn()
@ -35,7 +33,7 @@ public class IdentityExtractionFn implements ExtractionFn
@Override @Override
public byte[] getCacheKey() public byte[] getCacheKey()
{ {
return new byte[]{CACHE_TYPE_ID}; return new byte[]{ExtractionCacheHelper.CACHE_TYPE_ID_IDENTITY};
} }
@Override @Override

View File

@ -61,8 +61,6 @@ public class JavaScriptExtractionFn implements ExtractionFn
}; };
} }
private static final byte CACHE_TYPE_ID = 0x4;
private final String function; private final String function;
private final Function<Object, String> fn; private final Function<Object, String> fn;
private final boolean injective; private final boolean injective;
@ -97,7 +95,7 @@ public class JavaScriptExtractionFn implements ExtractionFn
{ {
byte[] bytes = StringUtils.toUtf8(function); byte[] bytes = StringUtils.toUtf8(function);
return ByteBuffer.allocate(1 + bytes.length) return ByteBuffer.allocate(1 + bytes.length)
.put(CACHE_TYPE_ID) .put(ExtractionCacheHelper.CACHE_TYPE_ID_JAVASCRIPT)
.put(bytes) .put(bytes)
.array(); .array();
} }

View File

@ -33,8 +33,6 @@ import java.io.IOException;
public class LookupExtractionFn extends FunctionalExtraction public class LookupExtractionFn extends FunctionalExtraction
{ {
private static final byte CACHE_TYPE_ID = 0x7;
private final LookupExtractor lookup; private final LookupExtractor lookup;
private final boolean optimize; private final boolean optimize;
@ -98,7 +96,7 @@ public class LookupExtractionFn extends FunctionalExtraction
{ {
try { try {
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
outputStream.write(CACHE_TYPE_ID); outputStream.write(ExtractionCacheHelper.CACHE_TYPE_ID_LOOKUP);
outputStream.write(lookup.getCacheKey()); outputStream.write(lookup.getCacheKey());
if (getReplaceMissingValueWith() != null) { if (getReplaceMissingValueWith() != null) {
outputStream.write(StringUtils.toUtf8(getReplaceMissingValueWith())); outputStream.write(StringUtils.toUtf8(getReplaceMissingValueWith()));

View File

@ -32,8 +32,6 @@ import java.util.regex.Pattern;
*/ */
public class MatchingDimExtractionFn extends DimExtractionFn public class MatchingDimExtractionFn extends DimExtractionFn
{ {
private static final byte CACHE_TYPE_ID = 0x2;
private final String expr; private final String expr;
private final Pattern pattern; private final Pattern pattern;
@ -53,7 +51,7 @@ public class MatchingDimExtractionFn extends DimExtractionFn
{ {
byte[] exprBytes = StringUtils.toUtf8(expr); byte[] exprBytes = StringUtils.toUtf8(expr);
return ByteBuffer.allocate(1 + exprBytes.length) return ByteBuffer.allocate(1 + exprBytes.length)
.put(CACHE_TYPE_ID) .put(ExtractionCacheHelper.CACHE_TYPE_ID_MATCHING_DIM)
.put(exprBytes) .put(exprBytes)
.array(); .array();
} }

View File

@ -33,7 +33,6 @@ import java.util.regex.Pattern;
*/ */
public class RegexDimExtractionFn extends DimExtractionFn public class RegexDimExtractionFn extends DimExtractionFn
{ {
private static final byte CACHE_TYPE_ID = 0x1;
private static final byte CACHE_KEY_SEPARATOR = (byte) 0xFF; private static final byte CACHE_KEY_SEPARATOR = (byte) 0xFF;
private final String expr; private final String expr;
@ -75,7 +74,7 @@ public class RegexDimExtractionFn extends DimExtractionFn
totalLen += 2; // separators totalLen += 2; // separators
return ByteBuffer.allocate(totalLen) return ByteBuffer.allocate(totalLen)
.put(CACHE_TYPE_ID) .put(ExtractionCacheHelper.CACHE_TYPE_ID_REGEX)
.put(exprBytes) .put(exprBytes)
.put(CACHE_KEY_SEPARATOR) .put(CACHE_KEY_SEPARATOR)
.put(replaceStrBytes) .put(replaceStrBytes)

View File

@ -30,8 +30,6 @@ import java.nio.ByteBuffer;
*/ */
public class SearchQuerySpecDimExtractionFn extends DimExtractionFn public class SearchQuerySpecDimExtractionFn extends DimExtractionFn
{ {
private static final byte CACHE_TYPE_ID = 0x3;
private final SearchQuerySpec searchQuerySpec; private final SearchQuerySpec searchQuerySpec;
@JsonCreator @JsonCreator
@ -55,7 +53,7 @@ public class SearchQuerySpecDimExtractionFn extends DimExtractionFn
{ {
byte[] specBytes = searchQuerySpec.getCacheKey(); byte[] specBytes = searchQuerySpec.getCacheKey();
return ByteBuffer.allocate(1 + specBytes.length) return ByteBuffer.allocate(1 + specBytes.length)
.put(CACHE_TYPE_ID) .put(ExtractionCacheHelper.CACHE_TYPE_ID_SEARCH_QUERY)
.put(specBytes) .put(specBytes)
.array(); .array();
} }

View File

@ -0,0 +1,101 @@
/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.druid.query.extraction;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.metamx.common.StringUtils;
import java.nio.ByteBuffer;
/**
 * Extraction function that renders each dimension value through a
 * {@link String#format(String, Object...)} pattern. For example, the format {@code "[%s]"}
 * wraps the value in square brackets.
 *
 * The dimension value is passed as the single format argument. A null value is handed to
 * {@code String.format} unchanged, so {@code %s} renders it as the text {@code "null"}.
 */
public class StringFormatExtractionFn extends DimExtractionFn
{
  private final String format;

  /**
   * @param format the format pattern; must be neither null nor empty
   *
   * @throws IllegalArgumentException if {@code format} is null or empty
   */
  @JsonCreator
  public StringFormatExtractionFn(
      @JsonProperty("format") String format
  )
  {
    Preconditions.checkArgument(!Strings.isNullOrEmpty(format), "format string should not be empty");
    this.format = format;
  }

  /**
   * @return the format pattern this function applies
   */
  @JsonProperty
  public String getFormat()
  {
    return format;
  }

  /**
   * @return the string-format type id byte followed by the UTF-8 bytes of the format pattern
   */
  @Override
  public byte[] getCacheKey()
  {
    byte[] bytes = StringUtils.toUtf8(format);
    return ByteBuffer.allocate(1 + bytes.length)
                     .put(ExtractionCacheHelper.CACHE_TYPE_ID_STRING_FORMAT)
                     .put(bytes)
                     .array();
  }

  /**
   * Formats {@code value} with the configured pattern.
   *
   * @param value the dimension value, possibly null
   *
   * @return the formatted string, or null when formatting produces an empty string
   */
  @Override
  public String apply(String value)
  {
    // The ExtractionFn contract treats the empty string as invalid output and asks for null
    // instead; a pattern such as "%s" applied to an empty value would otherwise return "".
    return Strings.emptyToNull(String.format(format, value));
  }

  @Override
  public boolean preservesOrdering()
  {
    return false;
  }

  @Override
  public ExtractionType getExtractionType()
  {
    return ExtractionType.MANY_TO_ONE;
  }

  @Override
  public boolean equals(Object o)
  {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }

    StringFormatExtractionFn that = (StringFormatExtractionFn) o;

    return format.equals(that.format);
  }

  @Override
  public int hashCode()
  {
    return format.hashCode();
  }

  @Override
  public String toString()
  {
    return "StringFormatExtractionFn{" +
           "format='" + format + '\'' +
           '}';
  }
}

View File

@ -33,8 +33,6 @@ import java.util.Date;
*/ */
public class TimeDimExtractionFn extends DimExtractionFn public class TimeDimExtractionFn extends DimExtractionFn
{ {
private static final byte CACHE_TYPE_ID = 0x0;
private final String timeFormat; private final String timeFormat;
private final SimpleDateFormat timeFormatter; private final SimpleDateFormat timeFormatter;
private final String resultFormat; private final String resultFormat;
@ -62,7 +60,7 @@ public class TimeDimExtractionFn extends DimExtractionFn
{ {
byte[] timeFormatBytes = StringUtils.toUtf8(timeFormat); byte[] timeFormatBytes = StringUtils.toUtf8(timeFormat);
return ByteBuffer.allocate(1 + timeFormatBytes.length) return ByteBuffer.allocate(1 + timeFormatBytes.length)
.put(CACHE_TYPE_ID) .put(ExtractionCacheHelper.CACHE_TYPE_ID_TIME_DIM)
.put(timeFormatBytes) .put(timeFormatBytes)
.array(); .array();
} }

View File

@ -32,8 +32,6 @@ import java.util.Locale;
public class TimeFormatExtractionFn implements ExtractionFn public class TimeFormatExtractionFn implements ExtractionFn
{ {
private static final byte CACHE_TYPE_ID = 0x5;
private final DateTimeZone tz; private final DateTimeZone tz;
private final String pattern; private final String pattern;
private final Locale locale; private final Locale locale;
@ -82,7 +80,7 @@ public class TimeFormatExtractionFn implements ExtractionFn
{ {
byte[] exprBytes = StringUtils.toUtf8(pattern + "\u0001" + tz.getID() + "\u0001" + locale.toLanguageTag()); byte[] exprBytes = StringUtils.toUtf8(pattern + "\u0001" + tz.getID() + "\u0001" + locale.toLanguageTag());
return ByteBuffer.allocate(1 + exprBytes.length) return ByteBuffer.allocate(1 + exprBytes.length)
.put(CACHE_TYPE_ID) .put(ExtractionCacheHelper.CACHE_TYPE_ID_TIME_FORMAT)
.put(exprBytes) .put(exprBytes)
.array(); .array();
} }

View File

@ -0,0 +1,67 @@
/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.druid.query.extraction;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.druid.jackson.DefaultObjectMapper;
import org.junit.Assert;
import org.junit.Test;
/**
 * Tests for {@link StringFormatExtractionFn}: formatting a long value, formatting a null
 * string, and a JSON deserialize / serialize round trip.
 */
public class StringFormatExtractionFnTest
{
  /** A long input is rendered through the format pattern. */
  @Test
  public void testApply() throws Exception
  {
    final StringFormatExtractionFn bracketFn = new StringFormatExtractionFn("[%s]");
    final long input = 1000L;
    final String formatted = bracketFn.apply(input);

    Assert.assertEquals("[1000]", formatted);
  }

  /** A null String input is rendered as the text "null" inside the brackets. */
  @Test
  public void testApplyNull() throws Exception
  {
    final StringFormatExtractionFn bracketFn = new StringFormatExtractionFn("[%s]");
    final String input = null;
    final String formatted = bracketFn.apply(input);

    Assert.assertEquals("[null]", formatted);
  }

  /** The function is deserialized by its "stringFormat" type name and survives a round trip. */
  @Test
  public void testSerde() throws Exception
  {
    final ObjectMapper mapper = new DefaultObjectMapper();
    final String json = "{ \"type\" : \"stringFormat\", \"format\" : \"[%s]\" }";

    final ExtractionFn parsed = mapper.readValue(json, ExtractionFn.class);
    final StringFormatExtractionFn formatFn = (StringFormatExtractionFn) parsed;
    Assert.assertEquals("[%s]", formatFn.getFormat());

    // round trip
    final byte[] serialized = mapper.writeValueAsBytes(formatFn);
    final ExtractionFn roundTripped = mapper.readValue(serialized, ExtractionFn.class);
    Assert.assertEquals(formatFn, roundTripped);
  }
}