diff --git a/docs/content/querying/dimensionspecs.md b/docs/content/querying/dimensionspecs.md index 8d16083ce3b..d4e5a335091 100644 --- a/docs/content/querying/dimensionspecs.md +++ b/docs/content/querying/dimensionspecs.md @@ -49,12 +49,23 @@ Returns the first matching group for the given regular expression. If there is no match, it returns the dimension value as is. ```json -{ "type" : "regex", "expr" : <regular_expression> } +{ + "type" : "regex", "expr" : <regular_expression>, + "replaceMissingValues" : true, + "replaceMissingValuesWith" : "foobar" +} ``` For example, using `"expr" : "(\\w\\w\\w).*"` will transform `'Monday'`, `'Tuesday'`, `'Wednesday'` into `'Mon'`, `'Tue'`, `'Wed'`. +If the `replaceMissingValues` property is true, the extraction function will transform dimension values that do not match the regex pattern to a user-specified String. The default value is `false`. + +The `replaceMissingValuesWith` property sets the String that unmatched dimension values will be replaced with, if `replaceMissingValues` is true. If `replaceMissingValuesWith` is not specified, unmatched dimension values will be replaced with null. + +For example, if `expr` is `"^(a\\w+)"` in the example JSON above, a regex that matches words starting with the letter `a`, the extraction function will convert a dimension value like `banana` to `foobar`. + + ### Partial Extraction Function Returns the dimension value unchanged if the regular expression matches, otherwise returns null. 
diff --git a/processing/src/main/java/io/druid/query/extraction/RegexDimExtractionFn.java b/processing/src/main/java/io/druid/query/extraction/RegexDimExtractionFn.java index e2ba035c70f..b86c84a473d 100644 --- a/processing/src/main/java/io/druid/query/extraction/RegexDimExtractionFn.java +++ b/processing/src/main/java/io/druid/query/extraction/RegexDimExtractionFn.java @@ -34,28 +34,53 @@ import java.util.regex.Pattern; public class RegexDimExtractionFn extends DimExtractionFn { private static final byte CACHE_TYPE_ID = 0x1; + private static final byte CACHE_KEY_SEPARATOR = (byte) 0xFF; private final String expr; private final Pattern pattern; + private final boolean replaceMissingValues; + private final String replaceMissingValuesWith; @JsonCreator public RegexDimExtractionFn( - @JsonProperty("expr") String expr + @JsonProperty("expr") String expr, + @JsonProperty("replaceMissingValues") Boolean replaceMissingValues, + @JsonProperty("replaceMissingValuesWith") String replaceMissingValuesWith ) { Preconditions.checkNotNull(expr, "expr must not be null"); this.expr = expr; this.pattern = Pattern.compile(expr); + this.replaceMissingValues = replaceMissingValues == null ? false : replaceMissingValues; + this.replaceMissingValuesWith = replaceMissingValuesWith; } @Override public byte[] getCacheKey() { byte[] exprBytes = StringUtils.toUtf8(expr); - return ByteBuffer.allocate(1 + exprBytes.length) + byte[] replaceBytes = replaceMissingValues ? 
new byte[]{1} : new byte[]{0}; + byte[] replaceStrBytes; + if (replaceMissingValuesWith == null) { + replaceStrBytes = new byte[]{}; + } else { + replaceStrBytes = StringUtils.toUtf8(replaceMissingValuesWith); + } + + int totalLen = 1 + + exprBytes.length + + replaceBytes.length + + replaceStrBytes.length; // fields + totalLen += 2; // separators + + return ByteBuffer.allocate(totalLen) .put(CACHE_TYPE_ID) .put(exprBytes) + .put(CACHE_KEY_SEPARATOR) + .put(replaceStrBytes) + .put(CACHE_KEY_SEPARATOR) + .put(replaceBytes) .array(); } @@ -65,8 +90,14 @@ public class RegexDimExtractionFn extends DimExtractionFn if (dimValue == null) { return null; } + String retVal; Matcher matcher = pattern.matcher(dimValue); - return Strings.emptyToNull(matcher.find() ? matcher.group(1) : dimValue); + if (matcher.find()) { + retVal = matcher.group(1); + } else { + retVal = replaceMissingValues ? replaceMissingValuesWith : dimValue; + } + return Strings.emptyToNull(retVal); } @JsonProperty("expr") @@ -75,6 +106,18 @@ public class RegexDimExtractionFn extends DimExtractionFn return expr; } + @JsonProperty("replaceMissingValues") + public boolean isReplaceMissingValues() + { + return replaceMissingValues; + } + + @JsonProperty("replaceMissingValuesWith") + public String getReplaceMissingValuesWith() + { + return replaceMissingValuesWith; + } + @Override public boolean preservesOrdering() { diff --git a/processing/src/test/java/io/druid/query/extraction/RegexDimExtractionFnTest.java b/processing/src/test/java/io/druid/query/extraction/RegexDimExtractionFnTest.java index 9171556fdbc..d469d2241fb 100644 --- a/processing/src/test/java/io/druid/query/extraction/RegexDimExtractionFnTest.java +++ b/processing/src/test/java/io/druid/query/extraction/RegexDimExtractionFnTest.java @@ -20,6 +20,7 @@ package io.druid.query.extraction; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.ImmutableList; import com.google.common.collect.Sets; import 
io.druid.jackson.DefaultObjectMapper; import org.junit.Assert; @@ -55,51 +56,50 @@ public class RegexDimExtractionFnTest public void testPathExtraction() { String regex = "/([^/]+)/"; - ExtractionFn extractionFn = new RegexDimExtractionFn(regex); - Set extracted = Sets.newHashSet(); + ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null); + Set extracted = Sets.newLinkedHashSet(); for (String path : paths) { extracted.add(extractionFn.apply(path)); } - Assert.assertEquals(2, extracted.size()); - Assert.assertTrue(extracted.contains("druid")); - Assert.assertTrue(extracted.contains("dash")); + Set expected = Sets.newLinkedHashSet(ImmutableList.of("druid", "dash")); + Assert.assertEquals(expected, extracted); } @Test public void testDeeperPathExtraction() { String regex = "^/([^/]+/[^/]+)(/|$)"; - ExtractionFn extractionFn = new RegexDimExtractionFn(regex); - Set extracted = Sets.newHashSet(); + ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null); + Set extracted = Sets.newLinkedHashSet(); for (String path : paths) { extracted.add(extractionFn.apply(path)); } - Assert.assertEquals(4, extracted.size()); - Assert.assertTrue(extracted.contains("druid/prod")); - Assert.assertTrue(extracted.contains("druid/demo")); - Assert.assertTrue(extracted.contains("dash/aloe")); - Assert.assertTrue(extracted.contains("dash/baloo")); + Set expected = Sets.newLinkedHashSet( + ImmutableList.of( + "druid/prod", "druid/demo", + "dash/aloe", "dash/baloo" + ) + ); + Assert.assertEquals(expected, extracted); } @Test public void testStringExtraction() { String regex = "(.)"; - ExtractionFn extractionFn = new RegexDimExtractionFn(regex); - Set extracted = Sets.newHashSet(); + ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null); + Set extracted = Sets.newLinkedHashSet(); for (String testString : testStrings) { extracted.add(extractionFn.apply(testString)); } - Assert.assertEquals(3, extracted.size()); - 
Assert.assertTrue(extracted.contains("a")); - Assert.assertTrue(extracted.contains("b")); - Assert.assertTrue(extracted.contains("c")); + Set expected = Sets.newLinkedHashSet(ImmutableList.of("a", "b", "c")); + Assert.assertEquals(expected, extracted); } @@ -107,7 +107,7 @@ public class RegexDimExtractionFnTest public void testNullAndEmpty() { String regex = "(.*)/.*/.*"; - ExtractionFn extractionFn = new RegexDimExtractionFn(regex); + ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null); // no match, map empty input value to null Assert.assertEquals(null, extractionFn.apply("")); // null value, returns null @@ -116,14 +116,54 @@ public class RegexDimExtractionFnTest Assert.assertEquals(null, extractionFn.apply("/a/b")); } + @Test + public void testMissingValueReplacement() + { + String regex = "(a\\w*)"; + ExtractionFn extractionFn = new RegexDimExtractionFn(regex, true, "foobar"); + Set extracted = Sets.newLinkedHashSet(); + + for (String testString : testStrings) { + extracted.add(extractionFn.apply(testString)); + } + + Set expected = Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum", "foobar")); + Assert.assertEquals(expected, extracted); + + byte[] cacheKey = extractionFn.getCacheKey(); + byte[] expectedCacheKey = new byte[]{ + 0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF, + 0x66, 0x6F, 0x6F, 0x62, 0x61, 0x72, (byte) 0xFF, 0x01 + }; + Assert.assertArrayEquals(expectedCacheKey, cacheKey); + + ExtractionFn nullExtractionFn = new RegexDimExtractionFn(regex, true, null); + Set extracted2 = Sets.newLinkedHashSet(); + + for (String testString : testStrings) { + extracted2.add(nullExtractionFn.apply(testString)); + } + + Set expected2 = Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum")); + expected2.add(null); + Assert.assertEquals(expected2, extracted2); + + cacheKey = nullExtractionFn.getCacheKey(); + expectedCacheKey = new byte[]{0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF, (byte) 0xFF, 
0x01}; + Assert.assertArrayEquals(expectedCacheKey, cacheKey); + } + @Test public void testSerde() throws Exception { final ObjectMapper objectMapper = new DefaultObjectMapper(); - final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" }"; + final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" , " + + "\"replaceMissingValues\": true, \"replaceMissingValuesWith\":\"foobar\"}"; RegexDimExtractionFn extractionFn = (RegexDimExtractionFn) objectMapper.readValue(json, ExtractionFn.class); Assert.assertEquals(".(...)?", extractionFn.getExpr()); + Assert.assertTrue(extractionFn.isReplaceMissingValues()); + Assert.assertEquals("foobar", extractionFn.getReplaceMissingValuesWith()); // round trip Assert.assertEquals( diff --git a/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java b/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java index 450f9ed15e1..5d10abf9769 100644 --- a/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java +++ b/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java @@ -730,7 +730,7 @@ public class GroupByQueryRunnerTest @Test public void testGroupByWithNullProducingDimExtractionFn() { - final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})") + final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null) { @Override public byte[] getCacheKey() @@ -797,7 +797,7 @@ public class GroupByQueryRunnerTest */ public void testGroupByWithEmptyStringProducingDimExtractionFn() { - final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})") + final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null) { @Override public byte[] getCacheKey() diff --git a/processing/src/test/java/io/druid/query/topn/TopNQueryRunnerTest.java b/processing/src/test/java/io/druid/query/topn/TopNQueryRunnerTest.java index 50d51eef6b5..244b8baea43 100644 --- 
a/processing/src/test/java/io/druid/query/topn/TopNQueryRunnerTest.java +++ b/processing/src/test/java/io/druid/query/topn/TopNQueryRunnerTest.java @@ -1519,7 +1519,7 @@ public class TopNQueryRunnerTest .dimension( new ExtractionDimensionSpec( QueryRunnerTestHelper.qualityDimension, QueryRunnerTestHelper.qualityDimension, - new RegexDimExtractionFn(".(.)"), null + new RegexDimExtractionFn(".(.)", false, null), null ) ) .metric("index") @@ -1568,7 +1568,7 @@ public class TopNQueryRunnerTest new ExtractionDimensionSpec( QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension, - new RegexDimExtractionFn("(.)"), + new RegexDimExtractionFn("(.)", false, null), null ) ) @@ -2074,7 +2074,7 @@ public class TopNQueryRunnerTest new ExtractionDimensionSpec( QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension, - new RegexDimExtractionFn("(.)"), + new RegexDimExtractionFn("(.)", false, null), null ) ) @@ -2128,7 +2128,7 @@ public class TopNQueryRunnerTest new ExtractionDimensionSpec( QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension, - new RegexDimExtractionFn("..(.)"), + new RegexDimExtractionFn("..(.)", false, null), null ) ) @@ -2182,7 +2182,7 @@ public class TopNQueryRunnerTest new ExtractionDimensionSpec( QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension, - new RegexDimExtractionFn("(.)"), + new RegexDimExtractionFn("(.)", false, null), null ) ) @@ -2300,7 +2300,7 @@ public class TopNQueryRunnerTest new ExtractionDimensionSpec( QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension, - new RegexDimExtractionFn("(.)"), + new RegexDimExtractionFn("(.)", false, null), null ) ) @@ -2347,7 +2347,7 @@ public class TopNQueryRunnerTest new ExtractionDimensionSpec( QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension, - new RegexDimExtractionFn("..(.)"), + new RegexDimExtractionFn("..(.)", false, null), null ) )