Merge pull request #2075 from jon-wei/regex_extract

Configurable value replacement on match failure for RegexExtractionFn
This commit is contained in:
Fangjin Yang 2015-12-14 19:10:50 -08:00
commit e7f06cf61c
5 changed files with 127 additions and 33 deletions

View File

@ -49,12 +49,23 @@ Returns the first matching group for the given regular expression.
If there is no match, it returns the dimension value as is. If there is no match, it returns the dimension value as is.
```json ```json
{ "type" : "regex", "expr" : <regular_expression> } {
"type" : "regex", "expr" : <regular_expression>,
"replaceMissingValues" : true,
"replaceMissingValuesWith" : "foobar"
}
``` ```
For example, using `"expr" : "(\\w\\w\\w).*"` will transform For example, using `"expr" : "(\\w\\w\\w).*"` will transform
`'Monday'`, `'Tuesday'`, `'Wednesday'` into `'Mon'`, `'Tue'`, `'Wed'`. `'Monday'`, `'Tuesday'`, `'Wednesday'` into `'Mon'`, `'Tue'`, `'Wed'`.
If the `replaceMissingValues` property is true, the extraction function will transform dimension values that do not match the regex pattern to a user-specified String. Default value is `false`.
The `replaceMissingValuesWith` property sets the String that unmatched dimension values will be replaced with when `replaceMissingValues` is true. If `replaceMissingValuesWith` is not specified, unmatched dimension values will be replaced with `null`.
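For example, if `expr` is `"^(a\\w+)"` in the example JSON above, a regex that matches values starting with the letter `a`, the extraction function will convert a dimension value like `banana` to `foobar`. Note that the pattern is searched for anywhere in the value, so without the leading `^` anchor `banana` would match at `anana` and would not be replaced.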
### Partial Extraction Function ### Partial Extraction Function
Returns the dimension value unchanged if the regular expression matches, otherwise returns null. Returns the dimension value unchanged if the regular expression matches, otherwise returns null.

View File

@ -34,28 +34,53 @@ import java.util.regex.Pattern;
public class RegexDimExtractionFn extends DimExtractionFn public class RegexDimExtractionFn extends DimExtractionFn
{ {
private static final byte CACHE_TYPE_ID = 0x1; private static final byte CACHE_TYPE_ID = 0x1;
private static final byte CACHE_KEY_SEPARATOR = (byte) 0xFF;
private final String expr; private final String expr;
private final Pattern pattern; private final Pattern pattern;
private final boolean replaceMissingValues;
private final String replaceMissingValuesWith;
@JsonCreator @JsonCreator
public RegexDimExtractionFn( public RegexDimExtractionFn(
@JsonProperty("expr") String expr @JsonProperty("expr") String expr,
@JsonProperty("replaceMissingValues") Boolean replaceMissingValues,
@JsonProperty("replaceMissingValuesWith") String replaceMissingValuesWith
) )
{ {
Preconditions.checkNotNull(expr, "expr must not be null"); Preconditions.checkNotNull(expr, "expr must not be null");
this.expr = expr; this.expr = expr;
this.pattern = Pattern.compile(expr); this.pattern = Pattern.compile(expr);
this.replaceMissingValues = replaceMissingValues == null ? false : replaceMissingValues;
this.replaceMissingValuesWith = replaceMissingValuesWith;
} }
@Override @Override
public byte[] getCacheKey() public byte[] getCacheKey()
{ {
byte[] exprBytes = StringUtils.toUtf8(expr); byte[] exprBytes = StringUtils.toUtf8(expr);
return ByteBuffer.allocate(1 + exprBytes.length) byte[] replaceBytes = replaceMissingValues ? new byte[]{1} : new byte[]{0};
byte[] replaceStrBytes;
if (replaceMissingValuesWith == null) {
replaceStrBytes = new byte[]{};
} else {
replaceStrBytes = StringUtils.toUtf8(replaceMissingValuesWith);
}
int totalLen = 1
+ exprBytes.length
+ replaceBytes.length
+ replaceStrBytes.length; // fields
totalLen += 2; // separators
return ByteBuffer.allocate(totalLen)
.put(CACHE_TYPE_ID) .put(CACHE_TYPE_ID)
.put(exprBytes) .put(exprBytes)
.put(CACHE_KEY_SEPARATOR)
.put(replaceStrBytes)
.put(CACHE_KEY_SEPARATOR)
.put(replaceBytes)
.array(); .array();
} }
@ -65,8 +90,14 @@ public class RegexDimExtractionFn extends DimExtractionFn
if (dimValue == null) { if (dimValue == null) {
return null; return null;
} }
String retVal;
Matcher matcher = pattern.matcher(dimValue); Matcher matcher = pattern.matcher(dimValue);
return Strings.emptyToNull(matcher.find() ? matcher.group(1) : dimValue); if (matcher.find()) {
retVal = matcher.group(1);
} else {
retVal = replaceMissingValues ? replaceMissingValuesWith : dimValue;
}
return Strings.emptyToNull(retVal);
} }
@JsonProperty("expr") @JsonProperty("expr")
@ -75,6 +106,18 @@ public class RegexDimExtractionFn extends DimExtractionFn
return expr; return expr;
} }
/**
 * Returns whether dimension values that do not match the regex are replaced
 * with {@code replaceMissingValuesWith} instead of being returned as is.
 * Serialized under the JSON property "replaceMissingValues".
 */
@JsonProperty("replaceMissingValues")
public boolean isReplaceMissingValues()
{
return replaceMissingValues;
}
/**
 * Returns the replacement used for non-matching dimension values when
 * replacement is enabled. May be null, in which case non-matching values
 * are replaced with null.
 * Serialized under the JSON property "replaceMissingValuesWith".
 */
@JsonProperty("replaceMissingValuesWith")
public String getReplaceMissingValuesWith()
{
return replaceMissingValuesWith;
}
@Override @Override
public boolean preservesOrdering() public boolean preservesOrdering()
{ {

View File

@ -20,6 +20,7 @@
package io.druid.query.extraction; package io.druid.query.extraction;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import io.druid.jackson.DefaultObjectMapper; import io.druid.jackson.DefaultObjectMapper;
import org.junit.Assert; import org.junit.Assert;
@ -55,51 +56,50 @@ public class RegexDimExtractionFnTest
public void testPathExtraction() public void testPathExtraction()
{ {
String regex = "/([^/]+)/"; String regex = "/([^/]+)/";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex); ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
Set<String> extracted = Sets.newHashSet(); Set<String> extracted = Sets.newLinkedHashSet();
for (String path : paths) { for (String path : paths) {
extracted.add(extractionFn.apply(path)); extracted.add(extractionFn.apply(path));
} }
Assert.assertEquals(2, extracted.size()); Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("druid", "dash"));
Assert.assertTrue(extracted.contains("druid")); Assert.assertEquals(expected, extracted);
Assert.assertTrue(extracted.contains("dash"));
} }
@Test @Test
public void testDeeperPathExtraction() public void testDeeperPathExtraction()
{ {
String regex = "^/([^/]+/[^/]+)(/|$)"; String regex = "^/([^/]+/[^/]+)(/|$)";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex); ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
Set<String> extracted = Sets.newHashSet(); Set<String> extracted = Sets.newLinkedHashSet();
for (String path : paths) { for (String path : paths) {
extracted.add(extractionFn.apply(path)); extracted.add(extractionFn.apply(path));
} }
Assert.assertEquals(4, extracted.size()); Set<String> expected = Sets.newLinkedHashSet(
Assert.assertTrue(extracted.contains("druid/prod")); ImmutableList.of(
Assert.assertTrue(extracted.contains("druid/demo")); "druid/prod", "druid/demo",
Assert.assertTrue(extracted.contains("dash/aloe")); "dash/aloe", "dash/baloo"
Assert.assertTrue(extracted.contains("dash/baloo")); )
);
Assert.assertEquals(expected, extracted);
} }
@Test @Test
public void testStringExtraction() public void testStringExtraction()
{ {
String regex = "(.)"; String regex = "(.)";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex); ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
Set<String> extracted = Sets.newHashSet(); Set<String> extracted = Sets.newLinkedHashSet();
for (String testString : testStrings) { for (String testString : testStrings) {
extracted.add(extractionFn.apply(testString)); extracted.add(extractionFn.apply(testString));
} }
Assert.assertEquals(3, extracted.size()); Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("a", "b", "c"));
Assert.assertTrue(extracted.contains("a")); Assert.assertEquals(expected, extracted);
Assert.assertTrue(extracted.contains("b"));
Assert.assertTrue(extracted.contains("c"));
} }
@ -107,7 +107,7 @@ public class RegexDimExtractionFnTest
public void testNullAndEmpty() public void testNullAndEmpty()
{ {
String regex = "(.*)/.*/.*"; String regex = "(.*)/.*/.*";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex); ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
// no match, map empty input value to null // no match, map empty input value to null
Assert.assertEquals(null, extractionFn.apply("")); Assert.assertEquals(null, extractionFn.apply(""));
// null value, returns null // null value, returns null
@ -116,14 +116,54 @@ public class RegexDimExtractionFnTest
Assert.assertEquals(null, extractionFn.apply("/a/b")); Assert.assertEquals(null, extractionFn.apply("/a/b"));
} }
/**
 * Checks that values which do not match the regex are replaced with the
 * configured string, or with null when no replacement string is given, and
 * pins the exact cache key bytes produced in both configurations.
 */
@Test
public void testMissingValueReplacement()
{
String regex = "(a\\w*)";
// Case 1: replacement enabled with an explicit replacement string.
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, true, "foobar");
Set<String> extracted = Sets.newLinkedHashSet();
for (String testString : testStrings) {
extracted.add(extractionFn.apply(testString));
}
// Matching values keep their captured group; every non-matching value collapses to "foobar".
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum", "foobar"));
Assert.assertEquals(expected, extracted);
byte[] cacheKey = extractionFn.getCacheKey();
// Layout: 0x01, UTF-8 bytes of "(a\w*)", 0xFF, UTF-8 bytes of "foobar", 0xFF, 0x01 (replacement enabled).
byte[] expectedCacheKey = new byte[]{
0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF,
0x66, 0x6F, 0x6F, 0x62, 0x61, 0x72, (byte) 0xFF, 0x01
};
Assert.assertArrayEquals(expectedCacheKey, cacheKey);
// Case 2: replacement enabled but no replacement string, so non-matching values become null.
ExtractionFn nullExtractionFn = new RegexDimExtractionFn(regex, true, null);
Set<String> extracted2 = Sets.newLinkedHashSet();
for (String testString : testStrings) {
extracted2.add(nullExtractionFn.apply(testString));
}
Set<String> expected2 = Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum"));
// ImmutableList rejects null elements, so the null entry is added separately.
expected2.add(null);
Assert.assertEquals(expected2, extracted2);
cacheKey = nullExtractionFn.getCacheKey();
// Same layout as above; the two 0xFF separators are adjacent because the replacement string contributes no bytes.
expectedCacheKey = new byte[]{0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF, (byte) 0xFF, 0x01};
Assert.assertArrayEquals(expectedCacheKey, cacheKey);
}
@Test @Test
public void testSerde() throws Exception public void testSerde() throws Exception
{ {
final ObjectMapper objectMapper = new DefaultObjectMapper(); final ObjectMapper objectMapper = new DefaultObjectMapper();
final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" }"; final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" , " +
"\"replaceMissingValues\": true, \"replaceMissingValuesWith\":\"foobar\"}";
RegexDimExtractionFn extractionFn = (RegexDimExtractionFn) objectMapper.readValue(json, ExtractionFn.class); RegexDimExtractionFn extractionFn = (RegexDimExtractionFn) objectMapper.readValue(json, ExtractionFn.class);
Assert.assertEquals(".(...)?", extractionFn.getExpr()); Assert.assertEquals(".(...)?", extractionFn.getExpr());
Assert.assertTrue(extractionFn.isReplaceMissingValues());
Assert.assertEquals("foobar", extractionFn.getReplaceMissingValuesWith());
// round trip // round trip
Assert.assertEquals( Assert.assertEquals(

View File

@ -730,7 +730,7 @@ public class GroupByQueryRunnerTest
@Test @Test
public void testGroupByWithNullProducingDimExtractionFn() public void testGroupByWithNullProducingDimExtractionFn()
{ {
final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})") final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null)
{ {
@Override @Override
public byte[] getCacheKey() public byte[] getCacheKey()
@ -797,7 +797,7 @@ public class GroupByQueryRunnerTest
*/ */
public void testGroupByWithEmptyStringProducingDimExtractionFn() public void testGroupByWithEmptyStringProducingDimExtractionFn()
{ {
final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})") final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null)
{ {
@Override @Override
public byte[] getCacheKey() public byte[] getCacheKey()

View File

@ -1519,7 +1519,7 @@ public class TopNQueryRunnerTest
.dimension( .dimension(
new ExtractionDimensionSpec( new ExtractionDimensionSpec(
QueryRunnerTestHelper.qualityDimension, QueryRunnerTestHelper.qualityDimension, QueryRunnerTestHelper.qualityDimension, QueryRunnerTestHelper.qualityDimension,
new RegexDimExtractionFn(".(.)"), null new RegexDimExtractionFn(".(.)", false, null), null
) )
) )
.metric("index") .metric("index")
@ -1568,7 +1568,7 @@ public class TopNQueryRunnerTest
new ExtractionDimensionSpec( new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"), new RegexDimExtractionFn("(.)", false, null),
null null
) )
) )
@ -2074,7 +2074,7 @@ public class TopNQueryRunnerTest
new ExtractionDimensionSpec( new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"), new RegexDimExtractionFn("(.)", false, null),
null null
) )
) )
@ -2128,7 +2128,7 @@ public class TopNQueryRunnerTest
new ExtractionDimensionSpec( new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("..(.)"), new RegexDimExtractionFn("..(.)", false, null),
null null
) )
) )
@ -2182,7 +2182,7 @@ public class TopNQueryRunnerTest
new ExtractionDimensionSpec( new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"), new RegexDimExtractionFn("(.)", false, null),
null null
) )
) )
@ -2300,7 +2300,7 @@ public class TopNQueryRunnerTest
new ExtractionDimensionSpec( new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"), new RegexDimExtractionFn("(.)", false, null),
null null
) )
) )
@ -2347,7 +2347,7 @@ public class TopNQueryRunnerTest
new ExtractionDimensionSpec( new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension, QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("..(.)"), new RegexDimExtractionFn("..(.)", false, null),
null null
) )
) )