Configurable value replacement on match failure for RegexExtractionFn

This commit is contained in:
jon-wei 2015-12-09 12:01:52 -08:00
parent ce79d707dd
commit c88f75df7c
5 changed files with 127 additions and 33 deletions

View File

@ -49,12 +49,23 @@ Returns the first matching group for the given regular expression.
If there is no match, it returns the dimension value as is.
```json
{ "type" : "regex", "expr" : <regular_expression> }
{
"type" : "regex", "expr" : <regular_expression>,
"replaceMissingValues" : true,
"replaceMissingValuesWith" : "foobar"
}
```
For example, using `"expr" : "(\\w\\w\\w).*"` will transform
`'Monday'`, `'Tuesday'`, `'Wednesday'` into `'Mon'`, `'Tue'`, `'Wed'`.
If the `replaceMissingValues` property is true, the extraction function will transform dimension values that do not match the regex pattern to a user-specified String. Default value is `false`.
The `replaceMissingValuesWith` property sets the String that unmatched dimension values will be replaced with when `replaceMissingValues` is true. If `replaceMissingValuesWith` is not specified (or is an empty string), unmatched dimension values will be replaced with null.
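For example, if `expr` is `"^(a\\w+)"` in the example JSON above (a regex that matches values starting with the letter `a`), the extraction function will convert a dimension value like `banana` to `foobar`. Note that the pattern is searched for anywhere in the value, so an unanchored `"(a\\w+)"` would match the `anana` in `banana` and return that instead of the replacement.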
### Partial Extraction Function
Returns the dimension value unchanged if the regular expression matches, otherwise returns null.

View File

@ -34,28 +34,53 @@ import java.util.regex.Pattern;
public class RegexDimExtractionFn extends DimExtractionFn
{
// Type id written as the first byte of every cache key built by getCacheKey().
private static final byte CACHE_TYPE_ID = 0x1;
// Delimiter byte written between the fields of the cache key.
private static final byte CACHE_KEY_SEPARATOR = (byte) 0xFF;
// Source text of the regular expression; exposed via JSON and used in the cache key.
private final String expr;
// Compiled form of expr, built once in the constructor.
private final Pattern pattern;
// When true, values that do not match the pattern are replaced rather than passed through.
private final boolean replaceMissingValues;
// Replacement used for non-matching values when replaceMissingValues is true; may be null.
private final String replaceMissingValuesWith;
/**
 * Creates a regex-based extraction function.
 *
 * @param expr                     regular expression to compile; must not be null
 * @param replaceMissingValues     whether values that do not match are replaced;
 *                                 null is treated as false
 * @param replaceMissingValuesWith replacement for values that do not match; may be null
 */
@JsonCreator
public RegexDimExtractionFn(
@JsonProperty("expr") String expr
@JsonProperty("expr") String expr,
@JsonProperty("replaceMissingValues") Boolean replaceMissingValues,
@JsonProperty("replaceMissingValuesWith") String replaceMissingValuesWith
)
{
Preconditions.checkNotNull(expr, "expr must not be null");
this.expr = expr;
this.pattern = Pattern.compile(expr);
// The boxed Boolean lets the JSON property be omitted; absent means "do not replace".
this.replaceMissingValues = replaceMissingValues == null ? false : replaceMissingValues;
this.replaceMissingValuesWith = replaceMissingValuesWith;
}
/**
 * Builds the cache key for this extraction function.
 *
 * Layout, in order: CACHE_TYPE_ID, the bytes of expr (via StringUtils.toUtf8),
 * CACHE_KEY_SEPARATOR, the bytes of replaceMissingValuesWith (no bytes when it is null),
 * CACHE_KEY_SEPARATOR, and one flag byte that is 1 when replaceMissingValues is set
 * and 0 otherwise.
 *
 * @return the assembled key as a byte array
 */
@Override
public byte[] getCacheKey()
{
byte[] exprBytes = StringUtils.toUtf8(expr);
return ByteBuffer.allocate(1 + exprBytes.length)
// Single flag byte encoding whether replacement is enabled.
byte[] replaceBytes = replaceMissingValues ? new byte[]{1} : new byte[]{0};
byte[] replaceStrBytes;
// A null replacement contributes zero bytes, so it yields the same key as an empty string.
if (replaceMissingValuesWith == null) {
replaceStrBytes = new byte[]{};
} else {
replaceStrBytes = StringUtils.toUtf8(replaceMissingValuesWith);
}
// The leading 1 accounts for the CACHE_TYPE_ID byte.
int totalLen = 1
+ exprBytes.length
+ replaceBytes.length
+ replaceStrBytes.length; // fields
totalLen += 2; // separators
return ByteBuffer.allocate(totalLen)
.put(CACHE_TYPE_ID)
.put(exprBytes)
.put(CACHE_KEY_SEPARATOR)
.put(replaceStrBytes)
.put(CACHE_KEY_SEPARATOR)
.put(replaceBytes)
.array();
}
@ -65,8 +90,14 @@ public class RegexDimExtractionFn extends DimExtractionFn
if (dimValue == null) {
return null;
}
String retVal;
Matcher matcher = pattern.matcher(dimValue);
return Strings.emptyToNull(matcher.find() ? matcher.group(1) : dimValue);
if (matcher.find()) {
retVal = matcher.group(1);
} else {
retVal = replaceMissingValues ? replaceMissingValuesWith : dimValue;
}
return Strings.emptyToNull(retVal);
}
@JsonProperty("expr")
@ -75,6 +106,18 @@ public class RegexDimExtractionFn extends DimExtractionFn
return expr;
}
/**
 * @return true if values that do not match the pattern are replaced
 *         instead of being returned unchanged
 */
@JsonProperty("replaceMissingValues")
public boolean isReplaceMissingValues()
{
return replaceMissingValues;
}
/**
 * @return the configured replacement for values that do not match the pattern;
 *         may be null
 */
@JsonProperty("replaceMissingValuesWith")
public String getReplaceMissingValuesWith()
{
return replaceMissingValuesWith;
}
@Override
public boolean preservesOrdering()
{

View File

@ -20,6 +20,7 @@
package io.druid.query.extraction;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;
import io.druid.jackson.DefaultObjectMapper;
import org.junit.Assert;
@ -55,51 +56,50 @@ public class RegexDimExtractionFnTest
public void testPathExtraction()
{
String regex = "/([^/]+)/";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
Set<String> extracted = Sets.newHashSet();
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
Set<String> extracted = Sets.newLinkedHashSet();
for (String path : paths) {
extracted.add(extractionFn.apply(path));
}
Assert.assertEquals(2, extracted.size());
Assert.assertTrue(extracted.contains("druid"));
Assert.assertTrue(extracted.contains("dash"));
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("druid", "dash"));
Assert.assertEquals(expected, extracted);
}
// Applies a two-path-segment regex to every entry of the `paths` fixture (declared outside
// this view) and checks the set of distinct extracted values.
@Test
public void testDeeperPathExtraction()
{
String regex = "^/([^/]+/[^/]+)(/|$)";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
Set<String> extracted = Sets.newHashSet();
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
Set<String> extracted = Sets.newLinkedHashSet();
for (String path : paths) {
extracted.add(extractionFn.apply(path));
}
Assert.assertEquals(4, extracted.size());
Assert.assertTrue(extracted.contains("druid/prod"));
Assert.assertTrue(extracted.contains("druid/demo"));
Assert.assertTrue(extracted.contains("dash/aloe"));
Assert.assertTrue(extracted.contains("dash/baloo"));
// Set equality ignores iteration order, so this checks membership only.
Set<String> expected = Sets.newLinkedHashSet(
ImmutableList.of(
"druid/prod", "druid/demo",
"dash/aloe", "dash/baloo"
)
);
Assert.assertEquals(expected, extracted);
}
// Extracts the first character of every entry of the `testStrings` fixture (declared outside
// this view) and checks the set of distinct results.
@Test
public void testStringExtraction()
{
String regex = "(.)";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
Set<String> extracted = Sets.newHashSet();
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
Set<String> extracted = Sets.newLinkedHashSet();
for (String testString : testStrings) {
extracted.add(extractionFn.apply(testString));
}
Assert.assertEquals(3, extracted.size());
Assert.assertTrue(extracted.contains("a"));
Assert.assertTrue(extracted.contains("b"));
Assert.assertTrue(extracted.contains("c"));
// Set equality ignores iteration order, so this checks membership only.
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("a", "b", "c"));
Assert.assertEquals(expected, extracted);
}
@ -107,7 +107,7 @@ public class RegexDimExtractionFnTest
public void testNullAndEmpty()
{
String regex = "(.*)/.*/.*";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
// no match, map empty input value to null
Assert.assertEquals(null, extractionFn.apply(""));
// null value, returns null
@ -116,14 +116,54 @@ public class RegexDimExtractionFnTest
Assert.assertEquals(null, extractionFn.apply("/a/b"));
}
// Verifies replacement of non-matching values and the resulting cache keys, first with a
// concrete replacement string and then with a null replacement.
@Test
public void testMissingValueReplacement()
{
String regex = "(a\\w*)";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, true, "foobar");
Set<String> extracted = Sets.newLinkedHashSet();
for (String testString : testStrings) {
extracted.add(extractionFn.apply(testString));
}
// Entries of testStrings (fixture declared outside this view) that do not match collapse
// to the single replacement value "foobar".
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum", "foobar"));
Assert.assertEquals(expected, extracted);
byte[] cacheKey = extractionFn.getCacheKey();
// Expected layout: type id 0x01, the regex text "(a\w*)", separator 0xFF,
// the replacement "foobar", separator 0xFF, flag byte 0x01 (replacement enabled).
byte[] expectedCacheKey = new byte[]{
0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF,
0x66, 0x6F, 0x6F, 0x62, 0x61, 0x72, (byte) 0xFF, 0x01
};
Assert.assertArrayEquals(expectedCacheKey, cacheKey);
// Same regex, but with a null replacement: non-matching values become null.
ExtractionFn nullExtractionFn = new RegexDimExtractionFn(regex, true, null);
Set<String> extracted2 = Sets.newLinkedHashSet();
for (String testString : testStrings) {
extracted2.add(nullExtractionFn.apply(testString));
}
Set<String> expected2 = Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum"));
// ImmutableList rejects null elements, so null is added to the set separately.
expected2.add(null);
Assert.assertEquals(expected2, extracted2);
cacheKey = nullExtractionFn.getCacheKey();
// A null replacement contributes no bytes, leaving the two separators adjacent.
expectedCacheKey = new byte[]{0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF, (byte) 0xFF, 0x01};
Assert.assertArrayEquals(expectedCacheKey, cacheKey);
}
@Test
public void testSerde() throws Exception
{
final ObjectMapper objectMapper = new DefaultObjectMapper();
final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" }";
final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" , " +
"\"replaceMissingValues\": true, \"replaceMissingValuesWith\":\"foobar\"}";
RegexDimExtractionFn extractionFn = (RegexDimExtractionFn) objectMapper.readValue(json, ExtractionFn.class);
Assert.assertEquals(".(...)?", extractionFn.getExpr());
Assert.assertTrue(extractionFn.isReplaceMissingValues());
Assert.assertEquals("foobar", extractionFn.getReplaceMissingValuesWith());
// round trip
Assert.assertEquals(

View File

@ -730,7 +730,7 @@ public class GroupByQueryRunnerTest
@Test
public void testGroupByWithNullProducingDimExtractionFn()
{
final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})")
final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null)
{
@Override
public byte[] getCacheKey()
@ -797,7 +797,7 @@ public class GroupByQueryRunnerTest
*/
public void testGroupByWithEmptyStringProducingDimExtractionFn()
{
final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})")
final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null)
{
@Override
public byte[] getCacheKey()

View File

@ -1519,7 +1519,7 @@ public class TopNQueryRunnerTest
.dimension(
new ExtractionDimensionSpec(
QueryRunnerTestHelper.qualityDimension, QueryRunnerTestHelper.qualityDimension,
new RegexDimExtractionFn(".(.)"), null
new RegexDimExtractionFn(".(.)", false, null), null
)
)
.metric("index")
@ -1568,7 +1568,7 @@ public class TopNQueryRunnerTest
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"),
new RegexDimExtractionFn("(.)", false, null),
null
)
)
@ -2074,7 +2074,7 @@ public class TopNQueryRunnerTest
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"),
new RegexDimExtractionFn("(.)", false, null),
null
)
)
@ -2128,7 +2128,7 @@ public class TopNQueryRunnerTest
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("..(.)"),
new RegexDimExtractionFn("..(.)", false, null),
null
)
)
@ -2182,7 +2182,7 @@ public class TopNQueryRunnerTest
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"),
new RegexDimExtractionFn("(.)", false, null),
null
)
)
@ -2300,7 +2300,7 @@ public class TopNQueryRunnerTest
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"),
new RegexDimExtractionFn("(.)", false, null),
null
)
)
@ -2347,7 +2347,7 @@ public class TopNQueryRunnerTest
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("..(.)"),
new RegexDimExtractionFn("..(.)", false, null),
null
)
)