mirror of https://github.com/apache/druid.git
Merge pull request #2075 from jon-wei/regex_extract
Configurable value replacement on match failure for RegexExtractionFn
This commit is contained in:
commit
e7f06cf61c
|
@ -49,12 +49,23 @@ Returns the first matching group for the given regular expression.
|
|||
If there is no match, it returns the dimension value as is by default; this can be changed with the `replaceMissingValues` property described below.
|
||||
|
||||
```json
|
||||
{ "type" : "regex", "expr" : <regular_expression> }
|
||||
{
|
||||
"type" : "regex", "expr" : <regular_expression>,
|
||||
"replaceMissingValues" : true,
|
||||
"replaceMissingValuesWith" : "foobar"
|
||||
}
|
||||
```
|
||||
|
||||
For example, using `"expr" : "(\\w\\w\\w).*"` will transform
|
||||
`'Monday'`, `'Tuesday'`, `'Wednesday'` into `'Mon'`, `'Tue'`, `'Wed'`.
|
||||
|
||||
If the `replaceMissingValues` property is true, the extraction function will transform dimension values that do not match the regex pattern to a user-specified String. Default value is `false`.
|
||||
|
||||
The `replaceMissingValuesWith` property sets the String that unmatched dimension values will be replaced with, if `replaceMissingValues` is true. If `replaceMissingValuesWith` is not specified (or is an empty string), unmatched dimension values will be replaced with nulls.
|
||||
|
||||
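For example, if `expr` is `"^(a\\w+)"` in the example JSON above, a regex that matches values starting with the letter `a`, the extraction function will convert a dimension value like `banana` to `foobar`. (The `^` anchor matters: the pattern is searched for anywhere in the value, so without it `banana` would match at `anana` and be extracted as `anana` instead of being replaced.)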
|
||||
|
||||
|
||||
### Partial Extraction Function
|
||||
|
||||
Returns the dimension value unchanged if the regular expression matches, otherwise returns null.
|
||||
|
|
|
@ -34,28 +34,53 @@ import java.util.regex.Pattern;
|
|||
public class RegexDimExtractionFn extends DimExtractionFn
|
||||
{
|
||||
private static final byte CACHE_TYPE_ID = 0x1;
|
||||
private static final byte CACHE_KEY_SEPARATOR = (byte) 0xFF;
|
||||
|
||||
private final String expr;
|
||||
private final Pattern pattern;
|
||||
private final boolean replaceMissingValues;
|
||||
private final String replaceMissingValuesWith;
|
||||
|
||||
@JsonCreator
|
||||
public RegexDimExtractionFn(
|
||||
@JsonProperty("expr") String expr
|
||||
@JsonProperty("expr") String expr,
|
||||
@JsonProperty("replaceMissingValues") Boolean replaceMissingValues,
|
||||
@JsonProperty("replaceMissingValuesWith") String replaceMissingValuesWith
|
||||
)
|
||||
{
|
||||
Preconditions.checkNotNull(expr, "expr must not be null");
|
||||
|
||||
this.expr = expr;
|
||||
this.pattern = Pattern.compile(expr);
|
||||
this.replaceMissingValues = replaceMissingValues == null ? false : replaceMissingValues;
|
||||
this.replaceMissingValuesWith = replaceMissingValuesWith;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] getCacheKey()
|
||||
{
|
||||
byte[] exprBytes = StringUtils.toUtf8(expr);
|
||||
return ByteBuffer.allocate(1 + exprBytes.length)
|
||||
byte[] replaceBytes = replaceMissingValues ? new byte[]{1} : new byte[]{0};
|
||||
byte[] replaceStrBytes;
|
||||
if (replaceMissingValuesWith == null) {
|
||||
replaceStrBytes = new byte[]{};
|
||||
} else {
|
||||
replaceStrBytes = StringUtils.toUtf8(replaceMissingValuesWith);
|
||||
}
|
||||
|
||||
int totalLen = 1
|
||||
+ exprBytes.length
|
||||
+ replaceBytes.length
|
||||
+ replaceStrBytes.length; // fields
|
||||
totalLen += 2; // separators
|
||||
|
||||
return ByteBuffer.allocate(totalLen)
|
||||
.put(CACHE_TYPE_ID)
|
||||
.put(exprBytes)
|
||||
.put(CACHE_KEY_SEPARATOR)
|
||||
.put(replaceStrBytes)
|
||||
.put(CACHE_KEY_SEPARATOR)
|
||||
.put(replaceBytes)
|
||||
.array();
|
||||
}
|
||||
|
||||
|
@ -65,8 +90,14 @@ public class RegexDimExtractionFn extends DimExtractionFn
|
|||
if (dimValue == null) {
|
||||
return null;
|
||||
}
|
||||
String retVal;
|
||||
Matcher matcher = pattern.matcher(dimValue);
|
||||
return Strings.emptyToNull(matcher.find() ? matcher.group(1) : dimValue);
|
||||
if (matcher.find()) {
|
||||
retVal = matcher.group(1);
|
||||
} else {
|
||||
retVal = replaceMissingValues ? replaceMissingValuesWith : dimValue;
|
||||
}
|
||||
return Strings.emptyToNull(retVal);
|
||||
}
|
||||
|
||||
@JsonProperty("expr")
|
||||
|
@ -75,6 +106,18 @@ public class RegexDimExtractionFn extends DimExtractionFn
|
|||
return expr;
|
||||
}
|
||||
|
||||
@JsonProperty("replaceMissingValues")
|
||||
public boolean isReplaceMissingValues()
|
||||
{
|
||||
return replaceMissingValues;
|
||||
}
|
||||
|
||||
@JsonProperty("replaceMissingValuesWith")
|
||||
public String getReplaceMissingValuesWith()
|
||||
{
|
||||
return replaceMissingValuesWith;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean preservesOrdering()
|
||||
{
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
package io.druid.query.extraction;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.Sets;
|
||||
import io.druid.jackson.DefaultObjectMapper;
|
||||
import org.junit.Assert;
|
||||
|
@ -55,51 +56,50 @@ public class RegexDimExtractionFnTest
|
|||
public void testPathExtraction()
|
||||
{
|
||||
String regex = "/([^/]+)/";
|
||||
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
|
||||
Set<String> extracted = Sets.newHashSet();
|
||||
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
|
||||
Set<String> extracted = Sets.newLinkedHashSet();
|
||||
|
||||
for (String path : paths) {
|
||||
extracted.add(extractionFn.apply(path));
|
||||
}
|
||||
|
||||
Assert.assertEquals(2, extracted.size());
|
||||
Assert.assertTrue(extracted.contains("druid"));
|
||||
Assert.assertTrue(extracted.contains("dash"));
|
||||
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("druid", "dash"));
|
||||
Assert.assertEquals(expected, extracted);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDeeperPathExtraction()
|
||||
{
|
||||
String regex = "^/([^/]+/[^/]+)(/|$)";
|
||||
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
|
||||
Set<String> extracted = Sets.newHashSet();
|
||||
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
|
||||
Set<String> extracted = Sets.newLinkedHashSet();
|
||||
|
||||
for (String path : paths) {
|
||||
extracted.add(extractionFn.apply(path));
|
||||
}
|
||||
|
||||
Assert.assertEquals(4, extracted.size());
|
||||
Assert.assertTrue(extracted.contains("druid/prod"));
|
||||
Assert.assertTrue(extracted.contains("druid/demo"));
|
||||
Assert.assertTrue(extracted.contains("dash/aloe"));
|
||||
Assert.assertTrue(extracted.contains("dash/baloo"));
|
||||
Set<String> expected = Sets.newLinkedHashSet(
|
||||
ImmutableList.of(
|
||||
"druid/prod", "druid/demo",
|
||||
"dash/aloe", "dash/baloo"
|
||||
)
|
||||
);
|
||||
Assert.assertEquals(expected, extracted);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStringExtraction()
|
||||
{
|
||||
String regex = "(.)";
|
||||
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
|
||||
Set<String> extracted = Sets.newHashSet();
|
||||
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
|
||||
Set<String> extracted = Sets.newLinkedHashSet();
|
||||
|
||||
for (String testString : testStrings) {
|
||||
extracted.add(extractionFn.apply(testString));
|
||||
}
|
||||
|
||||
Assert.assertEquals(3, extracted.size());
|
||||
Assert.assertTrue(extracted.contains("a"));
|
||||
Assert.assertTrue(extracted.contains("b"));
|
||||
Assert.assertTrue(extracted.contains("c"));
|
||||
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("a", "b", "c"));
|
||||
Assert.assertEquals(expected, extracted);
|
||||
}
|
||||
|
||||
|
||||
|
@ -107,7 +107,7 @@ public class RegexDimExtractionFnTest
|
|||
public void testNullAndEmpty()
|
||||
{
|
||||
String regex = "(.*)/.*/.*";
|
||||
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
|
||||
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
|
||||
// no match, map empty input value to null
|
||||
Assert.assertEquals(null, extractionFn.apply(""));
|
||||
// null value, returns null
|
||||
|
@ -116,14 +116,54 @@ public class RegexDimExtractionFnTest
|
|||
Assert.assertEquals(null, extractionFn.apply("/a/b"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMissingValueReplacement()
|
||||
{
|
||||
String regex = "(a\\w*)";
|
||||
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, true, "foobar");
|
||||
Set<String> extracted = Sets.newLinkedHashSet();
|
||||
|
||||
for (String testString : testStrings) {
|
||||
extracted.add(extractionFn.apply(testString));
|
||||
}
|
||||
|
||||
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum", "foobar"));
|
||||
Assert.assertEquals(expected, extracted);
|
||||
|
||||
byte[] cacheKey = extractionFn.getCacheKey();
|
||||
byte[] expectedCacheKey = new byte[]{
|
||||
0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF,
|
||||
0x66, 0x6F, 0x6F, 0x62, 0x61, 0x72, (byte) 0xFF, 0x01
|
||||
};
|
||||
Assert.assertArrayEquals(expectedCacheKey, cacheKey);
|
||||
|
||||
ExtractionFn nullExtractionFn = new RegexDimExtractionFn(regex, true, null);
|
||||
Set<String> extracted2 = Sets.newLinkedHashSet();
|
||||
|
||||
for (String testString : testStrings) {
|
||||
extracted2.add(nullExtractionFn.apply(testString));
|
||||
}
|
||||
|
||||
Set<String> expected2 = Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum"));
|
||||
expected2.add(null);
|
||||
Assert.assertEquals(expected2, extracted2);
|
||||
|
||||
cacheKey = nullExtractionFn.getCacheKey();
|
||||
expectedCacheKey = new byte[]{0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF, (byte) 0xFF, 0x01};
|
||||
Assert.assertArrayEquals(expectedCacheKey, cacheKey);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSerde() throws Exception
|
||||
{
|
||||
final ObjectMapper objectMapper = new DefaultObjectMapper();
|
||||
final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" }";
|
||||
final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" , " +
|
||||
"\"replaceMissingValues\": true, \"replaceMissingValuesWith\":\"foobar\"}";
|
||||
RegexDimExtractionFn extractionFn = (RegexDimExtractionFn) objectMapper.readValue(json, ExtractionFn.class);
|
||||
|
||||
Assert.assertEquals(".(...)?", extractionFn.getExpr());
|
||||
Assert.assertTrue(extractionFn.isReplaceMissingValues());
|
||||
Assert.assertEquals("foobar", extractionFn.getReplaceMissingValuesWith());
|
||||
|
||||
// round trip
|
||||
Assert.assertEquals(
|
||||
|
|
|
@ -730,7 +730,7 @@ public class GroupByQueryRunnerTest
|
|||
@Test
|
||||
public void testGroupByWithNullProducingDimExtractionFn()
|
||||
{
|
||||
final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})")
|
||||
final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null)
|
||||
{
|
||||
@Override
|
||||
public byte[] getCacheKey()
|
||||
|
@ -797,7 +797,7 @@ public class GroupByQueryRunnerTest
|
|||
*/
|
||||
public void testGroupByWithEmptyStringProducingDimExtractionFn()
|
||||
{
|
||||
final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})")
|
||||
final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null)
|
||||
{
|
||||
@Override
|
||||
public byte[] getCacheKey()
|
||||
|
|
|
@ -1519,7 +1519,7 @@ public class TopNQueryRunnerTest
|
|||
.dimension(
|
||||
new ExtractionDimensionSpec(
|
||||
QueryRunnerTestHelper.qualityDimension, QueryRunnerTestHelper.qualityDimension,
|
||||
new RegexDimExtractionFn(".(.)"), null
|
||||
new RegexDimExtractionFn(".(.)", false, null), null
|
||||
)
|
||||
)
|
||||
.metric("index")
|
||||
|
@ -1568,7 +1568,7 @@ public class TopNQueryRunnerTest
|
|||
new ExtractionDimensionSpec(
|
||||
QueryRunnerTestHelper.marketDimension,
|
||||
QueryRunnerTestHelper.marketDimension,
|
||||
new RegexDimExtractionFn("(.)"),
|
||||
new RegexDimExtractionFn("(.)", false, null),
|
||||
null
|
||||
)
|
||||
)
|
||||
|
@ -2074,7 +2074,7 @@ public class TopNQueryRunnerTest
|
|||
new ExtractionDimensionSpec(
|
||||
QueryRunnerTestHelper.marketDimension,
|
||||
QueryRunnerTestHelper.marketDimension,
|
||||
new RegexDimExtractionFn("(.)"),
|
||||
new RegexDimExtractionFn("(.)", false, null),
|
||||
null
|
||||
)
|
||||
)
|
||||
|
@ -2128,7 +2128,7 @@ public class TopNQueryRunnerTest
|
|||
new ExtractionDimensionSpec(
|
||||
QueryRunnerTestHelper.marketDimension,
|
||||
QueryRunnerTestHelper.marketDimension,
|
||||
new RegexDimExtractionFn("..(.)"),
|
||||
new RegexDimExtractionFn("..(.)", false, null),
|
||||
null
|
||||
)
|
||||
)
|
||||
|
@ -2182,7 +2182,7 @@ public class TopNQueryRunnerTest
|
|||
new ExtractionDimensionSpec(
|
||||
QueryRunnerTestHelper.marketDimension,
|
||||
QueryRunnerTestHelper.marketDimension,
|
||||
new RegexDimExtractionFn("(.)"),
|
||||
new RegexDimExtractionFn("(.)", false, null),
|
||||
null
|
||||
)
|
||||
)
|
||||
|
@ -2300,7 +2300,7 @@ public class TopNQueryRunnerTest
|
|||
new ExtractionDimensionSpec(
|
||||
QueryRunnerTestHelper.marketDimension,
|
||||
QueryRunnerTestHelper.marketDimension,
|
||||
new RegexDimExtractionFn("(.)"),
|
||||
new RegexDimExtractionFn("(.)", false, null),
|
||||
null
|
||||
)
|
||||
)
|
||||
|
@ -2347,7 +2347,7 @@ public class TopNQueryRunnerTest
|
|||
new ExtractionDimensionSpec(
|
||||
QueryRunnerTestHelper.marketDimension,
|
||||
QueryRunnerTestHelper.marketDimension,
|
||||
new RegexDimExtractionFn("..(.)"),
|
||||
new RegexDimExtractionFn("..(.)", false, null),
|
||||
null
|
||||
)
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue