mirror of https://github.com/apache/druid.git
Merge pull request #2075 from jon-wei/regex_extract
Configurable value replacement on match failure for RegexExtractionFn
This commit is contained in:
commit
e7f06cf61c
|
@ -49,12 +49,23 @@ Returns the first matching group for the given regular expression.
|
||||||
If there is no match, it returns the dimension value as is.
|
If there is no match, it returns the dimension value as is.
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{ "type" : "regex", "expr" : <regular_expression> }
|
{
|
||||||
|
"type" : "regex", "expr" : <regular_expression>,
|
||||||
|
"replaceMissingValues" : true,
|
||||||
|
"replaceMissingValuesWith" : "foobar"
|
||||||
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
For example, using `"expr" : "(\\w\\w\\w).*"` will transform
|
For example, using `"expr" : "(\\w\\w\\w).*"` will transform
|
||||||
`'Monday'`, `'Tuesday'`, `'Wednesday'` into `'Mon'`, `'Tue'`, `'Wed'`.
|
`'Monday'`, `'Tuesday'`, `'Wednesday'` into `'Mon'`, `'Tue'`, `'Wed'`.
|
||||||
|
|
||||||
|
If the `replaceMissingValues` property is true, the extraction function will transform dimension values that do not match the regex pattern to a user-specified String. Default value is `false`.
|
||||||
|
|
||||||
|
The `replaceMissingValuesWith` property sets the String that unmatched dimension values will be replaced with, if `replaceMissingValues` is true. If `replaceMissingValuesWith` is not specified, or is an empty string, unmatched dimension values will be replaced with null.
|
||||||
|
|
||||||
|
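For example, if `expr` is `"^(a\\w+)"` in the example JSON above, a regex that matches values starting with the letter `a`, the extraction function will convert a dimension value like `banana` to `foobar`. Note that the pattern is searched for anywhere in the value, so an unanchored `"(a\\w+)"` would instead match the `anana` inside `banana`.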
|
||||||
|
|
||||||
|
|
||||||
### Partial Extraction Function
|
### Partial Extraction Function
|
||||||
|
|
||||||
Returns the dimension value unchanged if the regular expression matches, otherwise returns null.
|
Returns the dimension value unchanged if the regular expression matches, otherwise returns null.
|
||||||
|
|
|
@ -34,28 +34,53 @@ import java.util.regex.Pattern;
|
||||||
public class RegexDimExtractionFn extends DimExtractionFn
|
public class RegexDimExtractionFn extends DimExtractionFn
|
||||||
{
|
{
|
||||||
private static final byte CACHE_TYPE_ID = 0x1;
|
private static final byte CACHE_TYPE_ID = 0x1;
|
||||||
|
private static final byte CACHE_KEY_SEPARATOR = (byte) 0xFF;
|
||||||
|
|
||||||
private final String expr;
|
private final String expr;
|
||||||
private final Pattern pattern;
|
private final Pattern pattern;
|
||||||
|
private final boolean replaceMissingValues;
|
||||||
|
private final String replaceMissingValuesWith;
|
||||||
|
|
||||||
@JsonCreator
|
@JsonCreator
|
||||||
public RegexDimExtractionFn(
|
public RegexDimExtractionFn(
|
||||||
@JsonProperty("expr") String expr
|
@JsonProperty("expr") String expr,
|
||||||
|
@JsonProperty("replaceMissingValues") Boolean replaceMissingValues,
|
||||||
|
@JsonProperty("replaceMissingValuesWith") String replaceMissingValuesWith
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
Preconditions.checkNotNull(expr, "expr must not be null");
|
Preconditions.checkNotNull(expr, "expr must not be null");
|
||||||
|
|
||||||
this.expr = expr;
|
this.expr = expr;
|
||||||
this.pattern = Pattern.compile(expr);
|
this.pattern = Pattern.compile(expr);
|
||||||
|
this.replaceMissingValues = replaceMissingValues == null ? false : replaceMissingValues;
|
||||||
|
this.replaceMissingValuesWith = replaceMissingValuesWith;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public byte[] getCacheKey()
|
public byte[] getCacheKey()
|
||||||
{
|
{
|
||||||
byte[] exprBytes = StringUtils.toUtf8(expr);
|
byte[] exprBytes = StringUtils.toUtf8(expr);
|
||||||
return ByteBuffer.allocate(1 + exprBytes.length)
|
byte[] replaceBytes = replaceMissingValues ? new byte[]{1} : new byte[]{0};
|
||||||
|
byte[] replaceStrBytes;
|
||||||
|
if (replaceMissingValuesWith == null) {
|
||||||
|
replaceStrBytes = new byte[]{};
|
||||||
|
} else {
|
||||||
|
replaceStrBytes = StringUtils.toUtf8(replaceMissingValuesWith);
|
||||||
|
}
|
||||||
|
|
||||||
|
int totalLen = 1
|
||||||
|
+ exprBytes.length
|
||||||
|
+ replaceBytes.length
|
||||||
|
+ replaceStrBytes.length; // fields
|
||||||
|
totalLen += 2; // separators
|
||||||
|
|
||||||
|
return ByteBuffer.allocate(totalLen)
|
||||||
.put(CACHE_TYPE_ID)
|
.put(CACHE_TYPE_ID)
|
||||||
.put(exprBytes)
|
.put(exprBytes)
|
||||||
|
.put(CACHE_KEY_SEPARATOR)
|
||||||
|
.put(replaceStrBytes)
|
||||||
|
.put(CACHE_KEY_SEPARATOR)
|
||||||
|
.put(replaceBytes)
|
||||||
.array();
|
.array();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -65,8 +90,14 @@ public class RegexDimExtractionFn extends DimExtractionFn
|
||||||
if (dimValue == null) {
|
if (dimValue == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
String retVal;
|
||||||
Matcher matcher = pattern.matcher(dimValue);
|
Matcher matcher = pattern.matcher(dimValue);
|
||||||
return Strings.emptyToNull(matcher.find() ? matcher.group(1) : dimValue);
|
if (matcher.find()) {
|
||||||
|
retVal = matcher.group(1);
|
||||||
|
} else {
|
||||||
|
retVal = replaceMissingValues ? replaceMissingValuesWith : dimValue;
|
||||||
|
}
|
||||||
|
return Strings.emptyToNull(retVal);
|
||||||
}
|
}
|
||||||
|
|
||||||
@JsonProperty("expr")
|
@JsonProperty("expr")
|
||||||
|
@ -75,6 +106,18 @@ public class RegexDimExtractionFn extends DimExtractionFn
|
||||||
return expr;
|
return expr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@JsonProperty("replaceMissingValues")
|
||||||
|
public boolean isReplaceMissingValues()
|
||||||
|
{
|
||||||
|
return replaceMissingValues;
|
||||||
|
}
|
||||||
|
|
||||||
|
@JsonProperty("replaceMissingValuesWith")
|
||||||
|
public String getReplaceMissingValuesWith()
|
||||||
|
{
|
||||||
|
return replaceMissingValuesWith;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean preservesOrdering()
|
public boolean preservesOrdering()
|
||||||
{
|
{
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
package io.druid.query.extraction;
|
package io.druid.query.extraction;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.google.common.collect.ImmutableList;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import io.druid.jackson.DefaultObjectMapper;
|
import io.druid.jackson.DefaultObjectMapper;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
|
@ -55,51 +56,50 @@ public class RegexDimExtractionFnTest
|
||||||
public void testPathExtraction()
|
public void testPathExtraction()
|
||||||
{
|
{
|
||||||
String regex = "/([^/]+)/";
|
String regex = "/([^/]+)/";
|
||||||
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
|
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
|
||||||
Set<String> extracted = Sets.newHashSet();
|
Set<String> extracted = Sets.newLinkedHashSet();
|
||||||
|
|
||||||
for (String path : paths) {
|
for (String path : paths) {
|
||||||
extracted.add(extractionFn.apply(path));
|
extracted.add(extractionFn.apply(path));
|
||||||
}
|
}
|
||||||
|
|
||||||
Assert.assertEquals(2, extracted.size());
|
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("druid", "dash"));
|
||||||
Assert.assertTrue(extracted.contains("druid"));
|
Assert.assertEquals(expected, extracted);
|
||||||
Assert.assertTrue(extracted.contains("dash"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testDeeperPathExtraction()
|
public void testDeeperPathExtraction()
|
||||||
{
|
{
|
||||||
String regex = "^/([^/]+/[^/]+)(/|$)";
|
String regex = "^/([^/]+/[^/]+)(/|$)";
|
||||||
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
|
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
|
||||||
Set<String> extracted = Sets.newHashSet();
|
Set<String> extracted = Sets.newLinkedHashSet();
|
||||||
|
|
||||||
for (String path : paths) {
|
for (String path : paths) {
|
||||||
extracted.add(extractionFn.apply(path));
|
extracted.add(extractionFn.apply(path));
|
||||||
}
|
}
|
||||||
|
|
||||||
Assert.assertEquals(4, extracted.size());
|
Set<String> expected = Sets.newLinkedHashSet(
|
||||||
Assert.assertTrue(extracted.contains("druid/prod"));
|
ImmutableList.of(
|
||||||
Assert.assertTrue(extracted.contains("druid/demo"));
|
"druid/prod", "druid/demo",
|
||||||
Assert.assertTrue(extracted.contains("dash/aloe"));
|
"dash/aloe", "dash/baloo"
|
||||||
Assert.assertTrue(extracted.contains("dash/baloo"));
|
)
|
||||||
|
);
|
||||||
|
Assert.assertEquals(expected, extracted);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testStringExtraction()
|
public void testStringExtraction()
|
||||||
{
|
{
|
||||||
String regex = "(.)";
|
String regex = "(.)";
|
||||||
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
|
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
|
||||||
Set<String> extracted = Sets.newHashSet();
|
Set<String> extracted = Sets.newLinkedHashSet();
|
||||||
|
|
||||||
for (String testString : testStrings) {
|
for (String testString : testStrings) {
|
||||||
extracted.add(extractionFn.apply(testString));
|
extracted.add(extractionFn.apply(testString));
|
||||||
}
|
}
|
||||||
|
|
||||||
Assert.assertEquals(3, extracted.size());
|
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("a", "b", "c"));
|
||||||
Assert.assertTrue(extracted.contains("a"));
|
Assert.assertEquals(expected, extracted);
|
||||||
Assert.assertTrue(extracted.contains("b"));
|
|
||||||
Assert.assertTrue(extracted.contains("c"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -107,7 +107,7 @@ public class RegexDimExtractionFnTest
|
||||||
public void testNullAndEmpty()
|
public void testNullAndEmpty()
|
||||||
{
|
{
|
||||||
String regex = "(.*)/.*/.*";
|
String regex = "(.*)/.*/.*";
|
||||||
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
|
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
|
||||||
// no match, map empty input value to null
|
// no match, map empty input value to null
|
||||||
Assert.assertEquals(null, extractionFn.apply(""));
|
Assert.assertEquals(null, extractionFn.apply(""));
|
||||||
// null value, returns null
|
// null value, returns null
|
||||||
|
@ -116,14 +116,54 @@ public class RegexDimExtractionFnTest
|
||||||
Assert.assertEquals(null, extractionFn.apply("/a/b"));
|
Assert.assertEquals(null, extractionFn.apply("/a/b"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMissingValueReplacement()
|
||||||
|
{
|
||||||
|
String regex = "(a\\w*)";
|
||||||
|
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, true, "foobar");
|
||||||
|
Set<String> extracted = Sets.newLinkedHashSet();
|
||||||
|
|
||||||
|
for (String testString : testStrings) {
|
||||||
|
extracted.add(extractionFn.apply(testString));
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum", "foobar"));
|
||||||
|
Assert.assertEquals(expected, extracted);
|
||||||
|
|
||||||
|
byte[] cacheKey = extractionFn.getCacheKey();
|
||||||
|
byte[] expectedCacheKey = new byte[]{
|
||||||
|
0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF,
|
||||||
|
0x66, 0x6F, 0x6F, 0x62, 0x61, 0x72, (byte) 0xFF, 0x01
|
||||||
|
};
|
||||||
|
Assert.assertArrayEquals(expectedCacheKey, cacheKey);
|
||||||
|
|
||||||
|
ExtractionFn nullExtractionFn = new RegexDimExtractionFn(regex, true, null);
|
||||||
|
Set<String> extracted2 = Sets.newLinkedHashSet();
|
||||||
|
|
||||||
|
for (String testString : testStrings) {
|
||||||
|
extracted2.add(nullExtractionFn.apply(testString));
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<String> expected2 = Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum"));
|
||||||
|
expected2.add(null);
|
||||||
|
Assert.assertEquals(expected2, extracted2);
|
||||||
|
|
||||||
|
cacheKey = nullExtractionFn.getCacheKey();
|
||||||
|
expectedCacheKey = new byte[]{0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF, (byte) 0xFF, 0x01};
|
||||||
|
Assert.assertArrayEquals(expectedCacheKey, cacheKey);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSerde() throws Exception
|
public void testSerde() throws Exception
|
||||||
{
|
{
|
||||||
final ObjectMapper objectMapper = new DefaultObjectMapper();
|
final ObjectMapper objectMapper = new DefaultObjectMapper();
|
||||||
final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" }";
|
final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" , " +
|
||||||
|
"\"replaceMissingValues\": true, \"replaceMissingValuesWith\":\"foobar\"}";
|
||||||
RegexDimExtractionFn extractionFn = (RegexDimExtractionFn) objectMapper.readValue(json, ExtractionFn.class);
|
RegexDimExtractionFn extractionFn = (RegexDimExtractionFn) objectMapper.readValue(json, ExtractionFn.class);
|
||||||
|
|
||||||
Assert.assertEquals(".(...)?", extractionFn.getExpr());
|
Assert.assertEquals(".(...)?", extractionFn.getExpr());
|
||||||
|
Assert.assertTrue(extractionFn.isReplaceMissingValues());
|
||||||
|
Assert.assertEquals("foobar", extractionFn.getReplaceMissingValuesWith());
|
||||||
|
|
||||||
// round trip
|
// round trip
|
||||||
Assert.assertEquals(
|
Assert.assertEquals(
|
||||||
|
|
|
@ -730,7 +730,7 @@ public class GroupByQueryRunnerTest
|
||||||
@Test
|
@Test
|
||||||
public void testGroupByWithNullProducingDimExtractionFn()
|
public void testGroupByWithNullProducingDimExtractionFn()
|
||||||
{
|
{
|
||||||
final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})")
|
final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null)
|
||||||
{
|
{
|
||||||
@Override
|
@Override
|
||||||
public byte[] getCacheKey()
|
public byte[] getCacheKey()
|
||||||
|
@ -797,7 +797,7 @@ public class GroupByQueryRunnerTest
|
||||||
*/
|
*/
|
||||||
public void testGroupByWithEmptyStringProducingDimExtractionFn()
|
public void testGroupByWithEmptyStringProducingDimExtractionFn()
|
||||||
{
|
{
|
||||||
final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})")
|
final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null)
|
||||||
{
|
{
|
||||||
@Override
|
@Override
|
||||||
public byte[] getCacheKey()
|
public byte[] getCacheKey()
|
||||||
|
|
|
@ -1519,7 +1519,7 @@ public class TopNQueryRunnerTest
|
||||||
.dimension(
|
.dimension(
|
||||||
new ExtractionDimensionSpec(
|
new ExtractionDimensionSpec(
|
||||||
QueryRunnerTestHelper.qualityDimension, QueryRunnerTestHelper.qualityDimension,
|
QueryRunnerTestHelper.qualityDimension, QueryRunnerTestHelper.qualityDimension,
|
||||||
new RegexDimExtractionFn(".(.)"), null
|
new RegexDimExtractionFn(".(.)", false, null), null
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
.metric("index")
|
.metric("index")
|
||||||
|
@ -1568,7 +1568,7 @@ public class TopNQueryRunnerTest
|
||||||
new ExtractionDimensionSpec(
|
new ExtractionDimensionSpec(
|
||||||
QueryRunnerTestHelper.marketDimension,
|
QueryRunnerTestHelper.marketDimension,
|
||||||
QueryRunnerTestHelper.marketDimension,
|
QueryRunnerTestHelper.marketDimension,
|
||||||
new RegexDimExtractionFn("(.)"),
|
new RegexDimExtractionFn("(.)", false, null),
|
||||||
null
|
null
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -2074,7 +2074,7 @@ public class TopNQueryRunnerTest
|
||||||
new ExtractionDimensionSpec(
|
new ExtractionDimensionSpec(
|
||||||
QueryRunnerTestHelper.marketDimension,
|
QueryRunnerTestHelper.marketDimension,
|
||||||
QueryRunnerTestHelper.marketDimension,
|
QueryRunnerTestHelper.marketDimension,
|
||||||
new RegexDimExtractionFn("(.)"),
|
new RegexDimExtractionFn("(.)", false, null),
|
||||||
null
|
null
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -2128,7 +2128,7 @@ public class TopNQueryRunnerTest
|
||||||
new ExtractionDimensionSpec(
|
new ExtractionDimensionSpec(
|
||||||
QueryRunnerTestHelper.marketDimension,
|
QueryRunnerTestHelper.marketDimension,
|
||||||
QueryRunnerTestHelper.marketDimension,
|
QueryRunnerTestHelper.marketDimension,
|
||||||
new RegexDimExtractionFn("..(.)"),
|
new RegexDimExtractionFn("..(.)", false, null),
|
||||||
null
|
null
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -2182,7 +2182,7 @@ public class TopNQueryRunnerTest
|
||||||
new ExtractionDimensionSpec(
|
new ExtractionDimensionSpec(
|
||||||
QueryRunnerTestHelper.marketDimension,
|
QueryRunnerTestHelper.marketDimension,
|
||||||
QueryRunnerTestHelper.marketDimension,
|
QueryRunnerTestHelper.marketDimension,
|
||||||
new RegexDimExtractionFn("(.)"),
|
new RegexDimExtractionFn("(.)", false, null),
|
||||||
null
|
null
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -2300,7 +2300,7 @@ public class TopNQueryRunnerTest
|
||||||
new ExtractionDimensionSpec(
|
new ExtractionDimensionSpec(
|
||||||
QueryRunnerTestHelper.marketDimension,
|
QueryRunnerTestHelper.marketDimension,
|
||||||
QueryRunnerTestHelper.marketDimension,
|
QueryRunnerTestHelper.marketDimension,
|
||||||
new RegexDimExtractionFn("(.)"),
|
new RegexDimExtractionFn("(.)", false, null),
|
||||||
null
|
null
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -2347,7 +2347,7 @@ public class TopNQueryRunnerTest
|
||||||
new ExtractionDimensionSpec(
|
new ExtractionDimensionSpec(
|
||||||
QueryRunnerTestHelper.marketDimension,
|
QueryRunnerTestHelper.marketDimension,
|
||||||
QueryRunnerTestHelper.marketDimension,
|
QueryRunnerTestHelper.marketDimension,
|
||||||
new RegexDimExtractionFn("..(.)"),
|
new RegexDimExtractionFn("..(.)", false, null),
|
||||||
null
|
null
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue