Add topic name as a column in the Kafka Input format (#14857)

This PR adds a way to store the topic name in a column. Such a column can be used to distinguish messages coming from different topics in multi-topic ingestion.
This commit is contained in:
Abhishek Agarwal 2023-08-21 21:32:34 +05:30 committed by GitHub
parent 92906059d2
commit a38b4f0491
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 65 additions and 17 deletions

View File

@ -40,6 +40,7 @@ public class KafkaInputFormat implements InputFormat
{
private static final String DEFAULT_HEADER_COLUMN_PREFIX = "kafka.header.";
private static final String DEFAULT_TIMESTAMP_COLUMN_NAME = "kafka.timestamp";
private static final String DEFAULT_TOPIC_COLUMN_NAME = "kafka.topic";
private static final String DEFAULT_KEY_COLUMN_NAME = "kafka.key";
public static final String DEFAULT_AUTO_TIMESTAMP_STRING = "__kif_auto_timestamp";
@ -54,6 +55,7 @@ public class KafkaInputFormat implements InputFormat
private final String headerColumnPrefix;
private final String keyColumnName;
private final String timestampColumnName;
private final String topicColumnName;
public KafkaInputFormat(
@JsonProperty("headerFormat") @Nullable KafkaHeaderFormat headerFormat,
@ -61,7 +63,8 @@ public class KafkaInputFormat implements InputFormat
@JsonProperty("valueFormat") InputFormat valueFormat,
@JsonProperty("headerColumnPrefix") @Nullable String headerColumnPrefix,
@JsonProperty("keyColumnName") @Nullable String keyColumnName,
@JsonProperty("timestampColumnName") @Nullable String timestampColumnName
@JsonProperty("timestampColumnName") @Nullable String timestampColumnName,
@JsonProperty("topicColumnName") @Nullable String topicColumnName
)
{
this.headerFormat = headerFormat;
@ -70,6 +73,7 @@ public class KafkaInputFormat implements InputFormat
this.headerColumnPrefix = headerColumnPrefix != null ? headerColumnPrefix : DEFAULT_HEADER_COLUMN_PREFIX;
this.keyColumnName = keyColumnName != null ? keyColumnName : DEFAULT_KEY_COLUMN_NAME;
this.timestampColumnName = timestampColumnName != null ? timestampColumnName : DEFAULT_TIMESTAMP_COLUMN_NAME;
this.topicColumnName = topicColumnName != null ? topicColumnName : DEFAULT_TOPIC_COLUMN_NAME;
}
@Override
@ -116,7 +120,8 @@ public class KafkaInputFormat implements InputFormat
temporaryDirectory
),
keyColumnName,
timestampColumnName
timestampColumnName,
topicColumnName
);
}
@ -161,6 +166,13 @@ public class KafkaInputFormat implements InputFormat
return timestampColumnName;
}
@Nullable
@JsonProperty
public String getTopicColumnName()
{
return topicColumnName;
}
@Override
public boolean equals(Object o)
{
@ -176,14 +188,15 @@ public class KafkaInputFormat implements InputFormat
&& Objects.equals(keyFormat, that.keyFormat)
&& Objects.equals(headerColumnPrefix, that.headerColumnPrefix)
&& Objects.equals(keyColumnName, that.keyColumnName)
&& Objects.equals(timestampColumnName, that.timestampColumnName);
&& Objects.equals(timestampColumnName, that.timestampColumnName)
&& Objects.equals(topicColumnName, that.topicColumnName);
}
@Override
public int hashCode()
{
return Objects.hash(headerFormat, valueFormat, keyFormat,
headerColumnPrefix, keyColumnName, timestampColumnName
headerColumnPrefix, keyColumnName, timestampColumnName, topicColumnName
);
}
}

View File

@ -56,6 +56,7 @@ public class KafkaInputReader implements InputEntityReader
private final InputEntityReader valueParser;
private final String keyColumnName;
private final String timestampColumnName;
private final String topicColumnName;
/**
*
@ -74,7 +75,8 @@ public class KafkaInputReader implements InputEntityReader
@Nullable Function<KafkaRecordEntity, InputEntityReader> keyParserSupplier,
InputEntityReader valueParser,
String keyColumnName,
String timestampColumnName
String timestampColumnName,
String topicColumnName
)
{
this.inputRowSchema = inputRowSchema;
@ -84,6 +86,7 @@ public class KafkaInputReader implements InputEntityReader
this.valueParser = valueParser;
this.keyColumnName = keyColumnName;
this.timestampColumnName = timestampColumnName;
this.topicColumnName = topicColumnName;
}
@Override
@ -128,6 +131,9 @@ public class KafkaInputReader implements InputEntityReader
// the header list
mergedHeaderMap.putIfAbsent(timestampColumnName, record.getRecord().timestamp());
// Add the Kafka record's topic to the merged map, only if the key doesn't already exist
mergedHeaderMap.putIfAbsent(topicColumnName, record.getRecord().topic());
return mergedHeaderMap;
}

View File

@ -59,6 +59,7 @@ public class KafkaInputFormatTest
{
private KafkaRecordEntity inputEntity;
private final long timestamp = DateTimes.of("2021-06-24").getMillis();
private static final String TOPIC = "sample";
private static final Iterable<Header> SAMPLE_HEADERS = ImmutableList.of(
new Header()
{
@ -126,7 +127,8 @@ public class KafkaInputFormatTest
),
"kafka.newheader.",
"kafka.newkey.key",
"kafka.newts.timestamp"
"kafka.newts.timestamp",
"kafka.newtopic.topic"
);
}
@ -166,7 +168,8 @@ public class KafkaInputFormatTest
),
"kafka.newheader.",
"kafka.newkey.key",
"kafka.newts.timestamp"
"kafka.newts.timestamp",
"kafka.newtopic.topic"
);
Assert.assertEquals(format, kif);
@ -209,7 +212,8 @@ public class KafkaInputFormatTest
"foo",
"kafka.newheader.encoding",
"kafka.newheader.kafkapkc",
"kafka.newts.timestamp"
"kafka.newts.timestamp",
"kafka.newtopic.topic"
)
)
),
@ -231,7 +235,8 @@ public class KafkaInputFormatTest
"foo",
"kafka.newheader.encoding",
"kafka.newheader.kafkapkc",
"kafka.newts.timestamp"
"kafka.newts.timestamp",
"kafka.newtopic.topic"
),
row.getDimensions()
);
@ -254,6 +259,10 @@ public class KafkaInputFormatTest
String.valueOf(DateTimes.of("2021-06-24").getMillis()),
Iterables.getOnlyElement(row.getDimension("kafka.newts.timestamp"))
);
Assert.assertEquals(
TOPIC,
Iterables.getOnlyElement(row.getDimension("kafka.newtopic.topic"))
);
Assert.assertEquals(
"2021-06-25",
Iterables.getOnlyElement(row.getDimension("timestamp"))
@ -302,7 +311,8 @@ public class KafkaInputFormatTest
"foo",
"kafka.newheader.encoding",
"kafka.newheader.kafkapkc",
"kafka.newts.timestamp"
"kafka.newts.timestamp",
"kafka.newtopic.topic"
)
)
),
@ -478,7 +488,7 @@ public class KafkaInputFormatTest
null, null, false, //make sure JsonReader is used
false, false
),
"kafka.newheader.", "kafka.newkey.", "kafka.newts."
"kafka.newheader.", "kafka.newkey.", "kafka.newts.", "kafka.newtopic."
);
final InputEntityReader reader = localFormat.createReader(
@ -489,7 +499,8 @@ public class KafkaInputFormatTest
ImmutableList.of(
"bar",
"foo",
"kafka.newts.timestamp"
"kafka.newts.timestamp",
"kafka.newtopic.topic"
)
)
),
@ -567,7 +578,8 @@ public class KafkaInputFormatTest
"foo",
"kafka.newheader.encoding",
"kafka.newheader.kafkapkc",
"kafka.newts.timestamp"
"kafka.newts.timestamp",
"kafka.newtopic.topic"
)
)
),
@ -613,6 +625,10 @@ public class KafkaInputFormatTest
String.valueOf(DateTimes.of("2021-06-24").getMillis()),
Iterables.getOnlyElement(row.getDimension("kafka.newts.timestamp"))
);
Assert.assertEquals(
TOPIC,
Iterables.getOnlyElement(row.getDimension("kafka.newtopic.topic"))
);
Assert.assertEquals(String.valueOf(i), Iterables.getOnlyElement(row.getDimension("kafka.newheader.indexH")));
@ -669,7 +685,8 @@ public class KafkaInputFormatTest
"foo",
"kafka.newheader.encoding",
"kafka.newheader.kafkapkc",
"kafka.newts.timestamp"
"kafka.newts.timestamp",
"kafka.newtopic.topic"
)
)
),
@ -683,7 +700,8 @@ public class KafkaInputFormatTest
while (iterator.hasNext()) {
Throwable t = Assert.assertThrows(ParseException.class, () -> iterator.next());
Assert.assertEquals(
"Timestamp[null] is unparseable! Event: {foo=x, kafka.newts.timestamp=1624492800000, kafka.newkey.key=sampleKey, root_baz=4, bar=null, kafka...",
"Timestamp[null] is unparseable! Event: {kafka.newtopic.topic=sample, foo=x, kafka.newts"
+ ".timestamp=1624492800000, kafka.newkey.key=sampleKey...",
t.getMessage()
);
}
@ -733,6 +751,7 @@ public class KafkaInputFormatTest
final InputRow row = iterator.next();
Assert.assertEquals(
Arrays.asList(
"kafka.newtopic.topic",
"foo",
"kafka.newts.timestamp",
"kafka.newkey.key",
@ -767,6 +786,10 @@ public class KafkaInputFormatTest
String.valueOf(DateTimes.of("2021-06-24").getMillis()),
Iterables.getOnlyElement(row.getDimension("kafka.newts.timestamp"))
);
Assert.assertEquals(
TOPIC,
Iterables.getOnlyElement(row.getDimension("kafka.newtopic.topic"))
);
Assert.assertEquals(
"2021-06-25",
Iterables.getOnlyElement(row.getDimension("timestamp"))
@ -834,6 +857,7 @@ public class KafkaInputFormatTest
Arrays.asList(
"bar",
"kafka.newheader.kafkapkc",
"kafka.newtopic.topic",
"foo",
"kafka.newts.timestamp",
"kafka.newkey.key",
@ -866,6 +890,10 @@ public class KafkaInputFormatTest
String.valueOf(DateTimes.of("2021-06-24").getMillis()),
Iterables.getOnlyElement(row.getDimension("kafka.newts.timestamp"))
);
Assert.assertEquals(
TOPIC,
Iterables.getOnlyElement(row.getDimension("kafka.newtopic.topic"))
);
Assert.assertEquals(
"2021-06-25",
Iterables.getOnlyElement(row.getDimension("timestamp"))
@ -889,7 +917,7 @@ public class KafkaInputFormatTest
{
return new KafkaRecordEntity(
new ConsumerRecord<>(
"sample",
TOPIC,
0,
0,
timestamp,

View File

@ -171,7 +171,7 @@ public class KafkaIndexTaskTest extends SeekableStreamIndexTaskTestBase
new KafkaStringHeaderFormat(null),
INPUT_FORMAT,
INPUT_FORMAT,
"kafka.testheader.", "kafka.key", "kafka.timestamp"
"kafka.testheader.", "kafka.key", "kafka.timestamp", "kafka.topic"
);
private static TestingCluster zkServer;

View File

@ -277,6 +277,7 @@ public class KafkaSamplerSpecTest extends InitializedNullHandlingTest
new JsonInputFormat(JSONPathSpec.DEFAULT, null, null, null, null),
null,
null,
null,
null
),