Mirror of https://github.com/apache/druid.git (synced 2025-02-26 13:16:30 +00:00)
Fix create segment phase of batch ingestion to take segment identifiers that have a non UTC interval… (#11635)
* Fix create segment phase of batch ingestion to take segment identifiers with non UTC time zones
* Fix comment and LGTM forbidden error
parent adeae3960f
commit 2405a9f25e
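For context on what "a non UTC interval" means here: Joda-Time intervals carry a chronology (and therefore a time zone) in addition to their start and end instants, and Interval.equals() compares the chronology as well as the millis. The following is a minimal, standalone Joda-Time sketch of that behavior, not Druid code; the class name is only for the sketch, and the interval string mirrors the one used in the test added by this commit:

    import org.joda.time.DateTimeZone;
    import org.joda.time.Interval;
    import org.joda.time.chrono.ISOChronology;

    public class NonUtcIntervalDemo
    {
      public static void main(String[] args)
      {
        // The same absolute span of time, expressed once with the Asia/Seoul chronology
        // (as the new test does) and once with the default UTC chronology.
        Interval seoul = new Interval(
            "2021-06-27T00:00:00.000+09:00/2021-06-28T00:00:00.000+09:00",
            ISOChronology.getInstance(DateTimeZone.forID("Asia/Seoul"))
        );
        Interval utc = new Interval(seoul.getStartMillis(), seoul.getEndMillis(), ISOChronology.getInstanceUTC());

        System.out.println(seoul.getStartMillis() == utc.getStartMillis()); // true: identical instants
        System.out.println(seoul.equals(utc));                              // false: the chronologies differ
      }
    }

Segment identifiers built from such intervals therefore need to be carried through the create-segment and push phases as-is, which is what the hunks below do by passing the caller's identifier through instead of re-deriving it.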
@@ -84,7 +84,7 @@ import java.util.stream.Collectors;
  * reasons, the code for creating segments was all handled by the same code path in that class. The code
  * was correct but inefficient for batch ingestion from a memory perspective. If the input file being processed
  * by batch ingestion had enough sinks & hydrants produced then it may run out of memory either in the
- * hydrant creation phase (append) of this class or in the merge hydrants phase. Therefore a new class,
+ * hydrant creation phase (append) of this class or in the merge hydrants phase. Therefore, a new class,
  * {@code BatchAppenderator}, this class, was created to specialize in batch ingestion and the old class
  * for stream ingestion was renamed to {@link StreamAppenderator}.
  * <p>
@@ -321,10 +321,10 @@ public class BatchAppenderator implements Appenderator
     return new AppenderatorAddResult(identifier, sinksMetadata.get(identifier).numRowsInSegment, false);
   }
 
-  @Override
   /**
    * Returns all active segments regardless whether they are in memory or persisted
    */
+  @Override
   public List<SegmentIdWithShardSpec> getSegments()
   {
     return ImmutableList.copyOf(sinksMetadata.keySet());
@@ -568,9 +568,9 @@ public class BatchAppenderator implements Appenderator
 
     log.info("Preparing to push...");
 
-    // get the dirs for the identfiers:
-    List<File> identifiersDirs = new ArrayList<>();
+    // Traverse identifiers, load their sink, and push it:
     int totalHydrantsMerged = 0;
+    final List<DataSegment> dataSegments = new ArrayList<>();
     for (SegmentIdWithShardSpec identifier : identifiers) {
       SinkMetadata sm = sinksMetadata.get(identifier);
       if (sm == null) {
@@ -580,34 +580,28 @@ public class BatchAppenderator implements Appenderator
       if (persistedDir == null) {
         throw new ISE("Sink for identifier[%s] not found in local file system", identifier);
       }
-      identifiersDirs.add(persistedDir);
       totalHydrantsMerged += sm.getNumHydrants();
-    }
-
-    // push all sinks for identifiers:
-    final List<DataSegment> dataSegments = new ArrayList<>();
-    for (File identifier : identifiersDirs) {
 
       // retrieve sink from disk:
-      Pair<SegmentIdWithShardSpec, Sink> identifiersAndSinks;
+      Sink sinkForIdentifier;
       try {
-        identifiersAndSinks = getIdentifierAndSinkForPersistedFile(identifier);
+        sinkForIdentifier = getSinkForIdentifierPath(identifier, persistedDir);
       }
       catch (IOException e) {
-        throw new ISE(e, "Failed to retrieve sinks for identifier[%s]", identifier);
+        throw new ISE(e, "Failed to retrieve sinks for identifier[%s] in path[%s]", identifier, persistedDir);
      }
 
-      // push it:
+      // push sink:
       final DataSegment dataSegment = mergeAndPush(
-          identifiersAndSinks.lhs,
-          identifiersAndSinks.rhs
+          identifier,
+          sinkForIdentifier
       );
 
       // record it:
       if (dataSegment != null) {
         dataSegments.add(dataSegment);
       } else {
-        log.warn("mergeAndPush[%s] returned null, skipping.", identifiersAndSinks.lhs);
+        log.warn("mergeAndPush[%s] returned null, skipping.", identifier);
       }
 
     }
@@ -862,15 +856,9 @@ public class BatchAppenderator implements Appenderator
     return retVal;
   }
 
-  private Pair<SegmentIdWithShardSpec, Sink> getIdentifierAndSinkForPersistedFile(File identifierPath)
+  private Sink getSinkForIdentifierPath(SegmentIdWithShardSpec identifier, File identifierPath)
       throws IOException
   {
-
-    final SegmentIdWithShardSpec identifier = objectMapper.readValue(
-        new File(identifierPath, IDENTIFIER_FILE_NAME),
-        SegmentIdWithShardSpec.class
-    );
-
     // To avoid reading and listing of "merged" dir and other special files
     final File[] sinkFiles = identifierPath.listFiles(
         (dir, fileName) -> !(Ints.tryParse(fileName) == null)
@@ -901,7 +889,7 @@ public class BatchAppenderator implements Appenderator
       );
     }
 
-    Sink currSink = new Sink(
+    Sink retVal = new Sink(
         identifier.getInterval(),
         schema,
         identifier.getShardSpec(),
@@ -912,8 +900,8 @@ public class BatchAppenderator implements Appenderator
         null,
         hydrants
     );
-    currSink.finishWriting(); // this sink is not writable
-    return new Pair<>(identifier, currSink);
+    retVal.finishWriting(); // this sink is not writable
+    return retVal;
   }
 
   // This function does not remove the sink from its tracking Map (sinks), the caller is responsible for that
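The hunks above fold the old two-pass push (collect each sink's persisted directory, then re-read its identifier from the on-disk identifier file) into a single pass that hands the original SegmentIdWithShardSpec and its persisted directory straight to getSinkForIdentifierPath. As a rough illustration of why a re-derived, UTC-normalized key is risky when sinks are tracked in maps keyed by identifier, here is a standalone Joda-Time sketch; it is not the Druid classes involved and not necessarily the exact failure mode this commit fixes:

    import java.util.HashMap;
    import java.util.Map;

    import org.joda.time.DateTimeZone;
    import org.joda.time.Interval;
    import org.joda.time.chrono.ISOChronology;

    public class IntervalKeyLookupDemo
    {
      public static void main(String[] args)
      {
        // Key a map with an interval whose chronology is Asia/Seoul.
        Interval seoulKey = new Interval(
            "2021-06-27T00:00:00.000+09:00/2021-06-28T00:00:00.000+09:00",
            ISOChronology.getInstance(DateTimeZone.forID("Asia/Seoul"))
        );
        Map<Interval, String> sinks = new HashMap<>();
        sinks.put(seoulKey, "sink for the Seoul day of 2021-06-27");

        // "The same" interval rebuilt with the UTC chronology, for example after a
        // serialization round trip. Interval.equals() and hashCode() include the
        // chronology, so the lookup misses even though the instants match.
        Interval rebuiltKey = new Interval(seoulKey.getStartMillis(), seoulKey.getEndMillis(), ISOChronology.getInstanceUTC());

        System.out.println(sinks.containsKey(seoulKey));   // true
        System.out.println(sinks.containsKey(rebuiltKey)); // false
      }
    }

The remaining hunks are against BatchAppenderatorTest: three new Joda imports and a test, testPeriodGranularityNonUTCIngestion, that ingests and pushes a row under a segment identifier whose interval uses the Asia/Seoul chronology.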
|
@@ -31,6 +31,9 @@ import org.apache.druid.segment.incremental.RowIngestionMeters;
 import org.apache.druid.segment.incremental.SimpleRowIngestionMeters;
 import org.apache.druid.testing.InitializedNullHandlingTest;
 import org.apache.druid.timeline.partition.LinearShardSpec;
+import org.joda.time.DateTime;
+import org.joda.time.Interval;
+import org.joda.time.chrono.ISOChronology;
 import org.junit.Assert;
 import org.junit.Test;
 
@@ -125,6 +128,72 @@ public class BatchAppenderatorTest extends InitializedNullHandlingTest
     }
   }
 
+  /**
+   * Test the case when a segment identifier contains non UTC timestamps in its interval. This can happen
+   * when a custom segment granularity for an interval with a non UTC Chronlogy is created by
+   * {@link org.apache.druid.java.util.common.granularity.PeriodGranularity#bucketStart(DateTime)}
+   */
+  @Test
+  public void testPeriodGranularityNonUTCIngestion() throws Exception
+  {
+    try (final BatchAppenderatorTester tester = new BatchAppenderatorTester(1, true)) {
+      final Appenderator appenderator = tester.getAppenderator();
+
+      // startJob
+      Assert.assertNull(appenderator.startJob());
+
+      // getDataSource
+      Assert.assertEquals(BatchAppenderatorTester.DATASOURCE, appenderator.getDataSource());
+
+      // Create a segment identifier with a non-utc interval
+      SegmentIdWithShardSpec segmentIdWithNonUTCTime =
+          createNonUTCSegmentId("2021-06-27T00:00:00.000+09:00/2021-06-28T00:00:00.000+09:00",
+                                "A", 0); // should be in seg_0
+
+      Assert.assertEquals(
+          1,
+          appenderator.add(segmentIdWithNonUTCTime, createInputRow("2021-06-27T00:01:11.080Z", "foo", 1), null)
+                      .getNumRowsInSegment()
+      );
+
+      // getSegments
+      Assert.assertEquals(
+          Collections.singletonList(segmentIdWithNonUTCTime),
+          appenderator.getSegments().stream().sorted().collect(Collectors.toList())
+      );
+
+
+      // since we just added one row and the max rows in memory is one, all the segments (sinks etc)
+      // above should be cleared now
+      Assert.assertEquals(
+          Collections.emptyList(),
+          ((BatchAppenderator) appenderator).getInMemorySegments().stream().sorted().collect(Collectors.toList())
+      );
+
+
+      // push all
+      final SegmentsAndCommitMetadata segmentsAndCommitMetadata = appenderator.push(
+          appenderator.getSegments(),
+          null,
+          false
+      ).get();
+      Assert.assertEquals(
+          Collections.singletonList(segmentIdWithNonUTCTime),
+          Lists.transform(
+              segmentsAndCommitMetadata.getSegments(),
+              SegmentIdWithShardSpec::fromDataSegment
+          ).stream().sorted().collect(Collectors.toList())
+      );
+      Assert.assertEquals(
+          tester.getPushedSegments().stream().sorted().collect(Collectors.toList()),
+          segmentsAndCommitMetadata.getSegments().stream().sorted().collect(Collectors.toList())
+      );
+
+      appenderator.close();
+      Assert.assertTrue(appenderator.getSegments().isEmpty());
+    }
+  }
+
   @Test
   public void testSimpleIngestionWithFallbackCodePath() throws Exception
   {
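The new test's javadoc points at PeriodGranularity#bucketStart(DateTime) as the way such identifiers arise in practice: a period granularity bound to a non-UTC zone produces bucket starts, and hence segment intervals, that carry that zone's chronology. The sketch below illustrates this; it assumes PeriodGranularity's public (Period, origin, DateTimeZone) constructor and reuses the row timestamp from the test above, and the class name is only for the sketch:

    import org.apache.druid.java.util.common.DateTimes;
    import org.apache.druid.java.util.common.granularity.PeriodGranularity;
    import org.joda.time.DateTime;
    import org.joda.time.Period;

    public class NonUtcBucketStartDemo
    {
      public static void main(String[] args)
      {
        // Day granularity anchored to Asia/Seoul rather than UTC, with no custom origin.
        PeriodGranularity seoulDay = new PeriodGranularity(
            Period.days(1),
            null,
            DateTimes.inferTzFromString("Asia/Seoul")
        );

        // Row timestamp used by the new test: 2021-06-27T00:01:11.080Z is 09:01 on June 27 in Seoul.
        DateTime rowTime = DateTimes.of("2021-06-27T00:01:11.080Z");

        // Expected to print 2021-06-27T00:00:00.000+09:00, a bucket start in the
        // Asia/Seoul chronology and the same interval start the test's
        // createNonUTCSegmentId helper builds in the final hunk.
        System.out.println(seoulDay.bucketStart(rowTime));
      }
    }

The final hunk adds that helper, createNonUTCSegmentId, next to the existing createSegmentId.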
@@ -887,7 +956,19 @@ public class BatchAppenderatorTest extends InitializedNullHandlingTest
 
     }
   }
 
+
+  private static SegmentIdWithShardSpec createNonUTCSegmentId(String interval, String version, int partitionNum)
+  {
+    return new SegmentIdWithShardSpec(
+        BatchAppenderatorTester.DATASOURCE,
+        new Interval(interval, ISOChronology.getInstance(DateTimes.inferTzFromString("Asia/Seoul"))),
+        version,
+        new LinearShardSpec(partitionNum)
+
+    );
+  }
+
   private static SegmentIdWithShardSpec createSegmentId(String interval, String version, int partitionNum)
   {
     return new SegmentIdWithShardSpec(