diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java
index 64c79878408..272edaff1f3 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java
@@ -118,6 +118,11 @@ public final class OzoneConfigKeys {
   public static final int OZONE_BLOCK_DELETING_SERVICE_INTERVAL_MS_DEFAULT
       = 60000;
 
+  public static final String OZONE_BLOCK_DELETING_SERVICE_TIMEOUT =
+      "ozone.block.deleting.service.timeout";
+  public static final int OZONE_BLOCK_DELETING_SERVICE_TIMEOUT_DEFAULT
+      = 300000; // 300s for default
+
   public static final String OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER =
       "ozone.block.deleting.limit.per.task";
   public static final int OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER_DEFAULT
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/background/BlockDeletingService.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/background/BlockDeletingService.java
index df59f68711d..498264ef0f0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/background/BlockDeletingService.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/background/BlockDeletingService.java
@@ -79,9 +79,10 @@ public class BlockDeletingService extends BackgroundService{
   private final static int BLOCK_DELETING_SERVICE_CORE_POOL_SIZE = 10;
 
   public BlockDeletingService(ContainerManager containerManager,
-      int serviceInterval, Configuration conf) {
+      int serviceInterval, long serviceTimeout, Configuration conf) {
     super("BlockDeletingService", serviceInterval,
-        TimeUnit.MILLISECONDS, BLOCK_DELETING_SERVICE_CORE_POOL_SIZE);
+        TimeUnit.MILLISECONDS, BLOCK_DELETING_SERVICE_CORE_POOL_SIZE,
+        serviceTimeout);
     this.containerManager = containerManager;
     this.conf = conf;
     this.blockLimitPerTask = conf.getInt(
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
index 307f59fb54a..a4904b2eddb 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
@@ -46,6 +46,7 @@
 import java.nio.file.Paths;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.concurrent.TimeUnit;
 
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY;
 import static org.apache.hadoop.ozone.OzoneConsts.CONTAINER_ROOT_PREFIX;
@@ -54,6 +55,8 @@
 import static org.apache.hadoop.ozone.OzoneConfigKeys
     .OZONE_BLOCK_DELETING_SERVICE_INTERVAL_MS_DEFAULT;
 import static org.apache.hadoop.ozone.OzoneConsts.INVALID_PORT;
+import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT;
+import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT_DEFAULT;
 
 /**
  * Ozone main class sets up the network server and initializes the container
@@ -103,8 +106,11 @@ public OzoneContainer(DatanodeID datanodeID, Configuration ozoneConfig) throws
     int svcInterval = ozoneConfig.getInt(
         OZONE_BLOCK_DELETING_SERVICE_INTERVAL_MS,
         OZONE_BLOCK_DELETING_SERVICE_INTERVAL_MS_DEFAULT);
+    long serviceTimeout = ozoneConfig.getTimeDuration(
+        OZONE_BLOCK_DELETING_SERVICE_TIMEOUT,
+        OZONE_BLOCK_DELETING_SERVICE_TIMEOUT_DEFAULT, TimeUnit.MILLISECONDS);
     this.blockDeletingService = new BlockDeletingService(manager,
-        svcInterval, ozoneConfig);
+        svcInterval, serviceTimeout, ozoneConfig);
 
     this.dispatcher = new Dispatcher(manager, this.ozoneConfig);
 
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/utils/BackgroundService.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/utils/BackgroundService.java
index b057533cf5d..53f3a894d96 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/utils/BackgroundService.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/utils/BackgroundService.java
@@ -25,6 +25,7 @@
 
 import java.util.List;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.ThreadFactory;
 import java.util.concurrent.Executors;
@@ -42,7 +43,8 @@
  */
 public abstract class BackgroundService {
 
-  private static final Logger LOG =
+  @VisibleForTesting
+  public static final Logger LOG =
       LoggerFactory.getLogger(BackgroundService.class);
 
   // Executor to launch child tasks
@@ -51,13 +53,15 @@ public abstract class BackgroundService {
   private final ThreadFactory threadFactory;
   private final String serviceName;
   private final int interval;
+  private final long serviceTimeout;
   private final TimeUnit unit;
 
   public BackgroundService(String serviceName, int interval,
-      TimeUnit unit, int threadPoolSize) {
+      TimeUnit unit, int threadPoolSize, long serviceTimeout) {
     this.interval = interval;
     this.unit = unit;
     this.serviceName = serviceName;
+    this.serviceTimeout = serviceTimeout;
     threadGroup = new ThreadGroup(serviceName);
     ThreadFactory tf = r -> new Thread(threadGroup, r);
     threadFactory = new ThreadFactoryBuilder()
@@ -115,8 +119,9 @@ public void run() {
       results.parallelStream().forEach(taskResultFuture -> {
         try {
           // Collect task results
-          // TODO timeout in case task hangs
-          BackgroundTaskResult result = taskResultFuture.get();
+          BackgroundTaskResult result = serviceTimeout > 0
+              ? taskResultFuture.get(serviceTimeout, TimeUnit.MILLISECONDS)
+              : taskResultFuture.get();
           if (LOG.isDebugEnabled()) {
             LOG.debug("task execution result size {}", result.getSize());
           }
@@ -124,6 +129,9 @@
           LOG.warn(
               "Background task fails to execute, "
                   + "retrying in next interval", e);
+        } catch (TimeoutException e) {
+          LOG.warn("Background task executes timed out, "
+              + "retrying in next interval", e);
         }
       });
     }
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/ozone-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/ozone-default.xml
index 046cf5a33bc..e39a5ec97b6 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/ozone-default.xml
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/ozone-default.xml
@@ -369,6 +369,19 @@
     </description>
   </property>
 
+  <property>
+    <name>ozone.block.deleting.service.timeout</name>
+    <value>300000ms</value>
+    <description>
+      Timeout value of the block deletion service. If this is set to a value
+      greater than 0, the service stops waiting for block deletion to complete
+      after this time. If timeouts occur for a large proportion of block
+      deletions, increase this value along with
+      ozone.block.deleting.limit.per.task. This setting supports multiple time
+      unit suffixes as described in dfs.heartbeat.interval. If no suffix is
+      specified, milliseconds is assumed.
+    </description>
+  </property>
+
   <property>
     <name>ozone.block.deleting.limit.per.task</name>
     <value>1000</value>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/container/TestUtils/BlockDeletingServiceTestImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/container/TestUtils/BlockDeletingServiceTestImpl.java
index 0fde964a7a6..78082894b1c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/container/TestUtils/BlockDeletingServiceTestImpl.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/container/TestUtils/BlockDeletingServiceTestImpl.java
@@ -33,6 +33,9 @@
 public class BlockDeletingServiceTestImpl
     extends BlockDeletingService {
 
+  // the service timeout
+  private static final int SERVICE_TIMEOUT_IN_MILLISECONDS = 0;
+
   // tests only
   private CountDownLatch latch;
   private Thread testingThread;
@@ -40,7 +43,8 @@ public class BlockDeletingServiceTestImpl
 
   public BlockDeletingServiceTestImpl(ContainerManager containerManager,
       int serviceInterval, Configuration conf) {
-    super(containerManager, serviceInterval, conf);
+    super(containerManager, serviceInterval,
+        SERVICE_TIMEOUT_IN_MILLISECONDS, conf);
   }
 
   @VisibleForTesting
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/container/common/TestBlockDeletingService.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/container/common/TestBlockDeletingService.java
index 2d4b5b2807f..ceeb6596048 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/container/common/TestBlockDeletingService.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/container/common/TestBlockDeletingService.java
@@ -32,8 +32,11 @@
 import org.apache.hadoop.ozone.container.common.helpers.KeyUtils;
 import org.apache.hadoop.ozone.container.common.impl.ContainerManagerImpl;
 import org.apache.hadoop.ozone.container.common.interfaces.ContainerManager;
+import org.apache.hadoop.ozone.container.common.statemachine.background.BlockDeletingService;
 import org.apache.hadoop.ozone.web.utils.OzoneUtils;
 import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.test.GenericTestUtils.LogCapturer;
+import org.apache.hadoop.utils.BackgroundService;
 import org.apache.hadoop.utils.MetadataKeyFilters;
 import org.apache.hadoop.utils.MetadataStore;
 import org.junit.Assert;
@@ -235,6 +238,64 @@ public void testShutdownService() throws Exception {
     GenericTestUtils.waitFor(() -> service.getThreadCount() == 0, 100, 1000);
   }
 
+  @Test
+  public void testBlockDeletionTimeout() throws Exception {
+    Configuration conf = new OzoneConfiguration();
+    conf.setInt(OZONE_BLOCK_DELETING_CONTAINER_LIMIT_PER_INTERVAL, 10);
+    conf.setInt(OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER, 2);
+    ContainerManager containerManager = createContainerManager(conf);
+    createToDeleteBlocks(containerManager, conf, 1, 3, 1, chunksDir);
+
+    // set timeout value as 1ms to trigger timeout behavior
+    long timeout = 1;
+    BlockDeletingService svc =
+        new BlockDeletingService(containerManager, 1000, timeout, conf);
+    svc.start();
+
+    LogCapturer log = LogCapturer.captureLogs(BackgroundService.LOG);
+    GenericTestUtils.waitFor(() -> {
+      if (log.getOutput().contains(
+          "Background task executes timed out, retrying in next interval")) {
+        log.stopCapturing();
+        return true;
+      }
+
+      return false;
+    }, 1000, 100000);
+
+    log.stopCapturing();
+    svc.shutdown();
+
+    // test for normal case that doesn't have timeout limitation
+    timeout = 0;
+    createToDeleteBlocks(containerManager, conf, 1, 3, 1, chunksDir);
+    svc = new BlockDeletingService(containerManager, 1000, timeout, conf);
+    svc.start();
+
+    // get container meta data
+    List<ContainerData> containerData = Lists.newArrayList();
+    containerManager.listContainer(null, 1, "", containerData);
+    MetadataStore meta = KeyUtils.getDB(containerData.get(0), conf);
+
+    LogCapturer newLog = LogCapturer.captureLogs(BackgroundService.LOG);
+    GenericTestUtils.waitFor(() -> {
+      try {
+        if (getUnderDeletionBlocksCount(meta) == 0) {
+          return true;
+        }
+      } catch (IOException ignored) {
+      }
+      return false;
+    }, 1000, 100000);
+    newLog.stopCapturing();
+
+    // The blocks should be deleted successfully without hitting the timeout,
+    // so no timeout warning should have been logged.
+    Assert.assertTrue(!newLog.getOutput().contains(
+        "Background task executes timed out, retrying in next interval"));
+    svc.shutdown();
+  }
+
   @Test(timeout = 30000)
   public void testContainerThrottle() throws Exception {
     // Properties :
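
Reviewer note (not part of the patch): a minimal sketch of how the new timeout key is expected to be consumed, mirroring the getTimeDuration call added in OzoneContainer above. The class name TimeoutConfigExample and the "5m" sample value are illustrative only; the key name and default come from the OzoneConfigKeys constants added in this patch.

import java.util.concurrent.TimeUnit;

import org.apache.hadoop.conf.Configuration;

public class TimeoutConfigExample {
  // Mirrors the constants added to OzoneConfigKeys in this patch.
  private static final String TIMEOUT_KEY =
      "ozone.block.deleting.service.timeout";
  private static final long TIMEOUT_DEFAULT_MS = 300000; // 300s

  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Time-unit suffixes (ns, us, ms, s, m, h, d) are accepted; a bare
    // number is interpreted as milliseconds.
    conf.set(TIMEOUT_KEY, "5m");

    long timeoutMs = conf.getTimeDuration(
        TIMEOUT_KEY, TIMEOUT_DEFAULT_MS, TimeUnit.MILLISECONDS);
    // A value of 0 (or less) disables the bounded wait: BackgroundService
    // then falls back to Future#get() with no deadline, as in the run() hunk.
    System.out.println("block deleting service timeout = " + timeoutMs + " ms");
  }
}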