HDFS-11015. Enforce timeout in balancer. Contributed by Kihwal Lee.

(cherry picked from commit f6367c5f44)
(cherry picked from commit ff806cbfc7)
(cherry picked from commit 262518fa5b)
Zhe Zhang 2016-10-25 10:18:57 -07:00
parent d05e737b84
commit c05d701161
5 changed files with 62 additions and 15 deletions

hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -175,6 +175,8 @@ Release 2.7.4 - UNRELEASED
HDFS-10627. Volume Scanner marks a block as "suspect" even if
the exception is network-related. (Rushabh S Shah via kihwal)
HDFS-11015. Enforce timeout in balancer. (kihwal via zhz)
Release 2.7.3 - 2016-08-25
INCOMPATIBLE CHANGES

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java

@@ -454,6 +454,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final long DFS_BALANCER_GETBLOCKS_SIZE_DEFAULT = 2L*1024*1024*1024; // 2GB
public static final String DFS_BALANCER_GETBLOCKS_MIN_BLOCK_SIZE_KEY = "dfs.balancer.getBlocks.min-block-size";
public static final long DFS_BALANCER_GETBLOCKS_MIN_BLOCK_SIZE_DEFAULT = 10L*1024*1024; // 10MB
public static final String DFS_BALANCER_BLOCK_MOVE_TIMEOUT = "dfs.balancer.block-move.timeout";
public static final int DFS_BALANCER_BLOCK_MOVE_TIMEOUT_DEFAULT = 0;
public static final String DFS_MOVER_MOVEDWINWIDTH_KEY = "dfs.mover.movedWinWidth";
public static final long DFS_MOVER_MOVEDWINWIDTH_DEFAULT = 5400*1000L;
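The new key is only consulted when its value is greater than zero; the default of 0 keeps the old behavior. As a minimal sketch (not part of this change; the five-minute value is purely illustrative), the timeout could be enabled programmatically before constructing a Balancer:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;
    import org.apache.hadoop.hdfs.HdfsConfiguration;

    public class BalancerTimeoutConfigExample {
      public static void main(String[] args) {
        // Illustrative: enforce a 5-minute (300000 ms) limit per block move.
        // Leaving the key at its default of 0 disables the timeout entirely.
        Configuration conf = new HdfsConfiguration();
        conf.setInt(DFSConfigKeys.DFS_BALANCER_BLOCK_MOVE_TIMEOUT, 5 * 60 * 1000);
        System.out.println(conf.getInt(
            DFSConfigKeys.DFS_BALANCER_BLOCK_MOVE_TIMEOUT,
            DFSConfigKeys.DFS_BALANCER_BLOCK_MOVE_TIMEOUT_DEFAULT)); // prints 300000
      }
    }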

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java

@@ -255,10 +255,14 @@ public class Balancer {
final long getBlocksMinBlockSize = getLong(conf,
DFSConfigKeys.DFS_BALANCER_GETBLOCKS_MIN_BLOCK_SIZE_KEY,
DFSConfigKeys.DFS_BALANCER_GETBLOCKS_MIN_BLOCK_SIZE_DEFAULT);
final int blockMoveTimeout = conf.getInt(
DFSConfigKeys.DFS_BALANCER_BLOCK_MOVE_TIMEOUT,
DFSConfigKeys.DFS_BALANCER_BLOCK_MOVE_TIMEOUT_DEFAULT);
this.dispatcher = new Dispatcher(theblockpool, p.nodesToBeIncluded,
p.nodesToBeExcluded, movedWinWidth, moverThreads, dispatcherThreads,
maxConcurrentMovesPerNode, getBlocksSize, getBlocksMinBlockSize, conf);
maxConcurrentMovesPerNode, getBlocksSize, getBlocksMinBlockSize,
blockMoveTimeout, conf);
this.threshold = p.threshold;
this.policy = p.policy;

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Dispatcher.java

@@ -121,6 +121,7 @@ public class Dispatcher {
private final long getBlocksSize;
private final long getBlocksMinBlockSize;
private final long blockMoveTimeout;
static class Allocator {
private final int max;
@@ -321,6 +322,11 @@
NetUtils.createSocketAddr(target.getDatanodeInfo().getXferAddr()),
HdfsServerConstants.READ_TIMEOUT);
// Set read timeout so that it doesn't hang forever against
// unresponsive nodes. Datanode normally sends IN_PROGRESS response
// twice within the client read timeout period (every 30 seconds by
// default). Here, we make it give up after 5 minutes of no response.
sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT * 5);
sock.setKeepAlive(true);
OutputStream unbufOut = sock.getOutputStream();
@@ -375,13 +381,26 @@
source.getDatanodeInfo().getDatanodeUuid(), proxySource.datanode);
}
/** Receive a block copy response from the input stream */
/** Check whether to continue waiting for response */
private boolean stopWaitingForResponse(long startTime) {
return source.isIterationOver() ||
(blockMoveTimeout > 0 &&
(Time.monotonicNow() - startTime > blockMoveTimeout));
}
/** Receive a reportedBlock copy response from the input stream */
private void receiveResponse(DataInputStream in) throws IOException {
long startTime = Time.monotonicNow();
BlockOpResponseProto response =
BlockOpResponseProto.parseFrom(vintPrefixed(in));
while (response.getStatus() == Status.IN_PROGRESS) {
// read intermediate responses
response = BlockOpResponseProto.parseFrom(vintPrefixed(in));
// Stop waiting for slow block moves. Even if it stops waiting,
// the actual move may continue.
if (stopWaitingForResponse(startTime)) {
throw new IOException("Block move timed out");
}
}
String logInfo = "block move is failed";
DataTransferProtoUtil.checkBlockOpStatus(response, logInfo);
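Note that the deadline above is only re-checked when the datanode actually sends another IN_PROGRESS keep-alive; silence on the wire is bounded separately by the socket read timeout set earlier. A stripped-down sketch of the same wait-with-deadline pattern, using a placeholder helper rather than the real Dispatcher internals, could look like this:

    import java.io.IOException;
    import org.apache.hadoop.util.Time;

    public abstract class DeadlineAwarePoller {
      // Placeholder: read the next response from the peer and return true while it
      // still reports IN_PROGRESS, false once the final status has arrived.
      protected abstract boolean readKeepAliveOrFinal() throws IOException;

      /** Wait for the final reply; give up after timeoutMs (0 means wait indefinitely). */
      public void awaitFinalReply(long timeoutMs) throws IOException {
        final long startTime = Time.monotonicNow();
        while (readKeepAliveOrFinal()) {
          // Mirrors stopWaitingForResponse(): the deadline only applies when positive.
          if (timeoutMs > 0 && Time.monotonicNow() - startTime > timeoutMs) {
            throw new IOException("Block move timed out");
          }
        }
      }
    }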
@@ -614,6 +633,7 @@
private final List<Task> tasks = new ArrayList<Task>(2);
private long blocksToReceive = 0L;
private final long startTime = Time.monotonicNow();
/**
* Source blocks point to the objects in {@link Dispatcher#globalBlocks}
* because we want to keep one copy of a block and be aware that the
@@ -625,6 +645,13 @@
dn.super(storageType, maxSize2Move);
}
/**
* Check if the iteration is over
*/
public boolean isIterationOver() {
return (Time.monotonicNow()-startTime > MAX_ITERATION_TIME);
}
/** Add a task */
void addTask(Task task) {
Preconditions.checkState(task.target != this,
@@ -765,11 +792,9 @@
* elapsed time of the iteration has exceeded the max time limit.
*/
private void dispatchBlocks() {
final long startTime = Time.monotonicNow();
this.blocksToReceive = 2 * getScheduledSize();
boolean isTimeUp = false;
int noPendingMoveIteration = 0;
while (!isTimeUp && getScheduledSize() > 0
while (getScheduledSize() > 0 && !isIterationOver()
&& (!srcBlocks.isEmpty() || blocksToReceive > 0)) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + " blocksToReceive=" + blocksToReceive
@@ -809,14 +834,6 @@
}
}
// check if time is up or not
if (Time.monotonicNow() - startTime > MAX_ITERATION_TIME) {
LOG.info("Time up (max time=" + MAX_ITERATION_TIME/1000
+ " seconds). Skipping " + this);
isTimeUp = true;
continue;
}
// Now we can not schedule any block to move and there are
// no new blocks added to the source block list, so we wait.
try {
@@ -826,6 +843,11 @@
} catch (InterruptedException ignored) {
}
}
if (isIterationOver()) {
LOG.info("The maximum iteration time (" + MAX_ITERATION_TIME/1000
+ " seconds) has been reached. Stopping " + this);
}
}
@Override
@@ -845,13 +867,14 @@
int dispatcherThreads, int maxConcurrentMovesPerNode, Configuration conf) {
this(nnc, includedNodes, excludedNodes, movedWinWidth,
moverThreads, dispatcherThreads, maxConcurrentMovesPerNode,
0L, 0L, conf);
0L, 0L, 0, conf);
}
Dispatcher(NameNodeConnector nnc, Set<String> includedNodes,
Set<String> excludedNodes, long movedWinWidth, int moverThreads,
int dispatcherThreads, int maxConcurrentMovesPerNode,
long getBlocksSize, long getBlocksMinBlockSize, Configuration conf) {
long getBlocksSize, long getBlocksMinBlockSize,
int blockMoveTimeout, Configuration conf) {
this.nnc = nnc;
this.excludedNodes = excludedNodes;
this.includedNodes = includedNodes;
@@ -866,6 +889,7 @@
this.getBlocksSize = getBlocksSize;
this.getBlocksMinBlockSize = getBlocksMinBlockSize;
this.blockMoveTimeout = blockMoveTimeout;
this.saslClient = new SaslDataTransferClient(conf,
DataTransferSaslUtil.getSaslPropertiesResolver(conf),

hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml

@@ -2531,6 +2531,21 @@
</description>
</property>
<property>
<name>dfs.balancer.block-move.timeout</name>
<value>0</value>
<description>
Maximum amount of time in milliseconds for a block to move. If this is set
greater than 0, Balancer will stop waiting for a block move completion
after this time. In typical clusters, a 3 to 5 minute timeout is reasonable.
If timeout happens to a large proportion of block moves, this needs to be
increased. It could also be that too much work is dispatched and many nodes
are constantly exceeding the bandwidth limit as a result. In that case,
other balancer parameters might need to be adjusted.
It is disabled (0) by default.
</description>
</property>
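For operators, the simplest way to use the new setting is an hdfs-site.xml override on the node running the Balancer. The value below is only an example, matching the upper end of the 3-to-5-minute range the description above calls reasonable:

    <property>
      <name>dfs.balancer.block-move.timeout</name>
      <value>300000</value>
      <description>Illustrative override: give up waiting on an individual block
      move after 5 minutes instead of waiting indefinitely.</description>
    </property>

Since the Balancer CLI runs through ToolRunner, the same value should also be settable for a single run as a generic option, e.g. hdfs balancer -Ddfs.balancer.block-move.timeout=300000.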
<property>
<name>dfs.lock.suppress.warning.interval</name>
<value>10s</value>