HDFS-6166. Change Balancer socket read timeout to 20 minutes and add 10 seconds delay after error. Contributed by Nathan Roberts

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1583018 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tsz-wo Sze 2014-03-29 16:23:00 +00:00
parent d46b2d0eb3
commit b4c41f341b
2 changed files with 34 additions and 2 deletions

View File

@ -735,6 +735,9 @@ Release 2.4.0 - UNRELEASED
HDFS-6163. Fix a minor bug in the HA upgrade document. (Fengdong Yu via HDFS-6163. Fix a minor bug in the HA upgrade document. (Fengdong Yu via
jing9) jing9)
HDFS-6166. Change Balancer socket read timeout to 20 minutes and add
10 seconds delay after error. (Nathan Roberts via szetszwo)
BREAKDOWN OF HDFS-5698 SUBTASKS AND RELATED JIRAS BREAKDOWN OF HDFS-5698 SUBTASKS AND RELATED JIRAS
HDFS-5717. Save FSImage header in protobuf. (Haohui Mai via jing9) HDFS-5717. Save FSImage header in protobuf. (Haohui Mai via jing9)

View File

@ -190,6 +190,8 @@ public class Balancer {
*/ */
public static final int MAX_NUM_CONCURRENT_MOVES = 5; public static final int MAX_NUM_CONCURRENT_MOVES = 5;
private static final int MAX_NO_PENDING_BLOCK_ITERATIONS = 5; private static final int MAX_NO_PENDING_BLOCK_ITERATIONS = 5;
public static final long DELAY_AFTER_ERROR = 10 * 1000L; //10 seconds
public static final int BLOCK_MOVE_READ_TIMEOUT=20*60*1000; // 20 minutes
private static final String USAGE = "Usage: java " private static final String USAGE = "Usage: java "
+ Balancer.class.getSimpleName() + Balancer.class.getSimpleName()
@ -337,7 +339,14 @@ public class Balancer {
sock.connect( sock.connect(
NetUtils.createSocketAddr(target.datanode.getXferAddr()), NetUtils.createSocketAddr(target.datanode.getXferAddr()),
HdfsServerConstants.READ_TIMEOUT); HdfsServerConstants.READ_TIMEOUT);
sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT); /* Unfortunately we don't have a good way to know if the Datanode is
* taking a really long time to move a block, OR something has
* gone wrong and it's never going to finish. To deal with this
* scenario, we set a long timeout (20 minutes) to avoid hanging
* the balancer indefinitely.
*/
sock.setSoTimeout(BLOCK_MOVE_READ_TIMEOUT);
sock.setKeepAlive(true); sock.setKeepAlive(true);
OutputStream unbufOut = sock.getOutputStream(); OutputStream unbufOut = sock.getOutputStream();
@ -360,6 +369,13 @@ public class Balancer {
LOG.info("Successfully moved " + this); LOG.info("Successfully moved " + this);
} catch (IOException e) { } catch (IOException e) {
LOG.warn("Failed to move " + this + ": " + e.getMessage()); LOG.warn("Failed to move " + this + ": " + e.getMessage());
/* proxy or target may have an issue, insert a small delay
* before using these nodes further. This avoids a potential storm
* of "threads quota exceeded" Warnings when the balancer
* gets out of sync with work going on in datanode.
*/
proxySource.activateDelay(DELAY_AFTER_ERROR);
target.activateDelay(DELAY_AFTER_ERROR);
} finally { } finally {
IOUtils.closeStream(out); IOUtils.closeStream(out);
IOUtils.closeStream(in); IOUtils.closeStream(in);
@ -497,6 +513,7 @@ public class Balancer {
final double utilization; final double utilization;
final long maxSize2Move; final long maxSize2Move;
private long scheduledSize = 0L; private long scheduledSize = 0L;
protected long delayUntil = 0L;
// blocks being moved but not confirmed yet // blocks being moved but not confirmed yet
private final List<PendingBlockMove> pendingBlocks = private final List<PendingBlockMove> pendingBlocks =
new ArrayList<PendingBlockMove>(MAX_NUM_CONCURRENT_MOVES); new ArrayList<PendingBlockMove>(MAX_NUM_CONCURRENT_MOVES);
@ -574,6 +591,18 @@ public class Balancer {
scheduledSize = size; scheduledSize = size;
} }
synchronized private void activateDelay(long delta) {
delayUntil = Time.now() + delta;
}
synchronized private boolean isDelayActive() {
if (delayUntil == 0 || Time.now() > delayUntil){
delayUntil = 0;
return false;
}
return true;
}
/* Check if the node can schedule more blocks to move */ /* Check if the node can schedule more blocks to move */
synchronized private boolean isPendingQNotFull() { synchronized private boolean isPendingQNotFull() {
if ( pendingBlocks.size() < MAX_NUM_CONCURRENT_MOVES ) { if ( pendingBlocks.size() < MAX_NUM_CONCURRENT_MOVES ) {
@ -590,7 +619,7 @@ public class Balancer {
/* Add a scheduled block move to the node */ /* Add a scheduled block move to the node */
private synchronized boolean addPendingBlock( private synchronized boolean addPendingBlock(
PendingBlockMove pendingBlock) { PendingBlockMove pendingBlock) {
if (isPendingQNotFull()) { if (!isDelayActive() && isPendingQNotFull()) {
return pendingBlocks.add(pendingBlock); return pendingBlocks.add(pendingBlock);
} }
return false; return false;