HDFS-6166. Change Balancer socket read timeout to 20 minutes and add 10 seconds delay after error. Contributed by Nathan Roberts
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1583018 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d46b2d0eb3
commit
b4c41f341b
|
@ -735,6 +735,9 @@ Release 2.4.0 - UNRELEASED
|
||||||
HDFS-6163. Fix a minor bug in the HA upgrade document. (Fengdong Yu via
|
HDFS-6163. Fix a minor bug in the HA upgrade document. (Fengdong Yu via
|
||||||
jing9)
|
jing9)
|
||||||
|
|
||||||
|
HDFS-6166. Change Balancer socket read timeout to 20 minutes and add
|
||||||
|
10 seconds delay after error. (Nathan Roberts via szetszwo)
|
||||||
|
|
||||||
BREAKDOWN OF HDFS-5698 SUBTASKS AND RELATED JIRAS
|
BREAKDOWN OF HDFS-5698 SUBTASKS AND RELATED JIRAS
|
||||||
|
|
||||||
HDFS-5717. Save FSImage header in protobuf. (Haohui Mai via jing9)
|
HDFS-5717. Save FSImage header in protobuf. (Haohui Mai via jing9)
|
||||||
|
|
|
@ -190,6 +190,8 @@ public class Balancer {
|
||||||
*/
|
*/
|
||||||
public static final int MAX_NUM_CONCURRENT_MOVES = 5;
|
public static final int MAX_NUM_CONCURRENT_MOVES = 5;
|
||||||
private static final int MAX_NO_PENDING_BLOCK_ITERATIONS = 5;
|
private static final int MAX_NO_PENDING_BLOCK_ITERATIONS = 5;
|
||||||
|
public static final long DELAY_AFTER_ERROR = 10 * 1000L; //10 seconds
|
||||||
|
public static final int BLOCK_MOVE_READ_TIMEOUT=20*60*1000; // 20 minutes
|
||||||
|
|
||||||
private static final String USAGE = "Usage: java "
|
private static final String USAGE = "Usage: java "
|
||||||
+ Balancer.class.getSimpleName()
|
+ Balancer.class.getSimpleName()
|
||||||
|
@ -337,7 +339,14 @@ public class Balancer {
|
||||||
sock.connect(
|
sock.connect(
|
||||||
NetUtils.createSocketAddr(target.datanode.getXferAddr()),
|
NetUtils.createSocketAddr(target.datanode.getXferAddr()),
|
||||||
HdfsServerConstants.READ_TIMEOUT);
|
HdfsServerConstants.READ_TIMEOUT);
|
||||||
sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
|
/* Unfortunately we don't have a good way to know if the Datanode is
|
||||||
|
* taking a really long time to move a block, OR something has
|
||||||
|
* gone wrong and it's never going to finish. To deal with this
|
||||||
|
* scenario, we set a long timeout (20 minutes) to avoid hanging
|
||||||
|
* the balancer indefinitely.
|
||||||
|
*/
|
||||||
|
sock.setSoTimeout(BLOCK_MOVE_READ_TIMEOUT);
|
||||||
|
|
||||||
sock.setKeepAlive(true);
|
sock.setKeepAlive(true);
|
||||||
|
|
||||||
OutputStream unbufOut = sock.getOutputStream();
|
OutputStream unbufOut = sock.getOutputStream();
|
||||||
|
@ -360,6 +369,13 @@ public class Balancer {
|
||||||
LOG.info("Successfully moved " + this);
|
LOG.info("Successfully moved " + this);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
LOG.warn("Failed to move " + this + ": " + e.getMessage());
|
LOG.warn("Failed to move " + this + ": " + e.getMessage());
|
||||||
|
/* proxy or target may have an issue, insert a small delay
|
||||||
|
* before using these nodes further. This avoids a potential storm
|
||||||
|
* of "threads quota exceeded" Warnings when the balancer
|
||||||
|
* gets out of sync with work going on in datanode.
|
||||||
|
*/
|
||||||
|
proxySource.activateDelay(DELAY_AFTER_ERROR);
|
||||||
|
target.activateDelay(DELAY_AFTER_ERROR);
|
||||||
} finally {
|
} finally {
|
||||||
IOUtils.closeStream(out);
|
IOUtils.closeStream(out);
|
||||||
IOUtils.closeStream(in);
|
IOUtils.closeStream(in);
|
||||||
|
@ -497,6 +513,7 @@ public class Balancer {
|
||||||
final double utilization;
|
final double utilization;
|
||||||
final long maxSize2Move;
|
final long maxSize2Move;
|
||||||
private long scheduledSize = 0L;
|
private long scheduledSize = 0L;
|
||||||
|
protected long delayUntil = 0L;
|
||||||
// blocks being moved but not confirmed yet
|
// blocks being moved but not confirmed yet
|
||||||
private final List<PendingBlockMove> pendingBlocks =
|
private final List<PendingBlockMove> pendingBlocks =
|
||||||
new ArrayList<PendingBlockMove>(MAX_NUM_CONCURRENT_MOVES);
|
new ArrayList<PendingBlockMove>(MAX_NUM_CONCURRENT_MOVES);
|
||||||
|
@ -573,6 +590,18 @@ public class Balancer {
|
||||||
protected synchronized void setScheduledSize(long size){
|
protected synchronized void setScheduledSize(long size){
|
||||||
scheduledSize = size;
|
scheduledSize = size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
synchronized private void activateDelay(long delta) {
|
||||||
|
delayUntil = Time.now() + delta;
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized private boolean isDelayActive() {
|
||||||
|
if (delayUntil == 0 || Time.now() > delayUntil){
|
||||||
|
delayUntil = 0;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/* Check if the node can schedule more blocks to move */
|
/* Check if the node can schedule more blocks to move */
|
||||||
synchronized private boolean isPendingQNotFull() {
|
synchronized private boolean isPendingQNotFull() {
|
||||||
|
@ -590,7 +619,7 @@ public class Balancer {
|
||||||
/* Add a scheduled block move to the node */
|
/* Add a scheduled block move to the node */
|
||||||
private synchronized boolean addPendingBlock(
|
private synchronized boolean addPendingBlock(
|
||||||
PendingBlockMove pendingBlock) {
|
PendingBlockMove pendingBlock) {
|
||||||
if (isPendingQNotFull()) {
|
if (!isDelayActive() && isPendingQNotFull()) {
|
||||||
return pendingBlocks.add(pendingBlock);
|
return pendingBlocks.add(pendingBlock);
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
|
Loading…
Reference in New Issue