HDFS-8397. Refactor the error handling code in DataStreamer. Contributed by Tsz Wo Nicholas Sze.
(cherry picked from commit 8f37873342
)
This commit is contained in:
parent
ce64720516
commit
d6aa65d037
|
@ -221,6 +221,9 @@ Release 2.8.0 - UNRELEASED
|
||||||
|
|
||||||
HDFS-6888. Allow selectively audit logging ops (Chen He via vinayakumarb)
|
HDFS-6888. Allow selectively audit logging ops (Chen He via vinayakumarb)
|
||||||
|
|
||||||
|
HDFS-8397. Refactor the error handling code in DataStreamer.
|
||||||
|
(Tsz Wo Nicholas Sze via jing9)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
HDFS-8026. Trace FSOutputSummer#writeChecksumChunks rather than
|
HDFS-8026. Trace FSOutputSummer#writeChecksumChunks rather than
|
||||||
|
|
|
@ -38,7 +38,6 @@ import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
|
||||||
import java.util.concurrent.atomic.AtomicReference;
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
|
@ -46,6 +45,7 @@ import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.classification.InterfaceAudience;
|
import org.apache.hadoop.classification.InterfaceAudience;
|
||||||
import org.apache.hadoop.fs.StorageType;
|
import org.apache.hadoop.fs.StorageType;
|
||||||
import org.apache.hadoop.hdfs.client.HdfsClientConfigKeys;
|
import org.apache.hadoop.hdfs.client.HdfsClientConfigKeys;
|
||||||
|
import org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.BlockWrite;
|
||||||
import org.apache.hadoop.hdfs.client.impl.DfsClientConf;
|
import org.apache.hadoop.hdfs.client.impl.DfsClientConf;
|
||||||
import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy;
|
import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy;
|
||||||
import org.apache.hadoop.hdfs.protocol.DSQuotaExceededException;
|
import org.apache.hadoop.hdfs.protocol.DSQuotaExceededException;
|
||||||
|
@ -208,6 +208,133 @@ class DataStreamer extends Daemon {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static class ErrorState {
|
||||||
|
private boolean error = false;
|
||||||
|
private int badNodeIndex = -1;
|
||||||
|
private int restartingNodeIndex = -1;
|
||||||
|
private long restartingNodeDeadline = 0;
|
||||||
|
private final long datanodeRestartTimeout;
|
||||||
|
|
||||||
|
ErrorState(long datanodeRestartTimeout) {
|
||||||
|
this.datanodeRestartTimeout = datanodeRestartTimeout;
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized void reset() {
|
||||||
|
error = false;
|
||||||
|
badNodeIndex = -1;
|
||||||
|
restartingNodeIndex = -1;
|
||||||
|
restartingNodeDeadline = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized boolean hasError() {
|
||||||
|
return error;
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized boolean hasDatanodeError() {
|
||||||
|
return error && isNodeMarked();
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized void setError(boolean err) {
|
||||||
|
this.error = err;
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized void setBadNodeIndex(int index) {
|
||||||
|
this.badNodeIndex = index;
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized int getBadNodeIndex() {
|
||||||
|
return badNodeIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized int getRestartingNodeIndex() {
|
||||||
|
return restartingNodeIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized void initRestartingNode(int i, String message) {
|
||||||
|
restartingNodeIndex = i;
|
||||||
|
restartingNodeDeadline = Time.monotonicNow() + datanodeRestartTimeout;
|
||||||
|
// If the data streamer has already set the primary node
|
||||||
|
// bad, clear it. It is likely that the write failed due to
|
||||||
|
// the DN shutdown. Even if it was a real failure, the pipeline
|
||||||
|
// recovery will take care of it.
|
||||||
|
badNodeIndex = -1;
|
||||||
|
LOG.info(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized boolean isRestartingNode() {
|
||||||
|
return restartingNodeIndex >= 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized boolean isNodeMarked() {
|
||||||
|
return badNodeIndex >= 0 || isRestartingNode();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method is used when no explicit error report was received, but
|
||||||
|
* something failed. The first node is a suspect or unsure about the cause
|
||||||
|
* so that it is marked as failed.
|
||||||
|
*/
|
||||||
|
synchronized void markFirstNodeIfNotMarked() {
|
||||||
|
// There should be no existing error and no ongoing restart.
|
||||||
|
if (!isNodeMarked()) {
|
||||||
|
badNodeIndex = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized void adjustState4RestartingNode() {
|
||||||
|
// Just took care of a node error while waiting for a node restart
|
||||||
|
if (restartingNodeIndex >= 0) {
|
||||||
|
// If the error came from a node further away than the restarting
|
||||||
|
// node, the restart must have been complete.
|
||||||
|
if (badNodeIndex > restartingNodeIndex) {
|
||||||
|
restartingNodeIndex = -1;
|
||||||
|
} else if (badNodeIndex < restartingNodeIndex) {
|
||||||
|
// the node index has shifted.
|
||||||
|
restartingNodeIndex--;
|
||||||
|
} else {
|
||||||
|
throw new IllegalStateException("badNodeIndex = " + badNodeIndex
|
||||||
|
+ " = restartingNodeIndex = " + restartingNodeIndex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!isRestartingNode()) {
|
||||||
|
error = false;
|
||||||
|
}
|
||||||
|
badNodeIndex = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized void checkRestartingNodeDeadline(DatanodeInfo[] nodes) {
|
||||||
|
if (restartingNodeIndex >= 0) {
|
||||||
|
if (!error) {
|
||||||
|
throw new IllegalStateException("error=false while checking" +
|
||||||
|
" restarting node deadline");
|
||||||
|
}
|
||||||
|
|
||||||
|
// check badNodeIndex
|
||||||
|
if (badNodeIndex == restartingNodeIndex) {
|
||||||
|
// ignore, if came from the restarting node
|
||||||
|
badNodeIndex = -1;
|
||||||
|
}
|
||||||
|
// not within the deadline
|
||||||
|
if (Time.monotonicNow() >= restartingNodeDeadline) {
|
||||||
|
// expired. declare the restarting node dead
|
||||||
|
restartingNodeDeadline = 0;
|
||||||
|
final int i = restartingNodeIndex;
|
||||||
|
restartingNodeIndex = -1;
|
||||||
|
LOG.warn("Datanode " + i + " did not restart within "
|
||||||
|
+ datanodeRestartTimeout + "ms: " + nodes[i]);
|
||||||
|
// Mark the restarting node as failed. If there is any other failed
|
||||||
|
// node during the last pipeline construction attempt, it will not be
|
||||||
|
// overwritten/dropped. In this case, the restarting node will get
|
||||||
|
// excluded in the following attempt, if it still does not come up.
|
||||||
|
if (badNodeIndex == -1) {
|
||||||
|
badNodeIndex = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private volatile boolean streamerClosed = false;
|
private volatile boolean streamerClosed = false;
|
||||||
private ExtendedBlock block; // its length is number of bytes acked
|
private ExtendedBlock block; // its length is number of bytes acked
|
||||||
private Token<BlockTokenIdentifier> accessToken;
|
private Token<BlockTokenIdentifier> accessToken;
|
||||||
|
@ -217,11 +344,8 @@ class DataStreamer extends Daemon {
|
||||||
private volatile DatanodeInfo[] nodes = null; // list of targets for current block
|
private volatile DatanodeInfo[] nodes = null; // list of targets for current block
|
||||||
private volatile StorageType[] storageTypes = null;
|
private volatile StorageType[] storageTypes = null;
|
||||||
private volatile String[] storageIDs = null;
|
private volatile String[] storageIDs = null;
|
||||||
volatile boolean hasError = false;
|
private final ErrorState errorState;
|
||||||
volatile int errorIndex = -1;
|
|
||||||
// Restarting node index
|
|
||||||
AtomicInteger restartingNodeIndex = new AtomicInteger(-1);
|
|
||||||
private long restartDeadline = 0; // Deadline of DN restart
|
|
||||||
private BlockConstructionStage stage; // block construction stage
|
private BlockConstructionStage stage; // block construction stage
|
||||||
private long bytesSent = 0; // number of bytes that've been sent
|
private long bytesSent = 0; // number of bytes that've been sent
|
||||||
private final boolean isLazyPersistFile;
|
private final boolean isLazyPersistFile;
|
||||||
|
@ -287,11 +411,13 @@ class DataStreamer extends Daemon {
|
||||||
this.cachingStrategy = cachingStrategy;
|
this.cachingStrategy = cachingStrategy;
|
||||||
this.byteArrayManager = byteArrayManage;
|
this.byteArrayManager = byteArrayManage;
|
||||||
this.isLazyPersistFile = isLazyPersist(stat);
|
this.isLazyPersistFile = isLazyPersist(stat);
|
||||||
this.dfsclientSlowLogThresholdMs =
|
|
||||||
dfsClient.getConf().getSlowIoWarningThresholdMs();
|
|
||||||
this.excludedNodes = initExcludedNodes();
|
|
||||||
this.isAppend = isAppend;
|
this.isAppend = isAppend;
|
||||||
this.favoredNodes = favoredNodes;
|
this.favoredNodes = favoredNodes;
|
||||||
|
|
||||||
|
final DfsClientConf conf = dfsClient.getConf();
|
||||||
|
this.dfsclientSlowLogThresholdMs = conf.getSlowIoWarningThresholdMs();
|
||||||
|
this.excludedNodes = initExcludedNodes(conf.getExcludedNodesCacheExpiry());
|
||||||
|
this.errorState = new ErrorState(conf.getDatanodeRestartTimeout());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -334,7 +460,6 @@ class DataStreamer extends Daemon {
|
||||||
void setPipelineInConstruction(LocatedBlock lastBlock) throws IOException{
|
void setPipelineInConstruction(LocatedBlock lastBlock) throws IOException{
|
||||||
// setup pipeline to append to the last block XXX retries??
|
// setup pipeline to append to the last block XXX retries??
|
||||||
setPipeline(lastBlock);
|
setPipeline(lastBlock);
|
||||||
errorIndex = -1; // no errors yet.
|
|
||||||
if (nodes.length < 1) {
|
if (nodes.length < 1) {
|
||||||
throw new IOException("Unable to retrieve blocks locations " +
|
throw new IOException("Unable to retrieve blocks locations " +
|
||||||
" for last block " + block +
|
" for last block " + block +
|
||||||
|
@ -375,6 +500,10 @@ class DataStreamer extends Daemon {
|
||||||
stage = BlockConstructionStage.PIPELINE_SETUP_CREATE;
|
stage = BlockConstructionStage.PIPELINE_SETUP_CREATE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean shouldStop() {
|
||||||
|
return streamerClosed || errorState.hasError() || !dfsClient.clientRunning;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* streamer thread is the only thread that opens streams to datanode,
|
* streamer thread is the only thread that opens streams to datanode,
|
||||||
* and closes them. Any error recovery is also done by this thread.
|
* and closes them. Any error recovery is also done by this thread.
|
||||||
|
@ -385,7 +514,7 @@ class DataStreamer extends Daemon {
|
||||||
TraceScope scope = NullScope.INSTANCE;
|
TraceScope scope = NullScope.INSTANCE;
|
||||||
while (!streamerClosed && dfsClient.clientRunning) {
|
while (!streamerClosed && dfsClient.clientRunning) {
|
||||||
// if the Responder encountered an error, shutdown Responder
|
// if the Responder encountered an error, shutdown Responder
|
||||||
if (hasError && response != null) {
|
if (errorState.hasError() && response != null) {
|
||||||
try {
|
try {
|
||||||
response.close();
|
response.close();
|
||||||
response.join();
|
response.join();
|
||||||
|
@ -398,17 +527,13 @@ class DataStreamer extends Daemon {
|
||||||
DFSPacket one;
|
DFSPacket one;
|
||||||
try {
|
try {
|
||||||
// process datanode IO errors if any
|
// process datanode IO errors if any
|
||||||
boolean doSleep = false;
|
boolean doSleep = processDatanodeError();
|
||||||
if (hasError && (errorIndex >= 0 || restartingNodeIndex.get() >= 0)) {
|
|
||||||
doSleep = processDatanodeError();
|
|
||||||
}
|
|
||||||
|
|
||||||
final int halfSocketTimeout = dfsClient.getConf().getSocketTimeout()/2;
|
final int halfSocketTimeout = dfsClient.getConf().getSocketTimeout()/2;
|
||||||
synchronized (dataQueue) {
|
synchronized (dataQueue) {
|
||||||
// wait for a packet to be sent.
|
// wait for a packet to be sent.
|
||||||
long now = Time.monotonicNow();
|
long now = Time.monotonicNow();
|
||||||
while ((!streamerClosed && !hasError && dfsClient.clientRunning
|
while ((!shouldStop() && dataQueue.size() == 0 &&
|
||||||
&& dataQueue.size() == 0 &&
|
|
||||||
(stage != BlockConstructionStage.DATA_STREAMING ||
|
(stage != BlockConstructionStage.DATA_STREAMING ||
|
||||||
stage == BlockConstructionStage.DATA_STREAMING &&
|
stage == BlockConstructionStage.DATA_STREAMING &&
|
||||||
now - lastPacket < halfSocketTimeout)) || doSleep ) {
|
now - lastPacket < halfSocketTimeout)) || doSleep ) {
|
||||||
|
@ -424,13 +549,12 @@ class DataStreamer extends Daemon {
|
||||||
doSleep = false;
|
doSleep = false;
|
||||||
now = Time.monotonicNow();
|
now = Time.monotonicNow();
|
||||||
}
|
}
|
||||||
if (streamerClosed || hasError || !dfsClient.clientRunning) {
|
if (shouldStop()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// get packet to be sent.
|
// get packet to be sent.
|
||||||
if (dataQueue.isEmpty()) {
|
if (dataQueue.isEmpty()) {
|
||||||
one = createHeartbeatPacket();
|
one = createHeartbeatPacket();
|
||||||
assert one != null;
|
|
||||||
} else {
|
} else {
|
||||||
try {
|
try {
|
||||||
backOffIfNecessary();
|
backOffIfNecessary();
|
||||||
|
@ -460,7 +584,7 @@ class DataStreamer extends Daemon {
|
||||||
LOG.debug("Append to block " + block);
|
LOG.debug("Append to block " + block);
|
||||||
}
|
}
|
||||||
setupPipelineForAppendOrRecovery();
|
setupPipelineForAppendOrRecovery();
|
||||||
if (true == streamerClosed) {
|
if (streamerClosed) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
initDataStreaming();
|
initDataStreaming();
|
||||||
|
@ -478,8 +602,7 @@ class DataStreamer extends Daemon {
|
||||||
if (one.isLastPacketInBlock()) {
|
if (one.isLastPacketInBlock()) {
|
||||||
// wait for all data packets have been successfully acked
|
// wait for all data packets have been successfully acked
|
||||||
synchronized (dataQueue) {
|
synchronized (dataQueue) {
|
||||||
while (!streamerClosed && !hasError &&
|
while (!shouldStop() && ackQueue.size() != 0) {
|
||||||
ackQueue.size() != 0 && dfsClient.clientRunning) {
|
|
||||||
try {
|
try {
|
||||||
// wait for acks to arrive from datanodes
|
// wait for acks to arrive from datanodes
|
||||||
dataQueue.wait(1000);
|
dataQueue.wait(1000);
|
||||||
|
@ -488,7 +611,7 @@ class DataStreamer extends Daemon {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (streamerClosed || hasError || !dfsClient.clientRunning) {
|
if (shouldStop()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
stage = BlockConstructionStage.PIPELINE_CLOSE;
|
stage = BlockConstructionStage.PIPELINE_CLOSE;
|
||||||
|
@ -524,7 +647,7 @@ class DataStreamer extends Daemon {
|
||||||
// effect. Pipeline recovery can handle only one node error at a
|
// effect. Pipeline recovery can handle only one node error at a
|
||||||
// time. If the primary node fails again during the recovery, it
|
// time. If the primary node fails again during the recovery, it
|
||||||
// will be taken out then.
|
// will be taken out then.
|
||||||
tryMarkPrimaryDatanodeFailed();
|
errorState.markFirstNodeIfNotMarked();
|
||||||
throw e;
|
throw e;
|
||||||
} finally {
|
} finally {
|
||||||
writeScope.close();
|
writeScope.close();
|
||||||
|
@ -537,7 +660,7 @@ class DataStreamer extends Daemon {
|
||||||
bytesSent = tmpBytesSent;
|
bytesSent = tmpBytesSent;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (streamerClosed || hasError || !dfsClient.clientRunning) {
|
if (shouldStop()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -545,12 +668,11 @@ class DataStreamer extends Daemon {
|
||||||
if (one.isLastPacketInBlock()) {
|
if (one.isLastPacketInBlock()) {
|
||||||
// wait for the close packet has been acked
|
// wait for the close packet has been acked
|
||||||
synchronized (dataQueue) {
|
synchronized (dataQueue) {
|
||||||
while (!streamerClosed && !hasError &&
|
while (!shouldStop() && ackQueue.size() != 0) {
|
||||||
ackQueue.size() != 0 && dfsClient.clientRunning) {
|
|
||||||
dataQueue.wait(1000);// wait for acks to arrive from datanodes
|
dataQueue.wait(1000);// wait for acks to arrive from datanodes
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (streamerClosed || hasError || !dfsClient.clientRunning) {
|
if (shouldStop()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -564,7 +686,7 @@ class DataStreamer extends Daemon {
|
||||||
}
|
}
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
// Log warning if there was a real error.
|
// Log warning if there was a real error.
|
||||||
if (restartingNodeIndex.get() == -1) {
|
if (!errorState.isRestartingNode()) {
|
||||||
// Since their messages are descriptive enough, do not always
|
// Since their messages are descriptive enough, do not always
|
||||||
// log a verbose stack-trace WARN for quota exceptions.
|
// log a verbose stack-trace WARN for quota exceptions.
|
||||||
if (e instanceof QuotaExceededException) {
|
if (e instanceof QuotaExceededException) {
|
||||||
|
@ -575,8 +697,8 @@ class DataStreamer extends Daemon {
|
||||||
}
|
}
|
||||||
lastException.set(e);
|
lastException.set(e);
|
||||||
assert !(e instanceof NullPointerException);
|
assert !(e instanceof NullPointerException);
|
||||||
hasError = true;
|
errorState.setError(true);
|
||||||
if (errorIndex == -1 && restartingNodeIndex.get() == -1) {
|
if (!errorState.isNodeMarked()) {
|
||||||
// Not a datanode issue
|
// Not a datanode issue
|
||||||
streamerClosed = true;
|
streamerClosed = true;
|
||||||
}
|
}
|
||||||
|
@ -773,40 +895,6 @@ class DataStreamer extends Daemon {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// The following synchronized methods are used whenever
|
|
||||||
// errorIndex or restartingNodeIndex is set. This is because
|
|
||||||
// check & set needs to be atomic. Simply reading variables
|
|
||||||
// does not require a synchronization. When responder is
|
|
||||||
// not running (e.g. during pipeline recovery), there is no
|
|
||||||
// need to use these methods.
|
|
||||||
|
|
||||||
/** Set the error node index. Called by responder */
|
|
||||||
synchronized void setErrorIndex(int idx) {
|
|
||||||
errorIndex = idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Set the restarting node index. Called by responder */
|
|
||||||
synchronized void setRestartingNodeIndex(int idx) {
|
|
||||||
restartingNodeIndex.set(idx);
|
|
||||||
// If the data streamer has already set the primary node
|
|
||||||
// bad, clear it. It is likely that the write failed due to
|
|
||||||
// the DN shutdown. Even if it was a real failure, the pipeline
|
|
||||||
// recovery will take care of it.
|
|
||||||
errorIndex = -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This method is used when no explicit error report was received,
|
|
||||||
* but something failed. When the primary node is a suspect or
|
|
||||||
* unsure about the cause, the primary node is marked as failed.
|
|
||||||
*/
|
|
||||||
synchronized void tryMarkPrimaryDatanodeFailed() {
|
|
||||||
// There should be no existing error and no ongoing restart.
|
|
||||||
if ((errorIndex == -1) && (restartingNodeIndex.get() == -1)) {
|
|
||||||
errorIndex = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Examine whether it is worth waiting for a node to restart.
|
* Examine whether it is worth waiting for a node to restart.
|
||||||
* @param index the node index
|
* @param index the node index
|
||||||
|
@ -883,20 +971,16 @@ class DataStreamer extends Daemon {
|
||||||
// the local node or the only one in the pipeline.
|
// the local node or the only one in the pipeline.
|
||||||
if (PipelineAck.isRestartOOBStatus(reply) &&
|
if (PipelineAck.isRestartOOBStatus(reply) &&
|
||||||
shouldWaitForRestart(i)) {
|
shouldWaitForRestart(i)) {
|
||||||
restartDeadline = dfsClient.getConf().getDatanodeRestartTimeout()
|
final String message = "Datanode " + i + " is restarting: "
|
||||||
+ Time.monotonicNow();
|
+ targets[i];
|
||||||
setRestartingNodeIndex(i);
|
errorState.initRestartingNode(i, message);
|
||||||
String message = "A datanode is restarting: " + targets[i];
|
|
||||||
LOG.info(message);
|
|
||||||
throw new IOException(message);
|
throw new IOException(message);
|
||||||
}
|
}
|
||||||
// node error
|
// node error
|
||||||
if (reply != SUCCESS) {
|
if (reply != SUCCESS) {
|
||||||
setErrorIndex(i); // first bad datanode
|
errorState.setBadNodeIndex(i); // mark bad datanode
|
||||||
throw new IOException("Bad response " + reply +
|
throw new IOException("Bad response " + reply +
|
||||||
" for block " + block +
|
" for " + block + " from datanode " + targets[i]);
|
||||||
" from datanode " +
|
|
||||||
targets[i]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -954,14 +1038,12 @@ class DataStreamer extends Daemon {
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
if (!responderClosed) {
|
if (!responderClosed) {
|
||||||
lastException.set(e);
|
lastException.set(e);
|
||||||
hasError = true;
|
errorState.setError(true);
|
||||||
// If no explicit error report was received, mark the primary
|
errorState.markFirstNodeIfNotMarked();
|
||||||
// node as failed.
|
|
||||||
tryMarkPrimaryDatanodeFailed();
|
|
||||||
synchronized (dataQueue) {
|
synchronized (dataQueue) {
|
||||||
dataQueue.notifyAll();
|
dataQueue.notifyAll();
|
||||||
}
|
}
|
||||||
if (restartingNodeIndex.get() == -1) {
|
if (!errorState.isRestartingNode()) {
|
||||||
LOG.warn("Exception for " + block, e);
|
LOG.warn("Exception for " + block, e);
|
||||||
}
|
}
|
||||||
responderClosed = true;
|
responderClosed = true;
|
||||||
|
@ -978,11 +1060,16 @@ class DataStreamer extends Daemon {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If this stream has encountered any errors so far, shutdown
|
/**
|
||||||
// threads and mark stream as closed. Returns true if we should
|
* If this stream has encountered any errors, shutdown threads
|
||||||
// sleep for a while after returning from this call.
|
* and mark the stream as closed.
|
||||||
//
|
*
|
||||||
|
* @return true if it should sleep for a while after returning.
|
||||||
|
*/
|
||||||
private boolean processDatanodeError() throws IOException {
|
private boolean processDatanodeError() throws IOException {
|
||||||
|
if (!errorState.hasDatanodeError()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
if (response != null) {
|
if (response != null) {
|
||||||
LOG.info("Error Recovery for " + block +
|
LOG.info("Error Recovery for " + block +
|
||||||
" waiting for responder to exit. ");
|
" waiting for responder to exit. ");
|
||||||
|
@ -1064,7 +1151,7 @@ class DataStreamer extends Daemon {
|
||||||
.append("The current failed datanode replacement policy is ")
|
.append("The current failed datanode replacement policy is ")
|
||||||
.append(dfsClient.dtpReplaceDatanodeOnFailure).append(", and ")
|
.append(dfsClient.dtpReplaceDatanodeOnFailure).append(", and ")
|
||||||
.append("a client may configure this via '")
|
.append("a client may configure this via '")
|
||||||
.append(HdfsClientConfigKeys.BlockWrite.ReplaceDatanodeOnFailure.POLICY_KEY)
|
.append(BlockWrite.ReplaceDatanodeOnFailure.POLICY_KEY)
|
||||||
.append("' in its configuration.")
|
.append("' in its configuration.")
|
||||||
.toString());
|
.toString());
|
||||||
}
|
}
|
||||||
|
@ -1190,82 +1277,98 @@ class DataStreamer extends Daemon {
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
long newGS = 0L;
|
long newGS = 0L;
|
||||||
while (!success && !streamerClosed && dfsClient.clientRunning) {
|
while (!success && !streamerClosed && dfsClient.clientRunning) {
|
||||||
// Sleep before reconnect if a dn is restarting.
|
if (!handleRestartingDatanode()) {
|
||||||
// This process will be repeated until the deadline or the datanode
|
return false;
|
||||||
// starts back up.
|
}
|
||||||
if (restartingNodeIndex.get() >= 0) {
|
|
||||||
|
final boolean isRecovery = errorState.hasError();
|
||||||
|
if (!handleBadDatanode()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
handleDatanodeReplacement();
|
||||||
|
|
||||||
|
// get a new generation stamp and an access token
|
||||||
|
final LocatedBlock lb = updateBlockForPipeline();
|
||||||
|
newGS = lb.getBlock().getGenerationStamp();
|
||||||
|
accessToken = lb.getBlockToken();
|
||||||
|
|
||||||
|
// set up the pipeline again with the remaining nodes
|
||||||
|
success = createBlockOutputStream(nodes, storageTypes, newGS, isRecovery);
|
||||||
|
|
||||||
|
failPacket4Testing();
|
||||||
|
|
||||||
|
errorState.checkRestartingNodeDeadline(nodes);
|
||||||
|
} // while
|
||||||
|
|
||||||
|
if (success) {
|
||||||
|
block = updatePipeline(newGS);
|
||||||
|
}
|
||||||
|
return false; // do not sleep, continue processing
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sleep if a node is restarting.
|
||||||
|
* This process is repeated until the deadline or the node starts back up.
|
||||||
|
* @return true if it should continue.
|
||||||
|
*/
|
||||||
|
private boolean handleRestartingDatanode() {
|
||||||
|
if (errorState.isRestartingNode()) {
|
||||||
// 4 seconds or the configured deadline period, whichever is shorter.
|
// 4 seconds or the configured deadline period, whichever is shorter.
|
||||||
// This is the retry interval and recovery will be retried in this
|
// This is the retry interval and recovery will be retried in this
|
||||||
// interval until timeout or success.
|
// interval until timeout or success.
|
||||||
long delay = Math.min(dfsClient.getConf().getDatanodeRestartTimeout(),
|
final long delay = Math.min(errorState.datanodeRestartTimeout, 4000L);
|
||||||
4000L);
|
|
||||||
try {
|
try {
|
||||||
Thread.sleep(delay);
|
Thread.sleep(delay);
|
||||||
} catch (InterruptedException ie) {
|
} catch (InterruptedException ie) {
|
||||||
lastException.set(new IOException("Interrupted while waiting for " +
|
lastException.set(new IOException(
|
||||||
"datanode to restart. " + nodes[restartingNodeIndex.get()]));
|
"Interrupted while waiting for restarting "
|
||||||
|
+ nodes[errorState.getRestartingNodeIndex()]));
|
||||||
streamerClosed = true;
|
streamerClosed = true;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
boolean isRecovery = hasError;
|
return true;
|
||||||
// remove bad datanode from list of datanodes.
|
|
||||||
// If errorIndex was not set (i.e. appends), then do not remove
|
|
||||||
// any datanodes
|
|
||||||
//
|
|
||||||
if (errorIndex >= 0) {
|
|
||||||
StringBuilder pipelineMsg = new StringBuilder();
|
|
||||||
for (int j = 0; j < nodes.length; j++) {
|
|
||||||
pipelineMsg.append(nodes[j]);
|
|
||||||
if (j < nodes.length - 1) {
|
|
||||||
pipelineMsg.append(", ");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove bad node from list of nodes if badNodeIndex was set.
|
||||||
|
* @return true if it should continue.
|
||||||
|
*/
|
||||||
|
private boolean handleBadDatanode() {
|
||||||
|
final int badNodeIndex = errorState.getBadNodeIndex();
|
||||||
|
if (badNodeIndex >= 0) {
|
||||||
if (nodes.length <= 1) {
|
if (nodes.length <= 1) {
|
||||||
lastException.set(new IOException("All datanodes " + pipelineMsg
|
lastException.set(new IOException("All datanodes "
|
||||||
+ " are bad. Aborting..."));
|
+ Arrays.toString(nodes) + " are bad. Aborting..."));
|
||||||
streamerClosed = true;
|
streamerClosed = true;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
LOG.warn("Error Recovery for block " + block +
|
|
||||||
" in pipeline " + pipelineMsg +
|
LOG.warn("Error Recovery for " + block + " in pipeline "
|
||||||
": bad datanode " + nodes[errorIndex]);
|
+ Arrays.toString(nodes) + ": datanode " + badNodeIndex
|
||||||
failed.add(nodes[errorIndex]);
|
+ "("+ nodes[badNodeIndex] + ") is bad.");
|
||||||
|
failed.add(nodes[badNodeIndex]);
|
||||||
|
|
||||||
DatanodeInfo[] newnodes = new DatanodeInfo[nodes.length-1];
|
DatanodeInfo[] newnodes = new DatanodeInfo[nodes.length-1];
|
||||||
arraycopy(nodes, newnodes, errorIndex);
|
arraycopy(nodes, newnodes, badNodeIndex);
|
||||||
|
|
||||||
final StorageType[] newStorageTypes = new StorageType[newnodes.length];
|
final StorageType[] newStorageTypes = new StorageType[newnodes.length];
|
||||||
arraycopy(storageTypes, newStorageTypes, errorIndex);
|
arraycopy(storageTypes, newStorageTypes, badNodeIndex);
|
||||||
|
|
||||||
final String[] newStorageIDs = new String[newnodes.length];
|
final String[] newStorageIDs = new String[newnodes.length];
|
||||||
arraycopy(storageIDs, newStorageIDs, errorIndex);
|
arraycopy(storageIDs, newStorageIDs, badNodeIndex);
|
||||||
|
|
||||||
setPipeline(newnodes, newStorageTypes, newStorageIDs);
|
setPipeline(newnodes, newStorageTypes, newStorageIDs);
|
||||||
|
|
||||||
// Just took care of a node error while waiting for a node restart
|
errorState.adjustState4RestartingNode();
|
||||||
if (restartingNodeIndex.get() >= 0) {
|
|
||||||
// If the error came from a node further away than the restarting
|
|
||||||
// node, the restart must have been complete.
|
|
||||||
if (errorIndex > restartingNodeIndex.get()) {
|
|
||||||
restartingNodeIndex.set(-1);
|
|
||||||
} else if (errorIndex < restartingNodeIndex.get()) {
|
|
||||||
// the node index has shifted.
|
|
||||||
restartingNodeIndex.decrementAndGet();
|
|
||||||
} else {
|
|
||||||
// this shouldn't happen...
|
|
||||||
assert false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (restartingNodeIndex.get() == -1) {
|
|
||||||
hasError = false;
|
|
||||||
}
|
|
||||||
lastException.clear();
|
lastException.clear();
|
||||||
errorIndex = -1;
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if replace-datanode policy is satisfied.
|
/** Add a datanode if replace-datanode policy is satisfied. */
|
||||||
|
private void handleDatanodeReplacement() throws IOException {
|
||||||
if (dfsClient.dtpReplaceDatanodeOnFailure.satisfy(stat.getReplication(),
|
if (dfsClient.dtpReplaceDatanodeOnFailure.satisfy(stat.getReplication(),
|
||||||
nodes, isAppend, isHflushed)) {
|
nodes, isAppend, isHflushed)) {
|
||||||
try {
|
try {
|
||||||
|
@ -1276,19 +1379,14 @@ class DataStreamer extends Daemon {
|
||||||
}
|
}
|
||||||
LOG.warn("Failed to replace datanode."
|
LOG.warn("Failed to replace datanode."
|
||||||
+ " Continue with the remaining datanodes since "
|
+ " Continue with the remaining datanodes since "
|
||||||
+ HdfsClientConfigKeys.BlockWrite.ReplaceDatanodeOnFailure.BEST_EFFORT_KEY
|
+ BlockWrite.ReplaceDatanodeOnFailure.BEST_EFFORT_KEY
|
||||||
+ " is set to true.", ioe);
|
+ " is set to true.", ioe);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// get a new generation stamp and an access token
|
private void failPacket4Testing() {
|
||||||
LocatedBlock lb = dfsClient.namenode.updateBlockForPipeline(block, dfsClient.clientName);
|
|
||||||
newGS = lb.getBlock().getGenerationStamp();
|
|
||||||
accessToken = lb.getBlockToken();
|
|
||||||
|
|
||||||
// set up the pipeline again with the remaining nodes
|
|
||||||
if (failPacket) { // for testing
|
if (failPacket) { // for testing
|
||||||
success = createBlockOutputStream(nodes, storageTypes, newGS, isRecovery);
|
|
||||||
failPacket = false;
|
failPacket = false;
|
||||||
try {
|
try {
|
||||||
// Give DNs time to send in bad reports. In real situations,
|
// Give DNs time to send in bad reports. In real situations,
|
||||||
|
@ -1296,48 +1394,21 @@ class DataStreamer extends Daemon {
|
||||||
// with those nodes.
|
// with those nodes.
|
||||||
Thread.sleep(2000);
|
Thread.sleep(2000);
|
||||||
} catch (InterruptedException ie) {}
|
} catch (InterruptedException ie) {}
|
||||||
} else {
|
}
|
||||||
success = createBlockOutputStream(nodes, storageTypes, newGS, isRecovery);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (restartingNodeIndex.get() >= 0) {
|
LocatedBlock updateBlockForPipeline() throws IOException {
|
||||||
assert hasError == true;
|
return dfsClient.namenode.updateBlockForPipeline(
|
||||||
// check errorIndex set above
|
block, dfsClient.clientName);
|
||||||
if (errorIndex == restartingNodeIndex.get()) {
|
|
||||||
// ignore, if came from the restarting node
|
|
||||||
errorIndex = -1;
|
|
||||||
}
|
}
|
||||||
// still within the deadline
|
|
||||||
if (Time.monotonicNow() < restartDeadline) {
|
|
||||||
continue; // with in the deadline
|
|
||||||
}
|
|
||||||
// expired. declare the restarting node dead
|
|
||||||
restartDeadline = 0;
|
|
||||||
int expiredNodeIndex = restartingNodeIndex.get();
|
|
||||||
restartingNodeIndex.set(-1);
|
|
||||||
LOG.warn("Datanode did not restart in time: " +
|
|
||||||
nodes[expiredNodeIndex]);
|
|
||||||
// Mark the restarting node as failed. If there is any other failed
|
|
||||||
// node during the last pipeline construction attempt, it will not be
|
|
||||||
// overwritten/dropped. In this case, the restarting node will get
|
|
||||||
// excluded in the following attempt, if it still does not come up.
|
|
||||||
if (errorIndex == -1) {
|
|
||||||
errorIndex = expiredNodeIndex;
|
|
||||||
}
|
|
||||||
// From this point on, normal pipeline recovery applies.
|
|
||||||
}
|
|
||||||
} // while
|
|
||||||
|
|
||||||
if (success) {
|
/** update pipeline at the namenode */
|
||||||
// update pipeline at the namenode
|
ExtendedBlock updatePipeline(long newGS) throws IOException {
|
||||||
ExtendedBlock newBlock = new ExtendedBlock(
|
final ExtendedBlock newBlock = new ExtendedBlock(
|
||||||
block.getBlockPoolId(), block.getBlockId(), block.getNumBytes(), newGS);
|
block.getBlockPoolId(), block.getBlockId(), block.getNumBytes(), newGS);
|
||||||
dfsClient.namenode.updatePipeline(dfsClient.clientName, block, newBlock,
|
dfsClient.namenode.updatePipeline(dfsClient.clientName, block, newBlock,
|
||||||
nodes, storageIDs);
|
nodes, storageIDs);
|
||||||
// update client side generation stamp
|
return newBlock;
|
||||||
block = newBlock;
|
|
||||||
}
|
|
||||||
return false; // do not sleep, continue processing
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1354,9 +1425,8 @@ class DataStreamer extends Daemon {
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
ExtendedBlock oldBlock = block;
|
ExtendedBlock oldBlock = block;
|
||||||
do {
|
do {
|
||||||
hasError = false;
|
errorState.reset();
|
||||||
lastException.clear();
|
lastException.clear();
|
||||||
errorIndex = -1;
|
|
||||||
success = false;
|
success = false;
|
||||||
|
|
||||||
DatanodeInfo[] excluded =
|
DatanodeInfo[] excluded =
|
||||||
|
@ -1382,8 +1452,9 @@ class DataStreamer extends Daemon {
|
||||||
dfsClient.namenode.abandonBlock(block, stat.getFileId(), src,
|
dfsClient.namenode.abandonBlock(block, stat.getFileId(), src,
|
||||||
dfsClient.clientName);
|
dfsClient.clientName);
|
||||||
block = null;
|
block = null;
|
||||||
LOG.info("Excluding datanode " + nodes[errorIndex]);
|
final DatanodeInfo badNode = nodes[errorState.getBadNodeIndex()];
|
||||||
excludedNodes.put(nodes[errorIndex], nodes[errorIndex]);
|
LOG.info("Excluding datanode " + badNode);
|
||||||
|
excludedNodes.put(badNode, badNode);
|
||||||
}
|
}
|
||||||
} while (!success && --count >= 0);
|
} while (!success && --count >= 0);
|
||||||
|
|
||||||
|
@ -1464,7 +1535,7 @@ class DataStreamer extends Daemon {
|
||||||
// from the local datanode. Thus it is safe to treat this as a
|
// from the local datanode. Thus it is safe to treat this as a
|
||||||
// regular node error.
|
// regular node error.
|
||||||
if (PipelineAck.isRestartOOBStatus(pipelineStatus) &&
|
if (PipelineAck.isRestartOOBStatus(pipelineStatus) &&
|
||||||
restartingNodeIndex.get() == -1) {
|
!errorState.isRestartingNode()) {
|
||||||
checkRestart = true;
|
checkRestart = true;
|
||||||
throw new IOException("A datanode is restarting.");
|
throw new IOException("A datanode is restarting.");
|
||||||
}
|
}
|
||||||
|
@ -1475,10 +1546,9 @@ class DataStreamer extends Daemon {
|
||||||
assert null == blockStream : "Previous blockStream unclosed";
|
assert null == blockStream : "Previous blockStream unclosed";
|
||||||
blockStream = out;
|
blockStream = out;
|
||||||
result = true; // success
|
result = true; // success
|
||||||
restartingNodeIndex.set(-1);
|
errorState.reset();
|
||||||
hasError = false;
|
|
||||||
} catch (IOException ie) {
|
} catch (IOException ie) {
|
||||||
if (restartingNodeIndex.get() == -1) {
|
if (!errorState.isRestartingNode()) {
|
||||||
LOG.info("Exception in createBlockOutputStream", ie);
|
LOG.info("Exception in createBlockOutputStream", ie);
|
||||||
}
|
}
|
||||||
if (ie instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
|
if (ie instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
|
||||||
|
@ -1498,24 +1568,21 @@ class DataStreamer extends Daemon {
|
||||||
for (int i = 0; i < nodes.length; i++) {
|
for (int i = 0; i < nodes.length; i++) {
|
||||||
// NB: Unconditionally using the xfer addr w/o hostname
|
// NB: Unconditionally using the xfer addr w/o hostname
|
||||||
if (firstBadLink.equals(nodes[i].getXferAddr())) {
|
if (firstBadLink.equals(nodes[i].getXferAddr())) {
|
||||||
errorIndex = i;
|
errorState.setBadNodeIndex(i);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
assert checkRestart == false;
|
assert checkRestart == false;
|
||||||
errorIndex = 0;
|
errorState.setBadNodeIndex(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final int i = errorState.getBadNodeIndex();
|
||||||
// Check whether there is a restart worth waiting for.
|
// Check whether there is a restart worth waiting for.
|
||||||
if (checkRestart && shouldWaitForRestart(errorIndex)) {
|
if (checkRestart && shouldWaitForRestart(i)) {
|
||||||
restartDeadline = dfsClient.getConf().getDatanodeRestartTimeout()
|
errorState.initRestartingNode(i, "Datanode " + i + " is restarting: " + nodes[i]);
|
||||||
+ Time.monotonicNow();
|
|
||||||
restartingNodeIndex.set(errorIndex);
|
|
||||||
errorIndex = -1;
|
|
||||||
LOG.info("Waiting for the datanode to be restarted: " +
|
|
||||||
nodes[restartingNodeIndex.get()]);
|
|
||||||
}
|
}
|
||||||
hasError = true;
|
errorState.setError(true);
|
||||||
lastException.set(ie);
|
lastException.set(ie);
|
||||||
result = false; // error
|
result = false; // error
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -1699,10 +1766,10 @@ class DataStreamer extends Daemon {
|
||||||
return new DFSPacket(buf, 0, 0, DFSPacket.HEART_BEAT_SEQNO, 0, false);
|
return new DFSPacket(buf, 0, 0, DFSPacket.HEART_BEAT_SEQNO, 0, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
private LoadingCache<DatanodeInfo, DatanodeInfo> initExcludedNodes() {
|
private static LoadingCache<DatanodeInfo, DatanodeInfo> initExcludedNodes(
|
||||||
return CacheBuilder.newBuilder().expireAfterWrite(
|
long excludedNodesCacheExpiry) {
|
||||||
dfsClient.getConf().getExcludedNodesCacheExpiry(),
|
return CacheBuilder.newBuilder()
|
||||||
TimeUnit.MILLISECONDS)
|
.expireAfterWrite(excludedNodesCacheExpiry, TimeUnit.MILLISECONDS)
|
||||||
.removalListener(new RemovalListener<DatanodeInfo, DatanodeInfo>() {
|
.removalListener(new RemovalListener<DatanodeInfo, DatanodeInfo>() {
|
||||||
@Override
|
@Override
|
||||||
public void onRemoval(
|
public void onRemoval(
|
||||||
|
|
Loading…
Reference in New Issue