HDFS-9236. Missing sanity check for block size during block recovery. (Tony Wu via Yongjun Zhang)
This commit is contained in:
parent
0b18e5e8c6
commit
b64242c0d2
|
@ -1629,6 +1629,9 @@ Release 2.8.0 - UNRELEASED
|
||||||
HDFS-9377. Fix findbugs warnings in FSDirSnapshotOp.
|
HDFS-9377. Fix findbugs warnings in FSDirSnapshotOp.
|
||||||
(Mingliang Liu via Yongjun Zhang)
|
(Mingliang Liu via Yongjun Zhang)
|
||||||
|
|
||||||
|
HDFS-9236. Missing sanity check for block size during block recovery.
|
||||||
|
(Tony Wu via Yongjun Zhang)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
HDFS-8026. Trace FSOutputSummer#writeChecksumChunks rather than
|
HDFS-8026. Trace FSOutputSummer#writeChecksumChunks rather than
|
||||||
|
|
|
@ -103,8 +103,13 @@ public class BlockRecoveryWorker {
|
||||||
protected void recover() throws IOException {
|
protected void recover() throws IOException {
|
||||||
List<BlockRecord> syncList = new ArrayList<>(locs.length);
|
List<BlockRecord> syncList = new ArrayList<>(locs.length);
|
||||||
int errorCount = 0;
|
int errorCount = 0;
|
||||||
|
int candidateReplicaCnt = 0;
|
||||||
|
|
||||||
//check generation stamps
|
// Check generation stamps, replica size and state. Replica must satisfy
|
||||||
|
// the following criteria to be included in syncList for recovery:
|
||||||
|
// - Valid generation stamp
|
||||||
|
// - Non-zero length
|
||||||
|
// - Original state is RWR or better
|
||||||
for(DatanodeID id : locs) {
|
for(DatanodeID id : locs) {
|
||||||
try {
|
try {
|
||||||
DatanodeID bpReg =datanode.getBPOfferService(bpid).bpRegistration;
|
DatanodeID bpReg =datanode.getBPOfferService(bpid).bpRegistration;
|
||||||
|
@ -115,7 +120,28 @@ public class BlockRecoveryWorker {
|
||||||
if (info != null &&
|
if (info != null &&
|
||||||
info.getGenerationStamp() >= block.getGenerationStamp() &&
|
info.getGenerationStamp() >= block.getGenerationStamp() &&
|
||||||
info.getNumBytes() > 0) {
|
info.getNumBytes() > 0) {
|
||||||
|
// Count the number of candidate replicas received.
|
||||||
|
++candidateReplicaCnt;
|
||||||
|
if (info.getOriginalReplicaState().getValue() <=
|
||||||
|
ReplicaState.RWR.getValue()) {
|
||||||
syncList.add(new BlockRecord(id, proxyDN, info));
|
syncList.add(new BlockRecord(id, proxyDN, info));
|
||||||
|
} else {
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("Block recovery: Ignored replica with invalid " +
|
||||||
|
"original state: " + info + " from DataNode: " + id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
if (info == null) {
|
||||||
|
LOG.debug("Block recovery: DataNode: " + id + " does not have "
|
||||||
|
+ "replica for block: " + block);
|
||||||
|
} else {
|
||||||
|
LOG.debug("Block recovery: Ignored replica with invalid "
|
||||||
|
+ "generation stamp or length: " + info + " from " +
|
||||||
|
"DataNode: " + id);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (RecoveryInProgressException ripE) {
|
} catch (RecoveryInProgressException ripE) {
|
||||||
InterDatanodeProtocol.LOG.warn(
|
InterDatanodeProtocol.LOG.warn(
|
||||||
|
@ -136,6 +162,15 @@ public class BlockRecoveryWorker {
|
||||||
+ ", datanodeids=" + Arrays.asList(locs));
|
+ ", datanodeids=" + Arrays.asList(locs));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// None of the replicas reported by DataNodes has the required original
|
||||||
|
// state, report the error.
|
||||||
|
if (candidateReplicaCnt > 0 && syncList.isEmpty()) {
|
||||||
|
throw new IOException("Found " + candidateReplicaCnt +
|
||||||
|
" replica(s) for block " + block + " but none is in " +
|
||||||
|
ReplicaState.RWR.name() + " or better state. datanodeids=" +
|
||||||
|
Arrays.asList(locs));
|
||||||
|
}
|
||||||
|
|
||||||
syncBlock(syncList);
|
syncBlock(syncList);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -157,6 +192,11 @@ public class BlockRecoveryWorker {
|
||||||
// or their replicas have 0 length.
|
// or their replicas have 0 length.
|
||||||
// The block can be deleted.
|
// The block can be deleted.
|
||||||
if (syncList.isEmpty()) {
|
if (syncList.isEmpty()) {
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("syncBlock for block " + block + ", all datanodes don't " +
|
||||||
|
"have the block or their replicas have 0 length. The block can " +
|
||||||
|
"be deleted.");
|
||||||
|
}
|
||||||
nn.commitBlockSynchronization(block, recoveryId, 0,
|
nn.commitBlockSynchronization(block, recoveryId, 0,
|
||||||
true, true, DatanodeID.EMPTY_ARRAY, null);
|
true, true, DatanodeID.EMPTY_ARRAY, null);
|
||||||
return;
|
return;
|
||||||
|
@ -195,6 +235,12 @@ public class BlockRecoveryWorker {
|
||||||
r.rInfo.getNumBytes() == finalizedLength) {
|
r.rInfo.getNumBytes() == finalizedLength) {
|
||||||
participatingList.add(r);
|
participatingList.add(r);
|
||||||
}
|
}
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("syncBlock replicaInfo: block=" + block +
|
||||||
|
", from datanode " + r.id + ", receivedState=" + rState.name() +
|
||||||
|
", receivedLength=" + r.rInfo.getNumBytes() +
|
||||||
|
", bestState=FINALIZED, finalizedLength=" + finalizedLength);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
newBlock.setNumBytes(finalizedLength);
|
newBlock.setNumBytes(finalizedLength);
|
||||||
break;
|
break;
|
||||||
|
@ -207,7 +253,16 @@ public class BlockRecoveryWorker {
|
||||||
minLength = Math.min(minLength, r.rInfo.getNumBytes());
|
minLength = Math.min(minLength, r.rInfo.getNumBytes());
|
||||||
participatingList.add(r);
|
participatingList.add(r);
|
||||||
}
|
}
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("syncBlock replicaInfo: block=" + block +
|
||||||
|
", from datanode " + r.id + ", receivedState=" + rState.name() +
|
||||||
|
", receivedLength=" + r.rInfo.getNumBytes() + ", bestState=" +
|
||||||
|
bestState.name());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
// recover() guarantees syncList will have at least one replica with RWR
|
||||||
|
// or better state.
|
||||||
|
assert minLength != Long.MAX_VALUE : "wrong minLength";
|
||||||
newBlock.setNumBytes(minLength);
|
newBlock.setNumBytes(minLength);
|
||||||
break;
|
break;
|
||||||
case RUR:
|
case RUR:
|
||||||
|
@ -254,6 +309,13 @@ public class BlockRecoveryWorker {
|
||||||
datanodes[i] = r.id;
|
datanodes[i] = r.id;
|
||||||
storages[i] = r.storageID;
|
storages[i] = r.storageID;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("Datanode triggering commitBlockSynchronization, block=" +
|
||||||
|
block + ", newGs=" + newBlock.getGenerationStamp() +
|
||||||
|
", newLength=" + newBlock.getNumBytes());
|
||||||
|
}
|
||||||
|
|
||||||
nn.commitBlockSynchronization(block,
|
nn.commitBlockSynchronization(block,
|
||||||
newBlock.getGenerationStamp(), newBlock.getNumBytes(), true, false,
|
newBlock.getGenerationStamp(), newBlock.getNumBytes(), true, false,
|
||||||
datanodes, storages);
|
datanodes, storages);
|
||||||
|
|
|
@ -49,4 +49,10 @@ public class ReplicaRecoveryInfo extends Block {
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
return super.hashCode();
|
return super.hashCode();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return super.toString() + "[numBytes=" + this.getNumBytes() +
|
||||||
|
",originalReplicaState=" + this.originalState.name() + "]";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
|
|
||||||
package org.apache.hadoop.hdfs.server.datanode;
|
package org.apache.hadoop.hdfs.server.datanode;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
import static org.junit.Assert.fail;
|
import static org.junit.Assert.fail;
|
||||||
import static org.mockito.Matchers.any;
|
import static org.mockito.Matchers.any;
|
||||||
import static org.mockito.Matchers.anyBoolean;
|
import static org.mockito.Matchers.anyBoolean;
|
||||||
|
@ -680,4 +681,40 @@ public class TestBlockRecovery {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* DNs report RUR instead of RBW, RWR or FINALIZED. Primary DN expected to
|
||||||
|
* throw an exception.
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testRURReplicas() throws Exception {
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("Running " + GenericTestUtils.getMethodName());
|
||||||
|
}
|
||||||
|
|
||||||
|
doReturn(new ReplicaRecoveryInfo(block.getBlockId(), block.getNumBytes(),
|
||||||
|
block.getGenerationStamp(), ReplicaState.RUR)).when(spyDN).
|
||||||
|
initReplicaRecovery(any(RecoveringBlock.class));
|
||||||
|
|
||||||
|
boolean exceptionThrown = false;
|
||||||
|
try {
|
||||||
|
for (RecoveringBlock rBlock : initRecoveringBlocks()) {
|
||||||
|
BlockRecoveryWorker.RecoveryTaskContiguous RecoveryTaskContiguous =
|
||||||
|
recoveryWorker.new RecoveryTaskContiguous(rBlock);
|
||||||
|
BlockRecoveryWorker.RecoveryTaskContiguous spyTask =
|
||||||
|
spy(RecoveryTaskContiguous);
|
||||||
|
spyTask.recover();
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
// expect IOException to be thrown here
|
||||||
|
e.printStackTrace();
|
||||||
|
assertTrue("Wrong exception was thrown: " + e.getMessage(),
|
||||||
|
e.getMessage().contains("Found 1 replica(s) for block " + block +
|
||||||
|
" but none is in RWR or better state"));
|
||||||
|
exceptionThrown = true;
|
||||||
|
} finally {
|
||||||
|
assertTrue(exceptionThrown);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue