HDFS-9236. Missing sanity check for block size during block recovery. (Tony Wu via Yongjun Zhang)

Yongjun Zhang 2015-11-06 11:15:54 -08:00
parent 0b18e5e8c6
commit b64242c0d2
4 changed files with 110 additions and 2 deletions

hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -1629,6 +1629,9 @@ Release 2.8.0 - UNRELEASED
     HDFS-9377. Fix findbugs warnings in FSDirSnapshotOp.
     (Mingliang Liu via Yongjun Zhang)
 
+    HDFS-9236. Missing sanity check for block size during block recovery.
+    (Tony Wu via Yongjun Zhang)
+
     OPTIMIZATIONS
 
     HDFS-8026. Trace FSOutputSummer#writeChecksumChunks rather than

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java

@@ -103,8 +103,13 @@ public class BlockRecoveryWorker {
     protected void recover() throws IOException {
       List<BlockRecord> syncList = new ArrayList<>(locs.length);
       int errorCount = 0;
+      int candidateReplicaCnt = 0;
 
-      //check generation stamps
+      // Check generation stamps, replica size and state. Replica must satisfy
+      // the following criteria to be included in syncList for recovery:
+      // - Valid generation stamp
+      // - Non-zero length
+      // - Original state is RWR or better
       for(DatanodeID id : locs) {
         try {
           DatanodeID bpReg = datanode.getBPOfferService(bpid).bpRegistration;
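Reviewer note: the "RWR or better" test below relies on ReplicaState values being ordered from most to least trustworthy (FINALIZED=0, RBW=1, RWR=2, RUR=3, TEMPORARY=4). A minimal standalone sketch of that filter; State and isCandidate are illustrative names, not part of the patch:

    public class ReplicaFilterSketch {
      // Mirrors the value ordering of HDFS ReplicaState.
      enum State { FINALIZED, RBW, RWR, RUR, TEMPORARY }

      // "RWR or better": FINALIZED, RBW and RWR pass; RUR and TEMPORARY fail.
      static boolean isCandidate(State original) {
        return original.ordinal() <= State.RWR.ordinal();
      }

      public static void main(String[] args) {
        for (State s : State.values()) {
          System.out.println(s + " -> " + isCandidate(s));
        }
      }
    }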
@@ -115,7 +120,28 @@ public class BlockRecoveryWorker {
           if (info != null &&
               info.getGenerationStamp() >= block.getGenerationStamp() &&
               info.getNumBytes() > 0) {
-            syncList.add(new BlockRecord(id, proxyDN, info));
+            // Count the number of candidate replicas received.
+            ++candidateReplicaCnt;
+            if (info.getOriginalReplicaState().getValue() <=
+                ReplicaState.RWR.getValue()) {
+              syncList.add(new BlockRecord(id, proxyDN, info));
+            } else {
+              if (LOG.isDebugEnabled()) {
+                LOG.debug("Block recovery: Ignored replica with invalid " +
+                    "original state: " + info + " from DataNode: " + id);
+              }
+            }
+          } else {
+            if (LOG.isDebugEnabled()) {
+              if (info == null) {
+                LOG.debug("Block recovery: DataNode: " + id + " does not have "
+                    + "replica for block: " + block);
+              } else {
+                LOG.debug("Block recovery: Ignored replica with invalid "
+                    + "generation stamp or length: " + info + " from " +
+                    "DataNode: " + id);
+              }
+            }
           }
         } catch (RecoveryInProgressException ripE) {
           InterDatanodeProtocol.LOG.warn(
@@ -136,6 +162,15 @@ public class BlockRecoveryWorker {
             + ", datanodeids=" + Arrays.asList(locs));
       }
 
+      // None of the replicas reported by DataNodes has the required original
+      // state, report the error.
+      if (candidateReplicaCnt > 0 && syncList.isEmpty()) {
+        throw new IOException("Found " + candidateReplicaCnt +
+            " replica(s) for block " + block + " but none is in " +
+            ReplicaState.RWR.name() + " or better state. datanodeids=" +
+            Arrays.asList(locs));
+      }
+
       syncBlock(syncList);
     }
 
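Reviewer note: the new guard distinguishes two empty-syncList cases. If no DataNode reported a usable replica at all, syncBlock() later commits a zero-length synchronization and the block can be deleted; if candidate replicas exist but every one is in RUR or worse state, recovery now fails fast instead of silently truncating the block. A hypothetical summary of that decision; checkRecoverable is an illustrative helper, not part of the patch:

    import java.io.IOException;

    class RecoverySanityCheckSketch {
      static void checkRecoverable(int candidateReplicaCnt,
          boolean syncListEmpty) throws IOException {
        if (candidateReplicaCnt > 0 && syncListEmpty) {
          // Replicas with a valid generation stamp and non-zero length
          // exist, but none is in RWR or better state; recovering as a
          // zero-length block could discard data, so fail fast.
          throw new IOException("no replica in RWR or better state");
        }
        // candidateReplicaCnt == 0 with an empty syncList falls through:
        // syncBlock() commits a zero-length sync and the block is deletable.
      }
    }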
@@ -157,6 +192,11 @@ public class BlockRecoveryWorker {
       // or their replicas have 0 length.
       // The block can be deleted.
       if (syncList.isEmpty()) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("syncBlock for block " + block + ", all datanodes don't " +
+              "have the block or their replicas have 0 length. The block can " +
+              "be deleted.");
+        }
         nn.commitBlockSynchronization(block, recoveryId, 0,
             true, true, DatanodeID.EMPTY_ARRAY, null);
         return;
@@ -195,6 +235,12 @@ public class BlockRecoveryWorker {
               r.rInfo.getNumBytes() == finalizedLength) {
             participatingList.add(r);
           }
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("syncBlock replicaInfo: block=" + block +
+                ", from datanode " + r.id + ", receivedState=" + rState.name() +
+                ", receivedLength=" + r.rInfo.getNumBytes() +
+                ", bestState=FINALIZED, finalizedLength=" + finalizedLength);
+          }
         }
         newBlock.setNumBytes(finalizedLength);
         break;
@@ -207,7 +253,16 @@ public class BlockRecoveryWorker {
             minLength = Math.min(minLength, r.rInfo.getNumBytes());
             participatingList.add(r);
           }
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("syncBlock replicaInfo: block=" + block +
+                ", from datanode " + r.id + ", receivedState=" + rState.name() +
+                ", receivedLength=" + r.rInfo.getNumBytes() + ", bestState=" +
+                bestState.name());
+          }
         }
+        // recover() guarantees syncList will have at least one replica with RWR
+        // or better state.
+        assert minLength != Long.MAX_VALUE : "wrong minLength";
         newBlock.setNumBytes(minLength);
         break;
       case RUR:
@@ -254,6 +309,13 @@ public class BlockRecoveryWorker {
         datanodes[i] = r.id;
         storages[i] = r.storageID;
       }
+
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Datanode triggering commitBlockSynchronization, block=" +
+            block + ", newGs=" + newBlock.getGenerationStamp() +
+            ", newLength=" + newBlock.getNumBytes());
+      }
+
       nn.commitBlockSynchronization(block,
           newBlock.getGenerationStamp(), newBlock.getNumBytes(), true, false,
           datanodes, storages);

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/ReplicaRecoveryInfo.java

@@ -49,4 +49,10 @@ public class ReplicaRecoveryInfo extends Block {
   public int hashCode() {
     return super.hashCode();
   }
+
+  @Override
+  public String toString() {
+    return super.toString() + "[numBytes=" + this.getNumBytes() +
+        ",originalReplicaState=" + this.originalState.name() + "]";
+  }
 }
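Reviewer note: Block#toString() renders as blk_<blockId>_<generationStamp>, so with illustrative values the new override produces output along these lines:

    // Values below are illustrative.
    ReplicaRecoveryInfo info = new ReplicaRecoveryInfo(
        1073741825L, 1024L, 1001L, ReplicaState.RWR);
    // info.toString() ->
    //   "blk_1073741825_1001[numBytes=1024,originalReplicaState=RWR]"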

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockRecovery.java

@@ -18,6 +18,7 @@
 package org.apache.hadoop.hdfs.server.datanode;
 
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 import static org.mockito.Matchers.any;
 import static org.mockito.Matchers.anyBoolean;
@@ -680,4 +681,40 @@ public class TestBlockRecovery {
       }
     }
   }
+
+  /**
+   * DNs report RUR instead of RBW, RWR or FINALIZED. Primary DN expected to
+   * throw an exception.
+   * @throws Exception
+   */
+  @Test
+  public void testRURReplicas() throws Exception {
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Running " + GenericTestUtils.getMethodName());
+    }
+
+    doReturn(new ReplicaRecoveryInfo(block.getBlockId(), block.getNumBytes(),
+        block.getGenerationStamp(), ReplicaState.RUR)).when(spyDN).
+        initReplicaRecovery(any(RecoveringBlock.class));
+
+    boolean exceptionThrown = false;
+    try {
+      for (RecoveringBlock rBlock : initRecoveringBlocks()) {
+        BlockRecoveryWorker.RecoveryTaskContiguous RecoveryTaskContiguous =
+            recoveryWorker.new RecoveryTaskContiguous(rBlock);
+        BlockRecoveryWorker.RecoveryTaskContiguous spyTask =
+            spy(RecoveryTaskContiguous);
+        spyTask.recover();
+      }
+    } catch (IOException e) {
+      // expect IOException to be thrown here
+      e.printStackTrace();
+      assertTrue("Wrong exception was thrown: " + e.getMessage(),
+          e.getMessage().contains("Found 1 replica(s) for block " + block +
+              " but none is in RWR or better state"));
+      exceptionThrown = true;
+    } finally {
+      assertTrue(exceptionThrown);
+    }
+  }
 }
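Reviewer note: the new case can be run on its own with standard Maven Surefire filtering, from hadoop-hdfs-project/hadoop-hdfs:

    mvn test -Dtest=TestBlockRecovery#testRURReplicas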