HDFS-10720. Fix intermittent test failure of TestDataNodeErasureCodingMetrics. Contributed by Rakesh R

2016-08-12 11:15:05 +08:00 · 2016-08-12 11:15:05 +08:00 · e7e8aed208
parent aea3e65749
commit e7e8aed208
1 changed files with 37 additions and 9 deletions
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeErasureCodingMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeErasureCodingMetrics.java
@ -39,12 +39,15 @@ import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
 import static org.apache.hadoop.test.MetricsAsserts.getLongCounter;
 import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 import java.io.IOException;
 import java.util.Arrays;
 /**
@ -125,6 +128,7 @@ public class TestDataNodeErasureCodingMetrics {
    DataNode workerDn = null;
    DatanodeInfo[] locations = lastBlock.getLocations();
    assertEquals(locations.length, GROUPSIZE);
    // we have ONE extra datanode in addition to the GROUPSIZE datanodes, here
    // is to find the extra datanode that the reconstruction task will run on,
    // according to the current block placement logic for striped files.
@ -143,21 +147,45 @@ public class TestDataNodeErasureCodingMetrics {
        break;
      }
    }
-    byte[] indices = lastBlock.getBlockIndices();
+    // Get a datanode from the block locations.
-    //corrupt the first block
+    LOG.info("Block locations: " + Arrays.asList(locations));
-    DataNode toCorruptDn = cluster.getDataNodes().get(indices[0]);
+    LOG.info("Erasure coding worker datanode: " + workerDn);
    assertNotNull("Failed to find a worker datanode", workerDn);
    DataNode toCorruptDn = cluster.getDataNode(locations[0].getIpcPort());
    LOG.info("Datanode to be corrupted: " + toCorruptDn);
    assertNotNull("Failed to find a datanode to be corrupted", toCorruptDn);
    toCorruptDn.shutdown();
    setDataNodeDead(toCorruptDn.getDatanodeId());
    DFSTestUtil.waitForDatanodeState(cluster, toCorruptDn.getDatanodeUuid(),
        false, 10000);
-    final BlockManager bm = cluster.getNamesystem().getBlockManager();
+
-    BlockManagerTestUtil.getComputedDatanodeWork(bm);
+    int workCount = getComputedDatanodeWork();
    assertTrue("Wrongly computed block reconstruction work", workCount > 0);
    cluster.triggerHeartbeats();
    StripedFileTestUtil.waitForReconstructionFinished(file, fs, GROUPSIZE);
    return workerDn;
  }
  private int getComputedDatanodeWork()
      throws IOException, InterruptedException {
    final BlockManager bm = cluster.getNamesystem().getBlockManager();
    // Giving a grace period to compute datanode work.
    int workCount = 0;
    int retries = 20;
    while (retries > 0) {
      workCount = BlockManagerTestUtil.getComputedDatanodeWork(bm);
      if (workCount > 0) {
        break;
      }
      retries--;
      Thread.sleep(500);
    }
    LOG.info("Computed datanode work: " + workCount + ", retries: " + retries);
    return workCount;
  }
  private void setDataNodeDead(DatanodeID dnID) throws IOException {
    DatanodeDescriptor dnd =
        NameNodeAdapter.getDatanode(cluster.getNamesystem(), dnID);