HDFS-8449. Add tasks count metrics to datanode for ECWorker. Contributed by Bo Li.
This commit is contained in:
parent
45788204ae
commit
ad9441122f
|
@ -179,11 +179,11 @@ class StripedReconstructor implements Runnable {
|
||||||
// block replication.
|
// block replication.
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
LOG.warn("Failed to reconstruct striped block: {}", blockGroup, e);
|
LOG.warn("Failed to reconstruct striped block: {}", blockGroup, e);
|
||||||
|
datanode.getMetrics().incrECFailedReconstructionTasks();
|
||||||
} finally {
|
} finally {
|
||||||
datanode.decrementXmitsInProgress();
|
datanode.decrementXmitsInProgress();
|
||||||
|
datanode.getMetrics().incrECReconstructionTasks();
|
||||||
stripedReader.close();
|
stripedReader.close();
|
||||||
|
|
||||||
stripedWriter.close();
|
stripedWriter.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -126,6 +126,11 @@ public class DataNodeMetrics {
|
||||||
@Metric MutableRate sendDataPacketTransferNanos;
|
@Metric MutableRate sendDataPacketTransferNanos;
|
||||||
final MutableQuantiles[] sendDataPacketTransferNanosQuantiles;
|
final MutableQuantiles[] sendDataPacketTransferNanosQuantiles;
|
||||||
|
|
||||||
|
@Metric("Count of erasure coding reconstruction tasks")
|
||||||
|
MutableCounterLong ecReconstructionTasks;
|
||||||
|
@Metric("Count of erasure coding failed reconstruction tasks")
|
||||||
|
MutableCounterLong ecFailedReconstructionTasks;
|
||||||
|
|
||||||
final MetricsRegistry registry = new MetricsRegistry("datanode");
|
final MetricsRegistry registry = new MetricsRegistry("datanode");
|
||||||
final String name;
|
final String name;
|
||||||
JvmMetrics jvmMetrics = null;
|
JvmMetrics jvmMetrics = null;
|
||||||
|
@ -415,4 +420,13 @@ public class DataNodeMetrics {
|
||||||
q.add(latencyMs);
|
q.add(latencyMs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Increments the {@code ecReconstructionTasks} counter, the metric declared
 * as "Count of erasure coding reconstruction tasks".
 *
 * In this change the only visible caller is the {@code finally} block of the
 * striped reconstruction task, so the counter advances once per task run
 * whether the run succeeded or threw.
 */
public void incrECReconstructionTasks() {
|
||||||
|
ecReconstructionTasks.incr();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Increments the {@code ecFailedReconstructionTasks} counter, the metric
 * declared as "Count of erasure coding failed reconstruction tasks".
 *
 * In this change the only visible caller is the {@code catch (Throwable)}
 * handler of the striped reconstruction task, right after the failure is
 * logged.
 */
public void incrECFailedReconstructionTasks() {
|
||||||
|
ecFailedReconstructionTasks.incr();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -74,7 +74,7 @@ public class StripedFileTestUtil {
|
||||||
static int numDNs = NUM_DATA_BLOCKS + NUM_PARITY_BLOCKS + 2;
|
static int numDNs = NUM_DATA_BLOCKS + NUM_PARITY_BLOCKS + 2;
|
||||||
static int BLOCK_GROUP_SIZE = blockSize * NUM_DATA_BLOCKS;
|
static int BLOCK_GROUP_SIZE = blockSize * NUM_DATA_BLOCKS;
|
||||||
|
|
||||||
static byte[] generateBytes(int cnt) {
|
public static byte[] generateBytes(int cnt) {
|
||||||
byte[] bytes = new byte[cnt];
|
byte[] bytes = new byte[cnt];
|
||||||
for (int i = 0; i < cnt; i++) {
|
for (int i = 0; i < cnt; i++) {
|
||||||
bytes[i] = getByte(i);
|
bytes[i] = getByte(i);
|
||||||
|
@ -502,4 +502,34 @@ public class StripedFileTestUtil {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wait for the reconstruction to be finished when the file has
|
||||||
|
* corrupted blocks.
|
||||||
|
*/
|
||||||
|
public static LocatedBlocks waitForReconstructionFinished(Path file,
|
||||||
|
DistributedFileSystem fs, int groupSize)
|
||||||
|
throws Exception {
|
||||||
|
final int attempts = 60;
|
||||||
|
for (int i = 0; i < attempts; i++) {
|
||||||
|
LocatedBlocks locatedBlocks = getLocatedBlocks(file, fs);
|
||||||
|
LocatedStripedBlock lastBlock =
|
||||||
|
(LocatedStripedBlock)locatedBlocks.getLastLocatedBlock();
|
||||||
|
DatanodeInfo[] storageInfos = lastBlock.getLocations();
|
||||||
|
if (storageInfos.length >= groupSize) {
|
||||||
|
return locatedBlocks;
|
||||||
|
}
|
||||||
|
Thread.sleep(1000);
|
||||||
|
}
|
||||||
|
throw new IOException("Time out waiting for EC block reconstruction.");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the located blocks of a file.
|
||||||
|
*/
|
||||||
|
public static LocatedBlocks getLocatedBlocks(Path file,
|
||||||
|
DistributedFileSystem fs)
|
||||||
|
throws IOException {
|
||||||
|
return fs.getClient().getLocatedBlocks(file.toString(), 0, Long.MAX_VALUE);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -269,7 +269,8 @@ public class TestReconstructStripedFile {
|
||||||
DFSTestUtil.writeFile(fs, file, data);
|
DFSTestUtil.writeFile(fs, file, data);
|
||||||
StripedFileTestUtil.waitBlockGroupsReported(fs, fileName);
|
StripedFileTestUtil.waitBlockGroupsReported(fs, fileName);
|
||||||
|
|
||||||
LocatedBlocks locatedBlocks = getLocatedBlocks(file);
|
LocatedBlocks locatedBlocks =
|
||||||
|
StripedFileTestUtil.getLocatedBlocks(file, fs);
|
||||||
assertEquals(locatedBlocks.getFileLength(), fileLen);
|
assertEquals(locatedBlocks.getFileLength(), fileLen);
|
||||||
|
|
||||||
LocatedStripedBlock lastBlock =
|
LocatedStripedBlock lastBlock =
|
||||||
|
@ -325,7 +326,7 @@ public class TestReconstructStripedFile {
|
||||||
int stoppedDN = generateErrors(errorMap, type);
|
int stoppedDN = generateErrors(errorMap, type);
|
||||||
|
|
||||||
// Check the locatedBlocks of the file again
|
// Check the locatedBlocks of the file again
|
||||||
locatedBlocks = getLocatedBlocks(file);
|
locatedBlocks = StripedFileTestUtil.getLocatedBlocks(file, fs);
|
||||||
lastBlock = (LocatedStripedBlock)locatedBlocks.getLastLocatedBlock();
|
lastBlock = (LocatedStripedBlock)locatedBlocks.getLastLocatedBlock();
|
||||||
storageInfos = lastBlock.getLocations();
|
storageInfos = lastBlock.getLocations();
|
||||||
assertEquals(storageInfos.length, groupSize - stoppedDN);
|
assertEquals(storageInfos.length, groupSize - stoppedDN);
|
||||||
|
@ -338,7 +339,7 @@ public class TestReconstructStripedFile {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
waitForReconstructionFinished(file, groupSize);
|
StripedFileTestUtil.waitForReconstructionFinished(file, fs, groupSize);
|
||||||
|
|
||||||
targetDNs = sortTargetsByReplicas(blocks, targetDNs);
|
targetDNs = sortTargetsByReplicas(blocks, targetDNs);
|
||||||
|
|
||||||
|
@ -381,26 +382,6 @@ public class TestReconstructStripedFile {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
private LocatedBlocks waitForReconstructionFinished(Path file, int groupSize)
|
|
||||||
throws Exception {
|
|
||||||
final int ATTEMPTS = 60;
|
|
||||||
for (int i = 0; i < ATTEMPTS; i++) {
|
|
||||||
LocatedBlocks locatedBlocks = getLocatedBlocks(file);
|
|
||||||
LocatedStripedBlock lastBlock =
|
|
||||||
(LocatedStripedBlock)locatedBlocks.getLastLocatedBlock();
|
|
||||||
DatanodeInfo[] storageInfos = lastBlock.getLocations();
|
|
||||||
if (storageInfos.length >= groupSize) {
|
|
||||||
return locatedBlocks;
|
|
||||||
}
|
|
||||||
Thread.sleep(1000);
|
|
||||||
}
|
|
||||||
throw new IOException ("Time out waiting for EC block reconstruction.");
|
|
||||||
}
|
|
||||||
|
|
||||||
private LocatedBlocks getLocatedBlocks(Path file) throws IOException {
|
|
||||||
return fs.getClient().getLocatedBlocks(file.toString(), 0, Long.MAX_VALUE);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Tests that processErasureCodingTasks should not throw exceptions out due to
|
* Tests that processErasureCodingTasks should not throw exceptions out due to
|
||||||
* invalid ECTask submission.
|
* invalid ECTask submission.
|
||||||
|
|
|
@ -0,0 +1,153 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.hdfs.server.datanode;
|
||||||
|
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||||
|
import org.apache.hadoop.hdfs.DFSTestUtil;
|
||||||
|
import org.apache.hadoop.hdfs.DistributedFileSystem;
|
||||||
|
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||||
|
import org.apache.hadoop.hdfs.StripedFileTestUtil;
|
||||||
|
import org.apache.hadoop.hdfs.protocol.DatanodeID;
|
||||||
|
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
|
||||||
|
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
|
||||||
|
import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
|
||||||
|
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
|
||||||
|
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
|
||||||
|
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
|
import org.junit.After;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This file tests the erasure coding metrics in DataNode.
|
||||||
|
*/
|
||||||
|
public class TestDataNodeErasureCodingMetrics {
|
||||||
|
public static final Log LOG = LogFactory.
|
||||||
|
getLog(TestDataNodeErasureCodingMetrics.class);
|
||||||
|
|
||||||
|
private static final int DATA_BLK_NUM = StripedFileTestUtil.NUM_DATA_BLOCKS;
|
||||||
|
private static final int PARITY_BLK_NUM =
|
||||||
|
StripedFileTestUtil.NUM_PARITY_BLOCKS;
|
||||||
|
private static final int CELLSIZE =
|
||||||
|
StripedFileTestUtil.BLOCK_STRIPED_CELL_SIZE;
|
||||||
|
private static final int BLOCKSIZE = CELLSIZE;
|
||||||
|
private static final int GROUPSIZE = DATA_BLK_NUM + PARITY_BLK_NUM;
|
||||||
|
private static final int DN_NUM = GROUPSIZE + 1;
|
||||||
|
|
||||||
|
private MiniDFSCluster cluster;
|
||||||
|
private Configuration conf;
|
||||||
|
private DistributedFileSystem fs;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setup() throws IOException {
|
||||||
|
conf = new Configuration();
|
||||||
|
|
||||||
|
conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCKSIZE);
|
||||||
|
conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 1);
|
||||||
|
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(DN_NUM).build();
|
||||||
|
cluster.waitActive();
|
||||||
|
cluster.getFileSystem().getClient().setErasureCodingPolicy("/", null);
|
||||||
|
fs = cluster.getFileSystem();
|
||||||
|
}
|
||||||
|
|
||||||
|
@After
|
||||||
|
public void tearDown() {
|
||||||
|
if (cluster != null) {
|
||||||
|
cluster.shutdown();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(timeout = 120000)
|
||||||
|
public void testEcTasks() throws Exception {
|
||||||
|
DataNode workerDn = doTest("/testEcTasks");
|
||||||
|
MetricsRecordBuilder rb = getMetrics(workerDn.getMetrics().name());
|
||||||
|
assertCounter("EcReconstructionTasks", (long) 1, rb);
|
||||||
|
assertCounter("EcFailedReconstructionTasks", (long) 0, rb);
|
||||||
|
}
|
||||||
|
|
||||||
|
private DataNode doTest(String fileName) throws Exception {
|
||||||
|
|
||||||
|
Path file = new Path(fileName);
|
||||||
|
long fileLen = DATA_BLK_NUM * BLOCKSIZE;
|
||||||
|
final byte[] data = StripedFileTestUtil.generateBytes((int) fileLen);
|
||||||
|
DFSTestUtil.writeFile(fs, file, data);
|
||||||
|
StripedFileTestUtil.waitBlockGroupsReported(fs, fileName);
|
||||||
|
|
||||||
|
LocatedBlocks locatedBlocks =
|
||||||
|
StripedFileTestUtil.getLocatedBlocks(file, fs);
|
||||||
|
//only one block group
|
||||||
|
LocatedStripedBlock lastBlock =
|
||||||
|
(LocatedStripedBlock)locatedBlocks.getLastLocatedBlock();
|
||||||
|
DataNode workerDn = null;
|
||||||
|
DatanodeInfo[] locations = lastBlock.getLocations();
|
||||||
|
assertEquals(locations.length, GROUPSIZE);
|
||||||
|
// we have ONE extra datanode in addition to the GROUPSIZE datanodes, here
|
||||||
|
// is to find the extra datanode that the reconstruction task will run on,
|
||||||
|
// according to the current block placement logic for striped files.
|
||||||
|
// This can be improved later to be flexible regardless wherever the task
|
||||||
|
// runs.
|
||||||
|
for (DataNode dn: cluster.getDataNodes()) {
|
||||||
|
boolean appear = false;
|
||||||
|
for (DatanodeInfo info: locations) {
|
||||||
|
if (dn.getDatanodeUuid().equals(info.getDatanodeUuid())) {
|
||||||
|
appear = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(!appear) {
|
||||||
|
workerDn = dn;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
byte[] indices = lastBlock.getBlockIndices();
|
||||||
|
//corrupt the first block
|
||||||
|
DataNode toCorruptDn = cluster.getDataNodes().get(indices[0]);
|
||||||
|
toCorruptDn.shutdown();
|
||||||
|
setDataNodeDead(toCorruptDn.getDatanodeId());
|
||||||
|
DFSTestUtil.waitForDatanodeState(cluster, toCorruptDn.getDatanodeUuid(),
|
||||||
|
false, 10000 );
|
||||||
|
final BlockManager bm = cluster.getNamesystem().getBlockManager();
|
||||||
|
BlockManagerTestUtil.getComputedDatanodeWork(bm);
|
||||||
|
cluster.triggerHeartbeats();
|
||||||
|
StripedFileTestUtil.waitForReconstructionFinished(file, fs, GROUPSIZE);
|
||||||
|
|
||||||
|
return workerDn;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void setDataNodeDead(DatanodeID dnID) throws IOException {
|
||||||
|
DatanodeDescriptor dnd =
|
||||||
|
NameNodeAdapter.getDatanode(cluster.getNamesystem(), dnID);
|
||||||
|
DFSTestUtil.setDatanodeDead(dnd);
|
||||||
|
BlockManagerTestUtil.checkHeartbeat(
|
||||||
|
cluster.getNamesystem().getBlockManager());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue