HDFS-8543. Erasure Coding: processOverReplicatedBlock() handles striped block. Contributed by Walter Su.

This commit is contained in:
Jing Zhao 2015-06-19 11:53:05 -07:00
parent 3682e01984
commit 448cb7df67
6 changed files with 255 additions and 34 deletions

View File

@ -311,3 +311,6 @@
HDFS-8254. Avoid assigning a leading streamer in StripedDataStreamer to
tolerate datanode failure. (Tsz Wo Nicholas Sze via jing9)
HDFS-8543. Erasure Coding: processOverReplicatedBlock() handles striped block.
(Walter Su via jing9)

View File

@ -22,6 +22,7 @@ import static org.apache.hadoop.util.ExitUtil.terminate;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
@ -3085,10 +3086,30 @@ public class BlockManager {
}
}
}
chooseExcessReplicates(nonExcess, block, replication,
addedNode, delNodeHint, placementPolicies.getPolicy(false));
chooseExcessReplicates(nonExcess, block, replication, addedNode,
delNodeHint);
}
private void chooseExcessReplicates(
final Collection<DatanodeStorageInfo> nonExcess,
BlockInfo storedBlock, short replication,
DatanodeDescriptor addedNode,
DatanodeDescriptor delNodeHint) {
assert namesystem.hasWriteLock();
// first form a rack to datanodes map and
BlockCollection bc = getBlockCollection(storedBlock);
final BlockStoragePolicy storagePolicy = storagePolicySuite.getPolicy(
bc.getStoragePolicyID());
final List<StorageType> excessTypes = storagePolicy.chooseExcess(
replication, DatanodeStorageInfo.toStorageTypes(nonExcess));
if (!storedBlock.isStriped()) {
chooseExcessReplicasContiguous(bc, nonExcess, storedBlock,
replication, addedNode, delNodeHint, excessTypes);
} else {
chooseExcessReplicasStriped(bc, nonExcess, storedBlock, delNodeHint,
excessTypes);
}
}
/**
* We want "replication" replicates for the block, but we now have too many.
@ -3104,20 +3125,13 @@ public class BlockManager {
* If no such a node is available,
* then pick a node with least free space
*/
private void chooseExcessReplicates(final Collection<DatanodeStorageInfo> nonExcess,
BlockInfo storedBlock, short replication,
DatanodeDescriptor addedNode,
DatanodeDescriptor delNodeHint,
BlockPlacementPolicy replicator) {
assert namesystem.hasWriteLock();
// first form a rack to datanodes map and
BlockCollection bc = getBlockCollection(storedBlock);
final BlockStoragePolicy storagePolicy = storagePolicySuite.getPolicy(
bc.getStoragePolicyID());
final List<StorageType> excessTypes = storagePolicy.chooseExcess(
replication, DatanodeStorageInfo.toStorageTypes(nonExcess));
private void chooseExcessReplicasContiguous(BlockCollection bc,
final Collection<DatanodeStorageInfo> nonExcess,
BlockInfo storedBlock, short replication,
DatanodeDescriptor addedNode,
DatanodeDescriptor delNodeHint,
List<StorageType> excessTypes) {
BlockPlacementPolicy replicator = placementPolicies.getPolicy(false);
final Map<String, List<DatanodeStorageInfo>> rackMap = new HashMap<>();
final List<DatanodeStorageInfo> moreThanOne = new ArrayList<>();
final List<DatanodeStorageInfo> exactlyOne = new ArrayList<>();
@ -3145,30 +3159,103 @@ public class BlockManager {
moreThanOne, exactlyOne, excessTypes);
}
firstOne = false;
// adjust rackmap, moreThanOne, and exactlyOne
replicator.adjustSetsWithChosenReplica(rackMap, moreThanOne,
exactlyOne, cur);
nonExcess.remove(cur);
addToExcessReplicate(cur.getDatanodeDescriptor(), storedBlock);
//
// The 'excessblocks' tracks blocks until we get confirmation
// that the datanode has deleted them; the only way we remove them
// is when we get a "removeBlock" message.
//
// The 'invalidate' list is used to inform the datanode the block
// should be deleted. Items are removed from the invalidate list
// upon giving instructions to the datanodes.
//
final Block blockToInvalidate = getBlockToInvalidate(storedBlock, cur);
addToInvalidates(blockToInvalidate, cur.getDatanodeDescriptor());
blockLog.info("BLOCK* chooseExcessReplicates: "
+"({}, {}) is added to invalidated blocks set", cur, storedBlock);
processChosenExcessReplica(nonExcess, cur, storedBlock);
}
}
/**
* We want block group has every internal block, but we have redundant
* internal blocks (which have the same index).
* In this method, we delete the redundant internal blocks until only one
* left for each index.
*
* The block placement policy will make sure that the left internal blocks are
* spread across racks and also try hard to pick one with least free space.
*/
private void chooseExcessReplicasStriped(BlockCollection bc,
final Collection<DatanodeStorageInfo> nonExcess,
BlockInfo storedBlock,
DatanodeDescriptor delNodeHint,
List<StorageType> excessTypes) {
assert storedBlock instanceof BlockInfoStriped;
BlockInfoStriped sblk = (BlockInfoStriped) storedBlock;
short groupSize = sblk.getTotalBlockNum();
if (nonExcess.size() <= groupSize) {
return;
}
BlockPlacementPolicy placementPolicy = placementPolicies.getPolicy(true);
List<DatanodeStorageInfo> empty = new ArrayList<>(0);
// find all duplicated indices
BitSet found = new BitSet(groupSize); //indices found
BitSet duplicated = new BitSet(groupSize); //indices found more than once
HashMap<DatanodeStorageInfo, Integer> storage2index = new HashMap<>();
for (DatanodeStorageInfo storage : nonExcess) {
int index = sblk.getStorageBlockIndex(storage);
assert index >= 0;
if (found.get(index)) {
duplicated.set(index);
}
found.set(index);
storage2index.put(storage, index);
}
// use delHint only if delHint is duplicated
final DatanodeStorageInfo delStorageHint =
DatanodeStorageInfo.getDatanodeStorageInfo(nonExcess, delNodeHint);
if (delStorageHint != null) {
Integer index = storage2index.get(delStorageHint);
if (index != null && duplicated.get(index)) {
processChosenExcessReplica(nonExcess, delStorageHint, storedBlock);
}
}
// for each duplicated index, delete some replicas until only one left
for (int targetIndex = duplicated.nextSetBit(0); targetIndex >= 0;
targetIndex = duplicated.nextSetBit(targetIndex + 1)) {
List<DatanodeStorageInfo> candidates = new ArrayList<>();
for (DatanodeStorageInfo storage : nonExcess) {
int index = storage2index.get(storage);
if (index == targetIndex) {
candidates.add(storage);
}
}
Block internalBlock = new Block(storedBlock);
internalBlock.setBlockId(storedBlock.getBlockId() + targetIndex);
while (candidates.size() > 1) {
DatanodeStorageInfo target = placementPolicy.chooseReplicaToDelete(bc,
internalBlock, (short)1, candidates, empty, excessTypes);
processChosenExcessReplica(nonExcess, target, storedBlock);
candidates.remove(target);
}
duplicated.clear(targetIndex);
}
}
private void processChosenExcessReplica(
final Collection<DatanodeStorageInfo> nonExcess,
final DatanodeStorageInfo chosen, BlockInfo storedBlock) {
nonExcess.remove(chosen);
addToExcessReplicate(chosen.getDatanodeDescriptor(), storedBlock);
//
// The 'excessblocks' tracks blocks until we get confirmation
// that the datanode has deleted them; the only way we remove them
// is when we get a "removeBlock" message.
//
// The 'invalidate' list is used to inform the datanode the block
// should be deleted. Items are removed from the invalidate list
// upon giving instructions to the datanodes.
//
final Block blockToInvalidate = getBlockToInvalidate(storedBlock, chosen);
addToInvalidates(blockToInvalidate, chosen.getDatanodeDescriptor());
blockLog.info("BLOCK* chooseExcessReplicates: "
+"({}, {}) is added to invalidated blocks set", chosen, storedBlock);
}
/** Check if we can use delHint */
static boolean useDelHint(boolean isFirst, DatanodeStorageInfo delHint,
DatanodeStorageInfo added, List<DatanodeStorageInfo> moreThan1Racks,

View File

@ -110,6 +110,7 @@ import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.LayoutVersion;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
@ -1968,17 +1969,29 @@ public class DFSTestUtil {
}
/**
* Verify that blocks in striped block group are on different nodes.
* Verify that blocks in striped block group are on different nodes, and every
* internal blocks exists.
*/
public static void verifyLocatedStripedBlocks(LocatedBlocks lbs,
int groupSize) {
for (LocatedBlock lb : lbs.getLocatedBlocks()) {
assert lb instanceof LocatedStripedBlock;
HashSet<DatanodeInfo> locs = new HashSet<>();
for (DatanodeInfo datanodeInfo : lb.getLocations()) {
locs.add(datanodeInfo);
}
assertEquals(groupSize, lb.getLocations().length);
assertEquals(groupSize, locs.size());
// verify that every internal blocks exists
int[] blockIndices = ((LocatedStripedBlock) lb).getBlockIndices();
assertEquals(groupSize, blockIndices.length);
HashSet<Integer> found = new HashSet<>();
for (int index : blockIndices) {
assert index >=0;
found.add(index);
}
assertEquals(groupSize, found.size());
}
}
}

View File

@ -143,6 +143,7 @@ public class TestBalancer {
static void initConfWithStripe(Configuration conf) {
conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, DEFAULT_STRIPE_BLOCK_SIZE);
conf.setBoolean(DFSConfigKeys.DFS_NAMENODE_REPLICATION_CONSIDERLOAD_KEY, false);
conf.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1L);
SimulatedFSDataset.setFactory(conf);
conf.setLong(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 1L);

View File

@ -379,6 +379,7 @@ public class TestMover {
conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, DEFAULT_STRIPE_BLOCK_SIZE);
conf.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1L);
conf.setLong(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 1L);
conf.setBoolean(DFSConfigKeys.DFS_NAMENODE_REPLICATION_CONSIDERLOAD_KEY, false);
Dispatcher.setBlockMoveWaitTime(3000L);
}

View File

@ -0,0 +1,116 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock;
import org.apache.hadoop.hdfs.server.datanode.SimulatedFSDataset;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class TestAddOverReplicatedStripedBlocks {
private MiniDFSCluster cluster;
private DistributedFileSystem fs;
private final Path dirPath = new Path("/striped");
private Path filePath = new Path(dirPath, "file");
private final short DATA_BLK_NUM = HdfsConstants.NUM_DATA_BLOCKS;
private final short PARITY_BLK_NUM = HdfsConstants.NUM_PARITY_BLOCKS;
private final short GROUP_SIZE = DATA_BLK_NUM + PARITY_BLK_NUM;
private final int CELLSIZE = HdfsConstants.BLOCK_STRIPED_CELL_SIZE;
private final int NUM_STRIPE_PER_BLOCK = 1;
private final int BLOCK_SIZE = NUM_STRIPE_PER_BLOCK * CELLSIZE;
private final int numDNs = GROUP_SIZE + 3;
@Before
public void setup() throws IOException {
Configuration conf = new Configuration();
conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
SimulatedFSDataset.setFactory(conf);
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDNs).build();
cluster.waitActive();
fs = cluster.getFileSystem();
fs.mkdirs(dirPath);
fs.getClient().createErasureCodingZone(dirPath.toString(), null, CELLSIZE);
}
@After
public void tearDown() {
if (cluster != null) {
cluster.shutdown();
}
}
@Test
public void testProcessOverReplicatedStripedBlock() throws Exception {
// create a file which has exact one block group to the first GROUP_SIZE DNs
long fileLen = DATA_BLK_NUM * BLOCK_SIZE;
DFSTestUtil.createStripedFile(cluster, filePath, null, 1,
NUM_STRIPE_PER_BLOCK, false);
LocatedBlocks lbs = cluster.getNameNodeRpc().getBlockLocations(
filePath.toString(), 0, fileLen);
LocatedStripedBlock bg = (LocatedStripedBlock) (lbs.get(0));
long gs = bg.getBlock().getGenerationStamp();
String bpid = bg.getBlock().getBlockPoolId();
long groupId = bg.getBlock().getBlockId();
Block blk = new Block(groupId, BLOCK_SIZE, gs);
for (int i = 0; i < GROUP_SIZE; i++) {
blk.setBlockId(groupId + i);
cluster.injectBlocks(i, Arrays.asList(blk), bpid);
}
cluster.triggerBlockReports();
// let a internal block be over replicated with 2 redundant blocks.
blk.setBlockId(groupId + 2);
cluster.injectBlocks(numDNs - 3, Arrays.asList(blk), bpid);
cluster.injectBlocks(numDNs - 2, Arrays.asList(blk), bpid);
// let a internal block be over replicated with 1 redundant block.
blk.setBlockId(groupId + 6);
cluster.injectBlocks(numDNs - 1, Arrays.asList(blk), bpid);
// update blocksMap
cluster.triggerBlockReports();
// add to invalidates
cluster.triggerHeartbeats();
// datanode delete block
cluster.triggerHeartbeats();
// update blocksMap
cluster.triggerBlockReports();
// verify that all internal blocks exists
lbs = cluster.getNameNodeRpc().getBlockLocations(
filePath.toString(), 0, fileLen);
DFSTestUtil.verifyLocatedStripedBlocks(lbs, GROUP_SIZE);
}
}